Merge branch 'cassandra-2.0' into cassandra-2.1

Conflicts:
	CHANGES.txt
diff --git a/.gitignore b/.gitignore
index 8dfa07c..c7cf9fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,13 +6,15 @@
 src/resources/org/apache/cassandra/config/
 logs/
 data/
+conf/hotspot_compiler
 
 # C* debs
 build-stamp
 build.properties
 debian/cassandra*debhelper*
-debian/cassandra.substvars
+debian/cassandra*.substvars
 debian/cassandra/
+debian/cassandra-tools/
 debian/files
 
 # gitignore doesn't help with modified files - you may wish to:
diff --git a/.rat-excludes b/.rat-excludes
index 503b3a6..d95b499 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -30,4 +30,5 @@
 examples/hadoop_word_count/conf/log4j.properties
 pylib/cqlshlib/test/**
 src/resources/org/apache/cassandra/config/version.properties
-**/hotspot_compiler
+conf/hotspot_compiler
+**/*-example.yaml
diff --git a/CHANGES.txt b/CHANGES.txt
index 544cf9a..7cd5154 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,89 @@
-2.0.11:
+2.1.1
+ * Fix IllegalArgumentException when a list of IN values containing tuples
+   is passed as a single arg to a prepared statement with the v1 or v2
+   protocol (CASSANDRA-8062)
+ * Fix ClassCastException in DISTINCT query on static columns with
+   query paging (CASSANDRA-8108)
+ * Fix NPE on null nested UDT inside a set (CASSANDRA-8105)
+ * Fix exception when querying secondary index on set items or map keys
+   when some clustering columns are specified (CASSANDRA-8073)
+ * Send proper error response when there is an error during native
+   protocol message decode (CASSANDRA-8118)
+ * Gossip should ignore generation numbers too far in the future (CASSANDRA-8113)
+ * Fix NPE when creating a table with frozen sets, lists (CASSANDRA-8104)
+ * Fix high memory use due to tracking reads on incrementally opened sstable
+   readers (CASSANDRA-8066)
+ * Fix EXECUTE request with skipMetadata=false returning no metadata
+   (CASSANDRA-8054)
+ * Allow concurrent use of CQLBulkOutputFormat (CASSANDRA-7776)
+ * Shutdown JVM on OOM (CASSANDRA-7507)
+ * Upgrade netty version and enable epoll event loop (CASSANDRA-7761)
+ * Don't duplicate sstables smaller than split size when using
+   the sstablesplitter tool (CASSANDRA-7616)
+ * Avoid re-parsing already prepared statements (CASSANDRA-7923)
+ * Fix some Thrift slice deletions and updates of COMPACT STORAGE
+   tables with some clustering columns omitted (CASSANDRA-7990)
+ * Fix filtering for CONTAINS on sets (CASSANDRA-8033)
+ * Properly track added size (CASSANDRA-7239)
+ * Allow compilation in java 8 (CASSANDRA-7208)
+ * Fix Assertion error on RangeTombstoneList diff (CASSANDRA-8013)
+ * Release references to overlapping sstables during compaction (CASSANDRA-7819)
+ * Send notification when opening compaction results early (CASSANDRA-8034)
+ * Make native server start block until properly bound (CASSANDRA-7885)
+ * (cqlsh) Fix IPv6 support (CASSANDRA-7988)
+ * Ignore fat clients when checking for endpoint collision (CASSANDRA-7939)
+ * Make sstablerepairedset take a list of files (CASSANDRA-7995)
+ * (cqlsh) Tab completion for indexes on map keys (CASSANDRA-7972)
+ * (cqlsh) Fix UDT field selection in select clause (CASSANDRA-7891)
+ * Fix resource leak in event of corrupt sstable
+ * (cqlsh) Add command line option for cqlshrc file path (CASSANDRA-7131)
+ * Provide visibility into prepared statements churn (CASSANDRA-7921, CASSANDRA-7930)
+ * Invalidate prepared statements when their keyspace or table is
+   dropped (CASSANDRA-7566)
+ * cassandra-stress: fix support for NetworkTopologyStrategy (CASSANDRA-7945)
+ * Fix saving caches when a table is dropped (CASSANDRA-7784)
+ * Add better error checking of new stress profile (CASSANDRA-7716)
+ * Use ThreadLocalRandom and remove FBUtilities.threadLocalRandom (CASSANDRA-7934)
+ * Prevent operator mistakes due to simultaneous bootstrap (CASSANDRA-7069)
+ * cassandra-stress supports whitelist mode for node config (CASSANDRA-7658)
+ * GCInspector more closely tracks GC; cassandra-stress and nodetool report it (CASSANDRA-7916)
+ * nodetool won't output bogus ownership info without a keyspace (CASSANDRA-7173)
+ * Add human readable option to nodetool commands (CASSANDRA-5433)
+ * Don't try to set repairedAt on old sstables (CASSANDRA-7913)
+ * Add metrics for tracking PreparedStatement use (CASSANDRA-7719)
+ * (cqlsh) tab-completion for triggers (CASSANDRA-7824)
+ * (cqlsh) Support for query paging (CASSANDRA-7514)
+ * (cqlsh) Show progress of COPY operations (CASSANDRA-7789)
+ * Add syntax to remove multiple elements from a map (CASSANDRA-6599)
+ * Support non-equals conditions in lightweight transactions (CASSANDRA-6839)
+ * Add IF [NOT] EXISTS to create/drop triggers (CASSANDRA-7606)
+ * (cqlsh) Display the current logged-in user (CASSANDRA-7785)
+ * (cqlsh) Don't ignore CTRL-C during COPY FROM execution (CASSANDRA-7815)
+ * (cqlsh) Order UDTs according to cross-type dependencies in DESCRIBE
+   output (CASSANDRA-7659)
+ * (cqlsh) Fix handling of CAS statement results (CASSANDRA-7671)
+ * (cqlsh) COPY TO/FROM improvements (CASSANDRA-7405)
+ * Support list index operations with conditions (CASSANDRA-7499)
+ * Add max live/tombstoned cells to nodetool cfstats output (CASSANDRA-7731)
+ * Validate IPv6 wildcard addresses properly (CASSANDRA-7680)
+ * (cqlsh) Error when tracing query (CASSANDRA-7613)
+ * Avoid IOOBE when building SyntaxError message snippet (CASSANDRA-7569)
+ * SSTableExport uses correct validator to create string representation of partition
+   keys (CASSANDRA-7498)
+ * Avoid NPEs when receiving type changes for an unknown keyspace (CASSANDRA-7689)
+ * Add support for custom 2i validation (CASSANDRA-7575)
+ * Pig support for hadoop CqlInputFormat (CASSANDRA-6454)
+ * Add listen_interface and rpc_interface options (CASSANDRA-7417)
+ * Improve schema merge performance (CASSANDRA-7444)
+ * Adjust MT depth based on # of partitions being validated (CASSANDRA-5263)
+ * Optimise NativeCell comparisons (CASSANDRA-6755)
+ * Configurable client timeout for cqlsh (CASSANDRA-7516)
+ * Include snippet of CQL query near syntax error in messages (CASSANDRA-7111)
+ * Make repair -pr work with -local (CASSANDRA-7450)
+ * Fix error in sstableloader with -cph > 1 (CASSANDRA-8007)
+ * Fix snapshot repair error on indexed tables (CASSANDRA-8020)
+ * Do not exit nodetool repair when receiving JMX NOTIF_LOST (CASSANDRA-7909)
+Merged from 2.0:
  * Reject conditions on DELETE unless full PK is given (CASSANDRA-6430)
  * Properly reject the token function DELETE (CASSANDRA-7747)
  * Force batchlog replay before decommissioning a node (CASSANDRA-7446)
@@ -27,7 +112,6 @@
  * Fix possible overflow while sorting CL segments for replay (CASSANDRA-7992)
  * Increase nodetool Xmx (CASSANDRA-7956)
  * Archive any commitlog segments present at startup (CASSANDRA-6904)
- * Ignore fat clients when checking for endpoint collision (CASSANDRA-7939)
  * CrcCheckChance should adjust based on live CFMetadata not 
    sstable metadata (CASSANDRA-7978)
  * token() should only accept columns in the partitioning
@@ -41,9 +125,58 @@
  * Fix RowIndexEntry to report correct serializedSize (CASSANDRA-7948)
  * Make CQLSSTableWriter sync within partitions (CASSANDRA-7360)
  * Potentially use non-local replicas in CqlConfigHelper (CASSANDRA-7906)
- * Explicitly disallowing mixing multi-column and single-column
+ * Explicitly disallow mixing multi-column and single-column
    relations on clustering columns (CASSANDRA-7711)
  * Better error message when condition is set on PK column (CASSANDRA-7804)
+ * Don't send schema change responses and events for no-op DDL
+   statements (CASSANDRA-7600)
+ * (Hadoop) fix cluster initialisation when fetching a split (CASSANDRA-7774)
+ * Throw InvalidRequestException when queries contain relations on entire
+   collection columns (CASSANDRA-7506)
+ * (cqlsh) enable CTRL-R history search with libedit (CASSANDRA-7577)
+ * (Hadoop) allow ACFRW to limit nodes to local DC (CASSANDRA-7252)
+ * (cqlsh) cqlsh should automatically disable tracing when selecting
+   from system_traces (CASSANDRA-7641)
+ * (Hadoop) Add CqlOutputFormat (CASSANDRA-6927)
+ * Don't depend on cassandra config for nodetool ring (CASSANDRA-7508)
+ * (cqlsh) Fix failing cqlsh formatting tests (CASSANDRA-7703)
+ * Fix IncompatibleClassChangeError from hadoop2 (CASSANDRA-7229)
+ * Add 'nodetool sethintedhandoffthrottlekb' (CASSANDRA-7635)
+ * (cqlsh) Add tab-completion for CREATE/DROP USER IF [NOT] EXISTS (CASSANDRA-7611)
+ * Catch errors when the JVM pulls the rug out from GCInspector (CASSANDRA-5345)
+ * cqlsh fails when version number parts are not int (CASSANDRA-7524)
+ * Fix NPE when table dropped during streaming (CASSANDRA-7946)
+ * Fix wrong progress when streaming uncompressed (CASSANDRA-7878)
+ * Fix possible infinite loop in creating repair range (CASSANDRA-7983)
+ * Fix unit in nodetool for streaming throughput (CASSANDRA-7375)
+ * Fix spin loop in AtomicSortedColumns (CASSANDRA-7546)
+Merged from 1.2:
+ * Don't index tombstones (CASSANDRA-7828)
+ * Improve PasswordAuthenticator default super user setup (CASSANDRA-7788)
+
+
+2.1.0
+ * (cqlsh) Removed "ALTER TYPE <name> RENAME TO <name>" from tab-completion
+   (CASSANDRA-7895)
+ * Fixed IllegalStateException in anticompaction (CASSANDRA-7892)
+ * cqlsh: DESCRIBE support for frozen UDTs, tuples (CASSANDRA-7863)
+ * Avoid exposing internal classes over JMX (CASSANDRA-7879)
+ * Add null check for keys when freezing collection (CASSANDRA-7869)
+ * Improve stress workload realism (CASSANDRA-7519)
+
+
+2.1.0-rc7
+ * Add frozen keyword and require UDT to be frozen (CASSANDRA-7857)
+ * Track added sstable size correctly (CASSANDRA-7239)
+ * (cqlsh) Fix case insensitivity (CASSANDRA-7834)
+ * Fix failure to stream ranges when moving (CASSANDRA-7836)
+ * Correctly remove tmplink files (CASSANDRA-7803)
+ * (cqlsh) Fix column name formatting for functions, CAS operations,
+   and UDT field selections (CASSANDRA-7806)
+ * (cqlsh) Fix COPY FROM handling of null/empty primary key
+   values (CASSANDRA-7792)
+ * Fix ordering of static cells (CASSANDRA-7763)
+Merged from 2.0:
  * Forbid re-adding dropped counter columns (CASSANDRA-7831)
  * Fix CFMetaData#isThriftCompatible() for PK-only tables (CASSANDRA-7832)
  * Always reject inequality on the partition key without token()
@@ -51,13 +184,6 @@
  * Always send Paxos commit to all replicas (CASSANDRA-7479)
  * Make disruptor_thrift_server invocation pool configurable (CASSANDRA-7594)
  * Make repair no-op when RF=1 (CASSANDRA-7864)
- * Fix NPE when table dropped during streaming (CASSANDRA-7946)
- * Fix wrong progress when streaming uncompressed (CASSANDRA-7878)
- * Fix possible infinite loop in creating repair range (CASSANDRA-7983)
- * Fix unit in nodetool for streaming throughput (CASSANDRA-7375)
- * Do not exit nodetool repair when receiving JMX NOTIF_LOST (CASSANDRA-7909)
-Merged from 1.2:
- * Don't index tombstones (CASSANDRA-7828)
 
 
 2.0.10
@@ -67,24 +193,40 @@
  * Configure system.paxos with LeveledCompactionStrategy (CASSANDRA-7753)
  * Fix ALTER clustering column type from DateType to TimestampType when
    using DESC clustering order (CASSANDRA-7797)
- * Stop inheriting liveRatio and liveRatioComputedAt from previous
-   memtables (CASSANDRA-7796)
  * Throw EOFException if we run out of chunks in compressed datafile
    (CASSANDRA-7664)
- * Throw InvalidRequestException when queries contain relations on entire
-   collection columns (CASSANDRA-7506)
  * Fix PRSI handling of CQL3 row markers for row cleanup (CASSANDRA-7787)
- * (cqlsh) enable CTRL-R history search with libedit (CASSANDRA-7577)
  * Fix dropping collection when it's the last regular column (CASSANDRA-7744)
  * Properly reject operations on list index with conditions (CASSANDRA-7499)
- * (Hadoop) allow ACFRW to limit nodes to local DC (CASSANDRA-7252)
+ * Make StreamReceiveTask thread safe and gc friendly (CASSANDRA-7795)
+ * Validate empty cell names from counter updates (CASSANDRA-7798)
+Merged from 1.2:
+ * Don't allow compacted sstables to be marked as compacting (CASSANDRA-7145)
+ * Track expired tombstones (CASSANDRA-7810)
+
+
+2.1.0-rc6
+ * Fix OOM issue from netty caching over time (CASSANDRA-7743)
+ * json2sstable couldn't import JSON for CQL table (CASSANDRA-7477)
+ * Invalidate all caches on table drop (CASSANDRA-7561)
+ * Skip strict endpoint selection for ranges if RF == nodes (CASSANDRA-7765)
+ * Fix Thrift range filtering without 2ary index lookups (CASSANDRA-7741)
+ * Add tracing entries about concurrent range requests (CASSANDRA-7599)
+ * (cqlsh) Fix DESCRIBE for NTS keyspaces (CASSANDRA-7729)
+ * Remove netty buffer ref-counting (CASSANDRA-7735)
+ * Pass mutated cf to index updater for use by PRSI (CASSANDRA-7742)
+ * Include stress yaml example in release and deb (CASSANDRA-7717)
+ * workaround for netty issue causing corrupted data off the wire (CASSANDRA-7695)
+ * cqlsh DESC CLUSTER fails retrieving ring information (CASSANDRA-7687)
+ * Fix binding null values inside UDT (CASSANDRA-7685)
+ * Fix UDT field selection with empty fields (CASSANDRA-7670)
+ * Bogus deserialization of static cells from sstable (CASSANDRA-7684)
+ * Fix NPE on compaction leftover cleanup for dropped table (CASSANDRA-7770)
+Merged from 2.0:
  * (cqlsh) Wait up to 10 sec for a tracing session (CASSANDRA-7222)
  * Fix NPE in FileCacheService.sizeInBytes (CASSANDRA-7756)
- * (cqlsh) cqlsh should automatically disable tracing when selecting
-   from system_traces (CASSANDRA-7641)
- * (Hadoop) Add CqlOutputFormat (CASSANDRA-6927)
- * Don't depend on cassandra config for nodetool ring (CASSANDRA-7508)
- * (cqlsh) Fix failing cqlsh formatting tests (CASSANDRA-7703)
+ * Remove duplicates from StorageService.getJoiningNodes (CASSANDRA-7478)
+ * Clone token map outside of hot gossip loops (CASSANDRA-7758)
  * Fix MS expiring map timeout for Paxos messages (CASSANDRA-7752)
  * Do not flush on truncate if durable_writes is false (CASSANDRA-7750)
  * Give CRR a default input_cql Statement (CASSANDRA-7226)
@@ -98,43 +240,122 @@
  * Avoid logging CompactionInterrupted at ERROR (CASSANDRA-7694)
  * Minor leak in sstable2json (CASSANDRA-7709)
  * Add cassandra.auto_bootstrap system property (CASSANDRA-7650)
- * Remove CqlPagingRecordReader/CqlPagingInputFormat (CASSANDRA-7570)
- * Fix IncompatibleClassChangeError from hadoop2 (CASSANDRA-7229)
- * Add 'nodetool sethintedhandoffthrottlekb' (CASSANDRA-7635)
  * Update java driver (for hadoop) (CASSANDRA-7618)
- * Fix truncate to always flush (CASSANDRA-7511)
+ * Remove CqlPagingRecordReader/CqlPagingInputFormat (CASSANDRA-7570)
+ * Support connecting to ipv6 jmx with nodetool (CASSANDRA-7669)
+
+
+2.1.0-rc5
+ * Reject counters inside user types (CASSANDRA-7672)
+ * Switch to notification-based GCInspector (CASSANDRA-7638)
+ * (cqlsh) Handle nulls in UDTs and tuples correctly (CASSANDRA-7656)
+ * Don't use strict consistency when replacing (CASSANDRA-7568)
+ * Fix min/max cell name collection on 2.0 SSTables with range
+   tombstones (CASSANDRA-7593)
+ * Tolerate min/max cell names of different lengths (CASSANDRA-7651)
+ * Filter cached results correctly (CASSANDRA-7636)
+ * Fix tracing on the new SEPExecutor (CASSANDRA-7644)
  * Remove shuffle and taketoken (CASSANDRA-7601)
- * Switch liveRatio-related log messages to DEBUG (CASSANDRA-7467)
- * (cqlsh) Add tab-completion for CREATE/DROP USER IF [NOT] EXISTS (CASSANDRA-7611)
- * Always merge ranges owned by a single node (CASSANDRA-6930)
- * Pig support for hadoop CqlInputFormat (CASSANDRA-6454)
+ * Clean up Windows batch scripts (CASSANDRA-7619)
+ * Fix native protocol drop user type notification (CASSANDRA-7571)
+ * Give read access to system.schema_usertypes to all authenticated users
+   (CASSANDRA-7578)
+ * (cqlsh) Fix cqlsh display when zero rows are returned (CASSANDRA-7580)
+ * Get java version correctly when JAVA_TOOL_OPTIONS is set (CASSANDRA-7572)
+ * Fix NPE when dropping index from non-existent keyspace, AssertionError when
+   dropping non-existent index with IF EXISTS (CASSANDRA-7590)
+ * Fix sstablelevelresetter hang (CASSANDRA-7614)
+ * (cqlsh) Fix deserialization of blobs (CASSANDRA-7603)
+ * Use "keyspace updated" schema change message for UDT changes in v1 and
+   v2 protocols (CASSANDRA-7617)
+ * Fix tracing of range slices and secondary index lookups that are local
+   to the coordinator (CASSANDRA-7599)
+ * Set -Dcassandra.storagedir for all tool shell scripts (CASSANDRA-7587)
+ * Don't swap max/min col names when mutating sstable metadata (CASSANDRA-7596)
+ * (cqlsh) Correctly handle paged result sets (CASSANDRA-7625)
+ * (cqlsh) Improve waiting for a trace to complete (CASSANDRA-7626)
+ * Fix tracing of concurrent range slices and 2ary index queries (CASSANDRA-7626)
+ * Fix scrub against collection type (CASSANDRA-7665)
+Merged from 2.0:
+ * Set gc_grace_seconds to seven days for system schema tables (CASSANDRA-7668)
+ * SimpleSeedProvider no longer caches seeds forever (CASSANDRA-7663)
+ * Always flush on truncate (CASSANDRA-7511)
  * Fix ReversedType(DateType) mapping to native protocol (CASSANDRA-7576)
+ * Always merge ranges owned by a single node (CASSANDRA-6930)
+ * Track max/min timestamps for range tombstones (CASSANDRA-7647)
+ * Fix NPE when listing saved caches dir (CASSANDRA-7632)
+
+
+2.1.0-rc4
+ * Fix word count hadoop example (CASSANDRA-7200)
+ * Updated memtable_cleanup_threshold and memtable_flush_writers defaults 
+   (CASSANDRA-7551)
+ * (Windows) fix startup when WMI memory query fails (CASSANDRA-7505)
+ * Anti-compaction proceeds if any part of the repair failed (CASSANDRA-7521)
+ * Add missing table name to DROP INDEX responses and notifications (CASSANDRA-7539)
+ * Bump CQL version to 3.2.0 and update CQL documentation (CASSANDRA-7527)
+ * Fix configuration error message when running nodetool ring (CASSANDRA-7508)
+ * Support conditional updates, tuple type, and the v3 protocol in cqlsh (CASSANDRA-7509)
+ * Handle queries on multiple secondary index types (CASSANDRA-7525)
+ * Fix cqlsh authentication with v3 native protocol (CASSANDRA-7564)
+ * Fix NPE when unknown prepared statement ID is used (CASSANDRA-7454)
+Merged from 2.0:
  * (Windows) force range-based repair to non-sequential mode (CASSANDRA-7541)
  * Fix range merging when DES scores are zero (CASSANDRA-7535)
  * Warn when SSL certificates have expired (CASSANDRA-7528)
+ * Fix error when doing reversed queries with static columns (CASSANDRA-7490)
+Merged from 1.2:
+ * Set correct stream ID on responses when non-Exception Throwables
+   are thrown while handling native protocol messages (CASSANDRA-7470)
+
+
+2.1.0-rc3
+ * Consider expiry when reconciling otherwise equal cells (CASSANDRA-7403)
+ * Introduce CQL support for stress tool (CASSANDRA-6146)
+ * Fix ClassCastException processing expired messages (CASSANDRA-7496)
+ * Fix prepared marker for collections inside UDT (CASSANDRA-7472)
+ * Remove left-over populate_io_cache_on_flush and replicate_on_write
+   uses (CASSANDRA-7493)
+ * (Windows) handle spaces in path names (CASSANDRA-7451)
+ * Ensure writes have completed after dropping a table, before recycling
+   commit log segments (CASSANDRA-7437)
+ * Remove left-over rows_per_partition_to_cache (CASSANDRA-7493)
+ * Fix error when CONTAINS is used with a bind marker (CASSANDRA-7502)
+ * Properly reject unknown UDT field (CASSANDRA-7484)
+Merged from 2.0:
+ * Fix CC#collectTimeOrderedData() tombstone optimisations (CASSANDRA-7394)
+ * Support DISTINCT for static columns and fix behaviour when DISTINCT is
+   not used (CASSANDRA-7305)
  * Workaround JVM NPE on JMX bind failure (CASSANDRA-7254)
  * Fix race in FileCacheService RemovalListener (CASSANDRA-7278)
  * Fix inconsistent use of consistencyForCommit that allowed LOCAL_QUORUM
    operations to incorrect become full QUORUM (CASSANDRA-7345)
  * Properly handle unrecognized opcodes and flags (CASSANDRA-7440)
  * (Hadoop) close CqlRecordWriter clients when finished (CASSANDRA-7459)
+ * Commit disk failure policy (CASSANDRA-7429)
  * Make sure high level sstables get compacted (CASSANDRA-7414)
  * Fix AssertionError when using empty clustering columns and static columns
    (CASSANDRA-7455)
- * Add inter_dc_stream_throughput_outbound_megabits_per_sec (CASSANDRA-6596)
  * Add option to disable STCS in L0 (CASSANDRA-6621)
+ * Upgrade to snappy-java 1.0.5.2 (CASSANDRA-7476)
+
+
+2.1.0-rc2
+ * Fix heap size calculation for CompoundSparseCellName and 
+   CompoundSparseCellName.WithCollection (CASSANDRA-7421)
+ * Allow counter mutations in UNLOGGED batches (CASSANDRA-7351)
+ * Modify reconcile logic to always pick a tombstone over a counter cell
+   (CASSANDRA-7346)
+ * Avoid incremental compaction on Windows (CASSANDRA-7365)
+ * Fix exception when querying a composite-keyed table with a collection index
+   (CASSANDRA-7372)
+ * Use node's host id in place of counter ids (CASSANDRA-7366)
  * Fix error when doing reversed queries with static columns (CASSANDRA-7490)
- * Backport CASSNADRA-3569/CASSANDRA-6747 (CASSANDRA-7560)
+ * Backport CASSANDRA-6747 (CASSANDRA-7560)
  * Track max/min timestamps for range tombstones (CASSANDRA-7647)
  * Fix NPE when listing saved caches dir (CASSANDRA-7632)
  * Fix sstableloader unable to connect encrypted node (CASSANDRA-7585)
- * Make StreamReceiveTask thread safe and gc friendly (CASSANDRA-7795)
 Merged from 1.2:
- * Don't allow compacted sstables to be marked as compacting (CASSANDRA-7145)
- * Track expired tombstones (CASSANDRA-7810)
- * Validate empty cell names from counter updates (CASSANDRA-7798)
- * Improve PasswordAuthenticator default super user setup (CASSANDRA-7788)
- * Remove duplicates from StorageService.getJoiningNodes (CASSANDRA-7478)
  * Clone token map outside of hot gossip loops (CASSANDRA-7758)
  * Add stop method to EmbeddedCassandraService (CASSANDRA-7595)
  * Support connecting to ipv6 jmx with nodetool (CASSANDRA-7669)
@@ -147,62 +368,55 @@
  * Don't clear out range tombstones during compaction (CASSANDRA-7808)
 
 
-2.0.9
- * Fix CC#collectTimeOrderedData() tombstone optimisations (CASSANDRA-7394)
- * Fix assertion error in CL.ANY timeout handling (CASSANDRA-7364)
- * Handle empty CFs in Memtable#maybeUpdateLiveRatio() (CASSANDRA-7401)
- * Fix native protocol CAS batches (CASSANDRA-7337)
- * Add per-CF range read request latency metrics (CASSANDRA-7338)
- * Fix NPE in StreamTransferTask.createMessageForRetry() (CASSANDRA-7323)
- * Add conditional CREATE/DROP USER support (CASSANDRA-7264)
- * Swap local and global default read repair chances (CASSANDRA-7320)
- * Add missing iso8601 patterns for date strings (CASSANDRA-6973)
- * Support selecting multiple rows in a partition using IN (CASSANDRA-6875)
- * cqlsh: always emphasize the partition key in DESC output (CASSANDRA-7274)
+2.1.0-rc1
+ * Revert flush directory (CASSANDRA-6357)
+ * More efficient executor service for fast operations (CASSANDRA-4718)
+ * Move less common tools into a new cassandra-tools package (CASSANDRA-7160)
+ * Support more concurrent requests in native protocol (CASSANDRA-7231)
+ * Add tab-completion to debian nodetool packaging (CASSANDRA-6421)
+ * Change concurrent_compactors defaults (CASSANDRA-7139)
+ * Add PowerShell Windows launch scripts (CASSANDRA-7001)
+ * Make commitlog archive+restore more robust (CASSANDRA-6974)
+ * Fix marking commitlogsegments clean (CASSANDRA-6959)
+ * Add snapshot "manifest" describing files included (CASSANDRA-6326)
+ * Parallel streaming for sstableloader (CASSANDRA-3668)
+ * Fix bugs in supercolumns handling (CASSANDRA-7138)
+ * Fix ClassCastException on composite dense tables (CASSANDRA-7112)
+ * Cleanup and optimize collation and slice iterators (CASSANDRA-7107)
+ * Upgrade NBHM lib (CASSANDRA-7128)
+ * Optimize netty server (CASSANDRA-6861)
+ * Fix repair hang when given CF does not exist (CASSANDRA-7189)
+ * Allow c* to be shutdown in an embedded mode (CASSANDRA-5635)
+ * Add server side batching to native transport (CASSANDRA-5663)
+ * Make batchlog replay asynchronous (CASSANDRA-6134)
+ * remove unused classes (CASSANDRA-7197)
+ * Limit user types to the keyspace they are defined in (CASSANDRA-6643)
+ * Add validate method to CollectionType (CASSANDRA-7208)
+ * New serialization format for UDT values (CASSANDRA-7209, CASSANDRA-7261)
+ * Fix nodetool netstats (CASSANDRA-7270)
+ * Fix potential ClassCastException in HintedHandoffManager (CASSANDRA-7284)
+ * Use prepared statements internally (CASSANDRA-6975)
+ * Fix broken paging state with prepared statement (CASSANDRA-7120)
+ * Fix IllegalArgumentException in CqlStorage (CASSANDRA-7287)
+ * Allow nulls/non-existent fields in UDT (CASSANDRA-7206)
+ * Backport Thrift MultiSliceRequest (CASSANDRA-7027)
+ * Handle overlapping MultiSlices (CASSANDRA-7279)
+ * Fix DataOutputTest on Windows (CASSANDRA-7265)
+ * Embedded sets in user defined data-types are not updating (CASSANDRA-7267)
+ * Add tuple type to CQL/native protocol (CASSANDRA-7248)
+ * Fix CqlPagingRecordReader on tables with few rows (CASSANDRA-7322)
+Merged from 2.0:
  * Copy compaction options to make sure they are reloaded (CASSANDRA-7290)
  * Add option to do more aggressive tombstone compactions (CASSANDRA-6563)
  * Don't try to compact already-compacting files in HHOM (CASSANDRA-7288)
- * Add authentication support to shuffle (CASSANDRA-6484)
- * Cqlsh counts non-empty lines for "Blank lines" warning (CASSANDRA-7325)
- * Make StreamSession#closeSession() idempotent (CASSANDRA-7262)
- * Fix infinite loop on exception while streaming (CASSANDRA-7330)
- * Reference sstables before populating key cache (CASSANDRA-7234)
- * Account for range tombstones in min/max column names (CASSANDRA-7235)
- * Improve sub range repair validation (CASSANDRA-7317)
- * Accept subtypes for function results, type casts (CASSANDRA-6766)
- * Support DISTINCT for static columns and fix behaviour when DISTINC is
-   not use (CASSANDRA-7305).
- * Refuse range queries with strict bounds on compact tables since they
-   are broken (CASSANDRA-7059)
-Merged from 1.2:
- * Expose global ColumnFamily metrics (CASSANDRA-7273)
- * cqlsh: Fix CompositeType columns in DESCRIBE TABLE output (CASSANDRA-7399)
- * Expose global ColumnFamily metrics (CASSANDRA-7273)
- * Handle possible integer overflow in FastByteArrayOutputStream (CASSANDRA-7373)
- * cqlsh: 'ascii' values weren't formatted as text (CASSANDRA-7407)
- * cqlsh: ignore .cassandra permission errors (CASSANDRA-7266)
- * reduce failure detector initial value to 2s (CASSANDRA-7307)
- * Fix problem truncating on a node that was previously in a dead state (CASSANDRA-7318)
- * Don't insert tombstones that hide indexed values into 2i (CASSANDRA-7268)
- * Track metrics at a keyspace level (CASSANDRA-6539)
- * Add replace_address_first_boot flag to only replace if not bootstrapped
-   (CASSANDRA-7356)
- * Enable keepalive for native protocol (CASSANDRA-7380)
- * Check internal addresses for seeds (CASSANDRA-6523)
- * Fix potential / by 0 in HHOM page size calculation (CASSANDRA-7354)
- * Fix availability validation for LOCAL_ONE CL (CASSANDRA-7319)
- * Use LOCAL_ONE for non-superuser auth queries (CASSANDRA-7328)
- * Fix handling of empty counter replication mutations (CASSANDRA-7144)
-
-
-2.0.8
  * Always reallocate buffers in HSHA (CASSANDRA-6285)
  * (Hadoop) support authentication in CqlRecordReader (CASSANDRA-7221)
  * (Hadoop) Close java driver Cluster in CQLRR.close (CASSANDRA-7228)
- * Fix potential SlabAllocator yield-starvation (CASSANDRA-7133)
  * Warn when 'USING TIMESTAMP' is used on a CAS BATCH (CASSANDRA-7067)
- * Starting threads in OutboundTcpConnectionPool constructor causes race conditions (CASSANDRA-7177)
  * return all cpu values from BackgroundActivityMonitor.readAndCompute (CASSANDRA-7183)
+ * Correctly delete scheduled range xfers (CASSANDRA-7143)
+ * reduce garbage creation in calculatePendingRanges (CASSANDRA-7191)
  * fix c* launch issues on Russian os's due to output of linux 'free' cmd (CASSANDRA-6162)
  * Fix disabling autocompaction (CASSANDRA-7187)
  * Fix potential NumberFormatException when deserializing IntegerType (CASSANDRA-7088)
@@ -210,56 +424,140 @@
  * cqlsh: Accept and execute CQL statement(s) from command-line parameter (CASSANDRA-7172)
  * Fix IllegalStateException in CqlPagingRecordReader (CASSANDRA-7198)
  * Fix the InvertedIndex trigger example (CASSANDRA-7211)
- * Correctly delete scheduled range xfers (CASSANDRA-7143)
- * Make batchlog replica selection rack-aware (CASSANDRA-6551)
- * Allow overriding cassandra-rackdc.properties file (CASSANDRA-7072)
- * Set JMX RMI port to 7199 (CASSANDRA-7087)
- * Use LOCAL_QUORUM for data reads at LOCAL_SERIAL (CASSANDRA-6939)
- * Log a warning for large batches (CASSANDRA-6487)
- * Queries on compact tables can return more rows that requested (CASSANDRA-7052)
- * USING TIMESTAMP for batches does not work (CASSANDRA-7053)
- * Fix performance regression from CASSANDRA-5614 (CASSANDRA-6949)
- * Merge groupable mutations in TriggerExecutor#execute() (CASSANDRA-7047)
- * Fix CFMetaData#getColumnDefinitionFromColumnName() (CASSANDRA-7074)
- * Plug holes in resource release when wiring up StreamSession (CASSANDRA-7073)
- * Re-add parameter columns to tracing session (CASSANDRA-6942)
- * Fix writetime/ttl functions for static columns (CASSANDRA-7081)
- * Suggest CTRL-C or semicolon after three blank lines in cqlsh (CASSANDRA-7142)
  * Add --resolve-ip option to 'nodetool ring' (CASSANDRA-7210)
- * Fix duplicated error messages on directory creation error at startup (CASSANDRA-5818)
  * reduce garbage on codec flag deserialization (CASSANDRA-7244) 
+ * Fix duplicated error messages on directory creation error at startup (CASSANDRA-5818)
  * Proper null handle for IF with map element access (CASSANDRA-7155)
  * Improve compaction visibility (CASSANDRA-7242)
+ * Correctly delete scheduled range xfers (CASSANDRA-7143)
+ * Make batchlog replica selection rack-aware (CASSANDRA-6551)
+ * Fix CFMetaData#getColumnDefinitionFromColumnName() (CASSANDRA-7074)
+ * Fix writetime/ttl functions for static columns (CASSANDRA-7081)
+ * Suggest CTRL-C or semicolon after three blank lines in cqlsh (CASSANDRA-7142)
  * Fix 2ndary index queries with DESC clustering order (CASSANDRA-6950)
  * Invalid key cache entries on DROP (CASSANDRA-6525)
  * Fix flapping RecoveryManagerTest (CASSANDRA-7084)
+ * Add missing iso8601 patterns for date strings (CASSANDRA-6973)
+ * Support selecting multiple rows in a partition using IN (CASSANDRA-6875)
+ * Add authentication support to shuffle (CASSANDRA-6484)
+ * Swap local and global default read repair chances (CASSANDRA-7320)
+ * Add conditional CREATE/DROP USER support (CASSANDRA-7264)
+ * Cqlsh counts non-empty lines for "Blank lines" warning (CASSANDRA-7325)
 Merged from 1.2:
  * Add Cloudstack snitch (CASSANDRA-7147)
  * Update system.peers correctly when relocating tokens (CASSANDRA-7126)
  * Add Google Compute Engine snitch (CASSANDRA-7132)
- * Fix nodetool display with vnodes (CASSANDRA-7082)
- * Fix schema concurrency exceptions (CASSANDRA-6841)
- * Fix BatchlogManager#deleteBatch() use of millisecond timsestamps
-   (CASSANDRA-6822)
- * Fix batchlog to account for CF truncation records (CASSANDRA-6999)
- * Fix CQLSH parsing of functions and BLOB literals (CASSANDRA-7018)
- * Require nodetool rebuild_index to specify index names (CASSANDRA-7038)
- * Ensure that batchlog and hint timeouts do not produce hints (CASSANDRA-7058)
- * Always clean up references in SerializingCache (CASSANDRA-6994)
- * Don't shut MessagingService down when replacing a node (CASSANDRA-6476)
- * fix npe when doing -Dcassandra.fd_initial_value_ms (CASSANDRA-6751)
- * Preserves CQL metadata when updating table from thrift (CASSANDRA-6831)
  * remove duplicate query for local tokens (CASSANDRA-7182)
- * raise streaming phi convict threshold level (CASSANDRA-7063)
- * reduce garbage creation in calculatePendingRanges (CASSANDRA-7191)
  * exit CQLSH with error status code if script fails (CASSANDRA-6344)
  * Fix bug with some IN queries missing results (CASSANDRA-7105)
  * Fix availability validation for LOCAL_ONE CL (CASSANDRA-7319)
  * Hint streaming can cause decommission to fail (CASSANDRA-7219)
- * RepairTask didn't send a correct message on IllegalArgumentException (CASSANDRA-7336)
 
 
-2.0.7
+2.1.0-beta2
+ * Increase default CL space to 8GB (CASSANDRA-7031)
+ * Add range tombstones to read repair digests (CASSANDRA-6863)
+ * Fix BTree.clear for large updates (CASSANDRA-6943)
+ * Fail write instead of logging a warning when unable to append to CL
+   (CASSANDRA-6764)
+ * Eliminate possibility of CL segment appearing twice in active list 
+   (CASSANDRA-6557)
+ * Apply DONTNEED fadvise to commitlog segments (CASSANDRA-6759)
+ * Switch CRC component to Adler and include it for compressed sstables 
+   (CASSANDRA-4165)
+ * Allow cassandra-stress to set compaction strategy options (CASSANDRA-6451)
+ * Add broadcast_rpc_address option to cassandra.yaml (CASSANDRA-5899)
+ * Auto reload GossipingPropertyFileSnitch config (CASSANDRA-5897)
+ * Fix overflow of memtable_total_space_in_mb (CASSANDRA-6573)
+ * Fix ABTC NPE and apply update function correctly (CASSANDRA-6692)
+ * Allow nodetool to use a file or prompt for password (CASSANDRA-6660)
+ * Fix AIOOBE when concurrently accessing ABSC (CASSANDRA-6742)
+ * Fix assertion error in ALTER TYPE RENAME (CASSANDRA-6705)
+ * Scrub should not always clear out repaired status (CASSANDRA-5351)
+ * Improve handling of range tombstone for wide partitions (CASSANDRA-6446)
+ * Fix ClassCastException for compact table with composites (CASSANDRA-6738)
+ * Fix potentially repairing with wrong nodes (CASSANDRA-6808)
+ * Change caching option syntax (CASSANDRA-6745)
+ * Fix stress to do proper counter reads (CASSANDRA-6835)
+ * Fix help message for stress counter_write (CASSANDRA-6824)
+ * Fix stress smart Thrift client to pick servers correctly (CASSANDRA-6848)
+ * Add logging levels (minimal, normal or verbose) to stress tool (CASSANDRA-6849)
+ * Fix race condition in Batch CLE (CASSANDRA-6860)
+ * Improve cleanup/scrub/upgradesstables failure handling (CASSANDRA-6774)
+ * ByteBuffer write() methods for serializing sstables (CASSANDRA-6781)
+ * Proper compare function for CollectionType (CASSANDRA-6783)
+ * Update native server to Netty 4 (CASSANDRA-6236)
+ * Fix off-by-one error in stress (CASSANDRA-6883)
+ * Make OpOrder AutoCloseable (CASSANDRA-6901)
+ * Remove sync repair JMX interface (CASSANDRA-6900)
+ * Add multiple memory allocation options for memtables (CASSANDRA-6689, 6694)
+ * Remove adjusted op rate from stress output (CASSANDRA-6921)
+ * Add optimized CF.hasColumns() implementations (CASSANDRA-6941)
+ * Serialize batchlog mutations with the version of the target node
+   (CASSANDRA-6931)
+ * Optimize CounterColumn#reconcile() (CASSANDRA-6953)
+ * Properly remove 1.2 sstable support in 2.1 (CASSANDRA-6869)
+ * Lock counter cells, not partitions (CASSANDRA-6880)
+ * Track presence of legacy counter shards in sstables (CASSANDRA-6888)
+ * Ensure safe resource cleanup when replacing sstables (CASSANDRA-6912)
+ * Add failure handler to async callback (CASSANDRA-6747)
+ * Fix AE when closing SSTable without releasing reference (CASSANDRA-7000)
+ * Clean up IndexInfo on keyspace/table drops (CASSANDRA-6924)
+ * Only snapshot relevant SSTables when doing sequential repair (CASSANDRA-7024)
+ * Require nodetool rebuild_index to specify index names (CASSANDRA-7038)
+ * fix cassandra stress errors on reads with native protocol (CASSANDRA-7033)
+ * Use OpOrder to guard sstable references for reads (CASSANDRA-6919)
+ * Preemptive opening of compaction result (CASSANDRA-6916)
+ * Multi-threaded scrub/cleanup/upgradesstables (CASSANDRA-5547)
+ * Optimize cellname comparison (CASSANDRA-6934)
+ * Native protocol v3 (CASSANDRA-6855)
+ * Optimize Cell liveness checks and clean up Cell (CASSANDRA-7119)
+ * Support consistent range movements (CASSANDRA-2434)
+Merged from 2.0:
+ * Avoid race-prone second "scrub" of system keyspace (CASSANDRA-6797)
+ * Pool CqlRecordWriter clients by inetaddress rather than Range
+   (CASSANDRA-6665)
+ * Fix compaction_history timestamps (CASSANDRA-6784)
+ * Compare scores of full replica ordering in DES (CASSANDRA-6683)
+ * fix CME in SessionInfo updateProgress affecting netstats (CASSANDRA-6577)
+ * Allow repairing between specific replicas (CASSANDRA-6440)
+ * Allow per-dc enabling of hints (CASSANDRA-6157)
+ * Add compatibility for Hadoop 0.2.x (CASSANDRA-5201)
+ * Fix EstimatedHistogram races (CASSANDRA-6682)
+ * Failure detector correctly converts initial value to nanos (CASSANDRA-6658)
+ * Add nodetool taketoken to relocate vnodes (CASSANDRA-4445)
+ * Expose bulk loading progress over JMX (CASSANDRA-4757)
+ * Correctly handle null with IF conditions and TTL (CASSANDRA-6623)
+ * Account for range/row tombstones in tombstone drop
+   time histogram (CASSANDRA-6522)
+ * Stop CommitLogSegment.close() from calling sync() (CASSANDRA-6652)
+ * Make commitlog failure handling configurable (CASSANDRA-6364)
+ * Avoid overlaps in LCS (CASSANDRA-6688)
+ * Improve support for paginating over composites (CASSANDRA-4851)
+ * Fix count(*) queries in a mixed cluster (CASSANDRA-6707)
+ * Improve repair tasks (snapshot, differencing) concurrency (CASSANDRA-6566)
+ * Fix replaying pre-2.0 commit logs (CASSANDRA-6714)
+ * Add static columns to CQL3 (CASSANDRA-6561)
+ * Optimize single partition batch statements (CASSANDRA-6737)
+ * Disallow post-query re-ordering when paging (CASSANDRA-6722)
+ * Fix potential paging bug with deleted columns (CASSANDRA-6748)
+ * Fix NPE on BulkLoader caused by losing StreamEvent (CASSANDRA-6636)
+ * Fix truncating compression metadata (CASSANDRA-6791)
+ * Add CMSClassUnloadingEnabled JVM option (CASSANDRA-6541)
+ * Catch memtable flush exceptions during shutdown (CASSANDRA-6735)
+ * Fix upgradesstables NPE for non-CF-based indexes (CASSANDRA-6645)
+ * Fix UPDATE updating PRIMARY KEY columns implicitly (CASSANDRA-6782)
+ * Fix IllegalArgumentException when updating from 1.2 with SuperColumns
+   (CASSANDRA-6733)
+ * FBUtilities.singleton() should use the CF comparator (CASSANDRA-6778)
+ * Fix CQLSStableWriter.addRow(Map<String, Object>) (CASSANDRA-6526)
+ * Fix HSHA server introducing corrupt data (CASSANDRA-6285)
+ * Fix CAS conditions for COMPACT STORAGE tables (CASSANDRA-6813)
+ * Starting threads in OutboundTcpConnectionPool constructor causes race conditions (CASSANDRA-7177)
+ * Allow overriding cassandra-rackdc.properties file (CASSANDRA-7072)
+ * Set JMX RMI port to 7199 (CASSANDRA-7087)
+ * Use LOCAL_QUORUM for data reads at LOCAL_SERIAL (CASSANDRA-6939)
+ * Log a warning for large batches (CASSANDRA-6487)
  * Put nodes in hibernate when join_ring is false (CASSANDRA-6961)
  * Avoid early loading of non-system keyspaces before compaction-leftovers 
    cleanup at startup (CASSANDRA-6913)
@@ -311,7 +609,16 @@
    (CASSANDRA-6906)
  * Fix SSTable not released if stream session fails (CASSANDRA-6818)
  * Avoid build failure due to ANTLR timeout (CASSANDRA-6991)
+ * Queries on compact tables can return more rows that requested (CASSANDRA-7052)
+ * USING TIMESTAMP for batches does not work (CASSANDRA-7053)
+ * Fix performance regression from CASSANDRA-5614 (CASSANDRA-6949)
+ * Ensure that batchlog and hint timeouts do not produce hints (CASSANDRA-7058)
+ * Merge groupable mutations in TriggerExecutor#execute() (CASSANDRA-7047)
+ * Plug holes in resource release when wiring up StreamSession (CASSANDRA-7073)
+ * Re-add parameter columns to tracing session (CASSANDRA-6942)
+ * Preserves CQL metadata when updating table from thrift (CASSANDRA-6831)
 Merged from 1.2:
+ * Fix nodetool display with vnodes (CASSANDRA-7082)
  * Add UNLOGGED, COUNTER options to BATCH documentation (CASSANDRA-6816)
  * add extra SSL cipher suites (CASSANDRA-6613)
  * fix nodetool getsstables for blob PK (CASSANDRA-6803)
@@ -321,65 +628,50 @@
  * Schedule schema pulls on change (CASSANDRA-6971)
  * Non-droppable verbs shouldn't be dropped from OTC (CASSANDRA-6980)
  * Shutdown batchlog executor in SS#drain() (CASSANDRA-7025)
+ * Fix batchlog to account for CF truncation records (CASSANDRA-6999)
+ * Fix CQLSH parsing of functions and BLOB literals (CASSANDRA-7018)
  * Properly load trustore in the native protocol (CASSANDRA-6847)
+ * Always clean up references in SerializingCache (CASSANDRA-6994)
+ * Don't shut MessagingService down when replacing a node (CASSANDRA-6476)
+ * fix npe when doing -Dcassandra.fd_initial_value_ms (CASSANDRA-6751)
 
 
-2.0.6
- * Avoid race-prone second "scrub" of system keyspace (CASSANDRA-6797)
- * Pool CqlRecordWriter clients by inetaddress rather than Range
-   (CASSANDRA-6665)
- * Fix compaction_history timestamps (CASSANDRA-6784)
- * Compare scores of full replica ordering in DES (CASSANDRA-6683)
- * fix CME in SessionInfo updateProgress affecting netstats (CASSANDRA-6577)
- * Allow repairing between specific replicas (CASSANDRA-6440)
- * Allow per-dc enabling of hints (CASSANDRA-6157)
- * Add compatibility for Hadoop 0.2.x (CASSANDRA-5201)
- * Fix EstimatedHistogram races (CASSANDRA-6682)
- * Failure detector correctly converts initial value to nanos (CASSANDRA-6658)
- * Add nodetool taketoken to relocate vnodes (CASSANDRA-4445)
- * Fix upgradesstables NPE for non-CF-based indexes (CASSANDRA-6645)
- * Improve nodetool cfhistograms formatting (CASSANDRA-6360)
- * Expose bulk loading progress over JMX (CASSANDRA-4757)
- * Correctly handle null with IF conditions and TTL (CASSANDRA-6623)
- * Account for range/row tombstones in tombstone drop
-   time histogram (CASSANDRA-6522)
- * Stop CommitLogSegment.close() from calling sync() (CASSANDRA-6652)
- * Make commitlog failure handling configurable (CASSANDRA-6364)
- * Avoid overlaps in LCS (CASSANDRA-6688)
- * Improve support for paginating over composites (CASSANDRA-4851)
- * Fix count(*) queries in a mixed cluster (CASSANDRA-6707)
- * Improve repair tasks(snapshot, differencing) concurrency (CASSANDRA-6566)
- * Fix replaying pre-2.0 commit logs (CASSANDRA-6714)
- * Add static columns to CQL3 (CASSANDRA-6561)
- * Optimize single partition batch statements (CASSANDRA-6737)
- * Disallow post-query re-ordering when paging (CASSANDRA-6722)
- * Fix potential paging bug with deleted columns (CASSANDRA-6748)
- * Fix NPE on BulkLoader caused by losing StreamEvent (CASSANDRA-6636)
- * Fix truncating compression metadata (CASSANDRA-6791)
- * Fix UPDATE updating PRIMARY KEY columns implicitly (CASSANDRA-6782)
- * Fix IllegalArgumentException when updating from 1.2 with SuperColumns
-   (CASSANDRA-6733)
- * FBUtilities.singleton() should use the CF comparator (CASSANDRA-6778)
- * Fix CQLSStableWriter.addRow(Map<String, Object>) (CASSANDRA-6526)
- * Fix HSHA server introducing corrupt data (CASSANDRA-6285)
- * Fix CAS conditions for COMPACT STORAGE tables (CASSANDRA-6813)
-Merged from 1.2:
- * Add CMSClassUnloadingEnabled JVM option (CASSANDRA-6541)
- * Catch memtable flush exceptions during shutdown (CASSANDRA-6735)
- * Fix broken streams when replacing with same IP (CASSANDRA-6622)
- * Fix upgradesstables NPE for non-CF-based indexes (CASSANDRA-6645)
- * Fix partition and range deletes not triggering flush (CASSANDRA-6655)
- * Fix mean cells and mean row size per sstable calculations (CASSANDRA-6667)
- * Compact hints after partial replay to clean out tombstones (CASSANDRA-6666)
- * Log USING TTL/TIMESTAMP in a counter update warning (CASSANDRA-6649)
- * Don't exchange schema between nodes with different versions (CASSANDRA-6695)
- * Use real node messaging versions for schema exchange decisions (CASSANDRA-6700)
- * IN on the last clustering columns + ORDER BY DESC yield no results (CASSANDRA-6701)
- * Fix SecondaryIndexManager#deleteFromIndexes() (CASSANDRA-6711)
- * Fix snapshot repair not snapshotting coordinator itself (CASSANDRA-6713)
- * Support negative timestamps for CQL3 dates in query string (CASSANDRA-6718)
- * Avoid NPEs when receiving table changes for an unknown keyspace (CASSANDRA-5631)
- * Fix bootstrapping when there is no schema (CASSANDRA-6685)
+2.1.0-beta1
+ * Add flush directory distinct from compaction directories (CASSANDRA-6357)
+ * Require JNA by default (CASSANDRA-6575)
+ * add listsnapshots command to nodetool (CASSANDRA-5742)
+ * Introduce AtomicBTreeColumns (CASSANDRA-6271, 6692)
+ * Multithreaded commitlog (CASSANDRA-3578)
+ * allocate fixed index summary memory pool and resample cold index summaries 
+   to use less memory (CASSANDRA-5519)
+ * Removed multithreaded compaction (CASSANDRA-6142)
+ * Parallelize fetching rows for low-cardinality indexes (CASSANDRA-1337)
+ * change logging from log4j to logback (CASSANDRA-5883)
+ * switch to LZ4 compression for internode communication (CASSANDRA-5887)
+ * Stop using Thrift-generated Index* classes internally (CASSANDRA-5971)
+ * Remove 1.2 network compatibility code (CASSANDRA-5960)
+ * Remove leveled json manifest migration code (CASSANDRA-5996)
+ * Remove CFDefinition (CASSANDRA-6253)
+ * Use AtomicIntegerFieldUpdater in RefCountedMemory (CASSANDRA-6278)
+ * User-defined types for CQL3 (CASSANDRA-5590)
+ * Use of o.a.c.metrics in nodetool (CASSANDRA-5871, 6406)
+ * Batch read from OTC's queue and cleanup (CASSANDRA-1632)
+ * Secondary index support for collections (CASSANDRA-4511, 6383)
+ * SSTable metadata(Stats.db) format change (CASSANDRA-6356)
+ * Push composites support in the storage engine
+   (CASSANDRA-5417, CASSANDRA-6520)
+ * Add snapshot space used to cfstats (CASSANDRA-6231)
+ * Add cardinality estimator for key count estimation (CASSANDRA-5906)
+ * CF id is changed to be non-deterministic. Data dir/key cache are created
+   uniquely for CF id (CASSANDRA-5202)
+ * New counters implementation (CASSANDRA-6504)
+ * Replace UnsortedColumns, EmptyColumns, TreeMapBackedSortedColumns with new
+   ArrayBackedSortedColumns (CASSANDRA-6630, CASSANDRA-6662, CASSANDRA-6690)
+ * Add option to use row cache with a given amount of rows (CASSANDRA-5357)
+ * Avoid repairing already repaired data (CASSANDRA-5351)
+ * Reject counter updates with USING TTL/TIMESTAMP (CASSANDRA-6649)
+ * Replace index_interval with min/max_index_interval (CASSANDRA-6379)
+ * Lift limitation that order by columns must be selected for IN queries (CASSANDRA-4911)
 
 
 2.0.5
@@ -509,12 +801,7 @@
    (CASSANDRA-5750)
  * Invalidate row cache when dropping CF (CASSANDRA-6351)
  * add non-jamm path for cached statements (CASSANDRA-6293)
- * (Hadoop) Require CFRR batchSize to be at least 2 (CASSANDRA-6114)
- * Fix altering column types (CASSANDRA-6185)
- * cqlsh: fix CREATE/ALTER WITH completion (CASSANDRA-6196)
  * add windows bat files for shell commands (CASSANDRA-6145)
- * Fix potential stack overflow during range tombstones insertion (CASSANDRA-6181)
- * (Hadoop) Make LOCAL_ONE the default consistency level (CASSANDRA-6214)
  * Require logging in for Thrift CQL2/3 statement preparation (CASSANDRA-6254)
  * restrict max_num_tokens to 1536 (CASSANDRA-6267)
  * Nodetool gets default JMX port from cassandra-env.sh (CASSANDRA-6273)
@@ -577,7 +864,6 @@
  * Allow estimated memtable size to exceed slab allocator size (CASSANDRA-6078)
  * Start MeteredFlusher earlier to prevent OOM during CL replay (CASSANDRA-6087)
  * Avoid sending Truncate command to fat clients (CASSANDRA-6088)
- * Allow cache-keys-to-save to be set at runtime (CASSANDRA-5980)
  * Allow where clause conditions to be in parenthesis (CASSANDRA-6037)
  * Do not open non-ssl storage port if encryption option is all (CASSANDRA-3916)
  * Move batchlog replay to its own executor (CASSANDRA-6079)
@@ -648,7 +934,6 @@
  * Fix possible divide-by-zero in HHOM (CASSANDRA-5990)
  * Allow local batchlog writes for CL.ANY (CASSANDRA-5967)
  * Upgrade metrics-core to version 2.2.0 (CASSANDRA-5947)
- * Add snitch, schema version, cluster, partitioner to JMX (CASSANDRA-5881)
  * Fix CqlRecordWriter with composite keys (CASSANDRA-5949)
  * Add snitch, schema version, cluster, partitioner to JMX (CASSANDRA-5881)
  * Allow disabling SlabAllocator (CASSANDRA-5935)
@@ -766,13 +1051,9 @@
  * Add timeout events to query traces (CASSANDRA-5520)
  * Fix serialization of the LEFT gossip value (CASSANDRA-5696)
  * Pig: support for cql3 tables (CASSANDRA-5234)
- * cqlsh: Don't show 'null' in place of empty values (CASSANDRA-5675)
- * Race condition in detecting version on a mixed 1.1/1.2 cluster
-   (CASSANDRA-5692)
  * Fix skipping range tombstones with reverse queries (CASSANDRA-5712)
  * Expire entries out of ThriftSessionManager (CASSANDRA-5719)
  * Don't keep ancestor information in memory (CASSANDRA-5342)
- * cqlsh: fix handling of semicolons inside BATCH queries (CASSANDRA-5697)
  * Expose native protocol server status in nodetool info (CASSANDRA-5735)
  * Fix pathetic performance of range tombstones (CASSANDRA-5677)
  * Fix querying with an empty (impossible) range (CASSANDRA-5573)
@@ -954,7 +1235,6 @@
  * Add nodetool enablebackup/disablebackup (CASSANDRA-5556)
  * cqlsh: fix DESCRIBE after case insensitive USE (CASSANDRA-5567)
 Merged from 1.1
- * Remove buggy thrift max message length option (CASSANDRA-5529)
  * Add retry mechanism to OTC for non-droppable_verbs (CASSANDRA-5393)
  * Use allocator information to improve memtable memory usage estimate
    (CASSANDRA-5497)
@@ -1031,7 +1311,6 @@
  * Fix AssertionError during repair (CASSANDRA-5245)
  * Don't announce migrations to pre-1.2 nodes (CASSANDRA-5334)
 Merged from 1.1:
- * Fix trying to load deleted row into row cache on startup (CASSANDRA-4463)
  * Update offline scrub for 1.0 -> 1.1 directory structure (CASSANDRA-5195)
  * add tmp flag to Descriptor hashcode (CASSANDRA-4021)
  * fix logging of "Found table data in data directories" when only system tables
@@ -1130,9 +1409,7 @@
  * Detect (and warn) unintentional use of the cql2 thrift methods when cql3 was
    intended (CASSANDRA-5172)
  * cli: Quote ks and cf names in schema output when needed (CASSANDRA-5052)
- * Fix bad default for min/max timestamp in SSTableMetadata (CASSANDRA-5372)
  * Fix cf name extraction from manifest in Directories.migrateFile() (CASSANDRA-5242)
- * Support pluggable internode authentication (CASSANDRA-5401)
  * Replace mistaken usage of commons-logging with slf4j (CASSANDRA-5464)
  * Ensure Jackson dependency matches lib (CASSANDRA-5126)
  * Expose droppable tombstone ratio stats over JMX (CASSANDRA-5159)
@@ -1456,7 +1733,6 @@
  * (Hadoop) fix setting key length for old-style mapred api (CASSANDRA-4534)
  * (Hadoop) fix iterating through a resultset consisting entirely
    of tombstoned rows (CASSANDRA-4466)
- * Fix multiple values for CurrentLocal NodeID (CASSANDRA-4626)
 
 
 1.1.3
@@ -1485,8 +1761,6 @@
  * Ensure compacted files are never used, to avoid counter overcount (CASSANDRA-4436)
 Merged from 1.0:
  * Push the validation of secondary index values to the SecondaryIndexManager (CASSANDRA-4240)
- * (Hadoop) fix iterating through a resultset consisting entirely
-   of tombstoned rows (CASSANDRA-4466)
  * allow dropping columns shadowed by not-yet-expired supercolumn or row
    tombstones in PrecompactedRow (CASSANDRA-4396)
 
diff --git a/NEWS.txt b/NEWS.txt
index 102a87b..50b9c7e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -13,6 +13,82 @@
 'sstableloader' tool. You can upgrade the file format of your snapshots
 using the provided 'sstableupgrade' tool.
 
+2.1.1
+=====
+
+New features
+------------
+   - Netty support for epoll on Linux is now enabled.  If for some
+     reason you want to disable it, pass the following system property:
+     -Dcassandra.native.epoll.enabled=false
+
+2.1
+===
+
+New features
+------------
+   - Default data and log locations have changed.  If not set in
+     cassandra.yaml, the data file directory, commitlog directory,
+     and saved caches directory will default to $CASSANDRA_HOME/data/data,
+     $CASSANDRA_HOME/data/commitlog, and $CASSANDRA_HOME/data/saved_caches,
+     respectively.  The log directory now defaults to $CASSANDRA_HOME/logs.
+     If not set, $CASSANDRA_HOME defaults to the top-level directory of
+     the installation.
+     Note that this should only affect source checkouts and tarballs.
+     Deb and RPM packages will continue to use /var/lib/cassandra and
+     /var/log/cassandra in cassandra.yaml.
+   - The SSTable data directory name has changed slightly. Each directory now
+     has a hex string appended after the CF name, e.g.
+         ks/cf-5be396077b811e3a3ab9dc4b9ac088d/
+     This hex string is the unique ColumnFamily ID.
+     Note that existing directories are used as is, so only directories
+     created after the upgrade use the new name format.
+   - Saved key cache files also include the ColumnFamily ID in their file names.
+   - It is now possible to do incremental repairs: sstables that have been
+     repaired are marked with a timestamp and are not included in the next
+     repair session. Use nodetool repair -par -inc to use this feature.
+     A tool to manually mark/unmark sstables as repaired is available in
+     tools/bin/sstablerepairedset. This is particularly important when
+     using LCS, since otherwise any data not repaired in your first
+     incremental repair will be put back in L0.
+   - Bootstrapping now ensures that range movements are consistent,
+     meaning the data for the new node is taken from the node that is no
+     longer responsible for that range of keys.
+     If you want the old behavior (perhaps because a source node was lost),
+     you can set the property -Dcassandra.consistent.rangemovement=false
+   - It is now possible to use quoted identifiers in triggers' names. 
+     WARNING: if you previously used triggers with capital letters in their 
+     names, then you must quote them from now on.
+   - Improved stress tool (http://goo.gl/OTNqiQ)
+   - New incremental repair option (http://goo.gl/MjohJp, http://goo.gl/f8jSme)
+   - Incremental replacement of compacted SSTables (http://goo.gl/JfDBGW)
+   - The row cache can now cache only the head of partitions (http://goo.gl/6TJPH6)
+   - Off-heap memtables (http://goo.gl/YT7znJ)
+   - CQL improvements and additions: User-defined types, tuple types, 2ndary
+     indexing of collections, ... (http://goo.gl/kQl7GW)
+
+Upgrading
+---------
+   - Rolling upgrades from anything pre-2.0.7 are not supported. Furthermore,
+     pre-2.0 sstables are not supported. This means that before upgrading
+     a node to 2.1, the node must be started on 2.0 and
+     'nodetool upgradesstables' must be run (and this holds even for
+     non-rolling upgrades).
+   - For size-tiered compaction users, Cassandra now defaults to ignoring
+     the coldest 5% of sstables.  This can be customized with the
+     cold_reads_to_omit compaction option; 0.0 omits nothing (the old
+     behavior) and 1.0 omits everything.
+   - Multithreaded compaction has been removed.
+   - The counters implementation has been changed, replaced by a safer one
+     with fewer caveats but different performance characteristics. You might
+     have to change your data model to accommodate the new implementation.
+     (See https://issues.apache.org/jira/browse/CASSANDRA-6504 and the
+     blog post at http://goo.gl/qj8iQl for details).
+   - The (per-table) index_interval parameter has been replaced with the
+     min_index_interval and max_index_interval parameters. index_interval
+     has been deprecated.
+   - support for supercolumns has been removed from json2sstable
+
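For reference, the switches called out in the notes above can be exercised roughly as
follows (an illustrative shell sketch; the flag, tool and option names are the ones
quoted in the notes, while conf/cassandra-env.sh and the table name ks.tbl are
placeholders):

    # conf/cassandra-env.sh: opt out of the new 2.1 defaults described above
    JVM_OPTS="$JVM_OPTS -Dcassandra.native.epoll.enabled=false"      # fall back to the NIO event loop
    JVM_OPTS="$JVM_OPTS -Dcassandra.consistent.rangemovement=false"  # pre-2.1 bootstrap behaviour

    # incremental, parallel repair; sstablerepairedset can mark/unmark sstables as repaired
    nodetool repair -par -inc
    tools/bin/sstablerepairedset

    # restore the old size-tiered behaviour of never omitting cold sstables
    cqlsh -e "ALTER TABLE ks.tbl WITH compaction = {'class': 'SizeTieredCompactionStrategy', 'cold_reads_to_omit': '0.0'};"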
 2.0.11
 ======
 New features
@@ -30,7 +106,7 @@
     - If you are using Leveled Compaction, you can now disable doing size-tiered
       compaction in L0 by starting Cassandra with -Dcassandra.disable_stcs_in_l0
       (see CASSANDRA-6621 for details).
-    - Shuffle and taketoken have been removed.  For clusters that choose to
+    - Shuffle and taketoken have been removed.  For clusters that choose to 
       upgrade to vnodes, creating a new datacenter with vnodes and migrating is
       recommended. See http://goo.gl/Sna2S1 for further information.
 
diff --git a/NOTICE.txt b/NOTICE.txt
index 9475c4b..cf7b8dc 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -37,10 +37,6 @@
 (https://github.com/ning/compress)
 Copyright 2009-2010 Ning, Inc.
 
-Alternative map implementation provided by SnapTree
-(https://github.com/nbronson/snaptree)
-Written by Nathan G. Bronson et al.
-
 CQL Native transport uses Netty
 (https://netty.io/)
 Copyright (C) 2011 The Netty Project
@@ -56,3 +52,11 @@
 LMAX Disruptor
 (http://lmax-exchange.github.io/disruptor/)
 Copyright 2011 LMAX Ltd.
+
+Airline
+(https://github.com/airlift/airline)
+Copyright 2011, Dain Sundstrom dain@iq80.com
+Copyright 2010, Cedric Beust cedric@beust.com
+
+HLL++ support provided by stream-lib
+(https://github.com/addthis/stream-lib)
diff --git a/bin/cassandra b/bin/cassandra
index 09fe013..c15a46a 100755
--- a/bin/cassandra
+++ b/bin/cassandra
@@ -138,7 +138,9 @@
     foreground="$2"
     props="$3"
     class="$4"
-    cassandra_parms="-Dlog4j.configuration=log4j-server.properties -Dlog4j.defaultInitOverride=true"
+    cassandra_parms="-Dlogback.configurationFile=logback.xml"
+    cassandra_parms="$cassandra_parms -Dcassandra.logdir=$CASSANDRA_HOME/logs"
+    cassandra_parms="$cassandra_parms -Dcassandra.storagedir=$cassandra_storagedir"
 
     if [ "x$pidpath" != "x" ]; then
         cassandra_parms="$cassandra_parms -Dcassandra-pidfile=$pidpath"
diff --git a/bin/cassandra-cli b/bin/cassandra-cli
index ea52b8e..a2696da 100755
--- a/bin/cassandra-cli
+++ b/bin/cassandra-cli
@@ -44,7 +44,8 @@
 fi
 
 "$JAVA" -ea -cp "$CLASSPATH" -Xmx256M \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.cli.CliMain "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/cassandra-cli.bat b/bin/cassandra-cli.bat
index 0d52bdf..6211371 100644
--- a/bin/cassandra-cli.bat
+++ b/bin/cassandra-cli.bat
@@ -14,30 +14,15 @@
 @REM  See the License for the specific language governing permissions and

 @REM  limitations under the License.

 

-

 @echo off

 if "%OS%" == "Windows_NT" setlocal

 

+pushd "%~dp0"

+call cassandra.in.bat

+

 if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..

 if NOT DEFINED JAVA_HOME goto :err

 

-REM Ensure that any user defined CLASSPATH variables are not used on startup

-set CLASSPATH=

-

-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.

-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

-goto okClasspath

-

-:append

-set CLASSPATH=%CLASSPATH%;%1

-goto :eof

-

-:okClasspath

-REM Include the build\classes\main directory so it works in development

-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";"%CASSANDRA_HOME%\build\classes\thrift"

-goto runCli

-

-:runCli

 echo Starting Cassandra Client

 "%JAVA_HOME%\bin\java" -cp %CASSANDRA_CLASSPATH% org.apache.cassandra.cli.CliMain %*

 goto finally

diff --git a/bin/cassandra.bat b/bin/cassandra.bat
index 8a3bf7f..1606dcc 100644
--- a/bin/cassandra.bat
+++ b/bin/cassandra.bat
@@ -18,6 +18,7 @@
 if "%OS%" == "Windows_NT" setlocal

 

 set ARG=%1

+if /i "%ARG%" == "LEGACY" goto runLegacy

 set INSTALL="INSTALL"

 set UNINSTALL="UNINSTALL"

 

@@ -25,14 +26,36 @@
 if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%

 popd

 

+REM -----------------------------------------------------------------------------

+REM See if we have access to run unsigned powershell scripts

+for /F "delims=" %%i in ('powershell Get-ExecutionPolicy') do set PERMISSION=%%i

+if "%PERMISSION%" == "Unrestricted" goto runPowerShell

+goto runLegacy

+

+REM -----------------------------------------------------------------------------

+:runPowerShell

+echo Detected powershell execution permissions.  Running with enhanced startup scripts.

+set errorlevel=

+powershell /file "%CASSANDRA_HOME%\bin\cassandra.ps1" %*

+exit /b %errorlevel%

+

+REM -----------------------------------------------------------------------------

+:runLegacy

+echo WARNING! Powershell script execution unavailable.

+echo    Please use 'powershell Set-ExecutionPolicy Unrestricted'

+echo    on this user-account to run cassandra with fully featured

+echo    functionality on this platform.

+

+echo Starting with legacy startup options

+

 if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.service.CassandraDaemon

 if NOT DEFINED JAVA_HOME goto :err

 

 REM ***** JAVA options *****

 set JAVA_OPTS=-ea^

- -javaagent:"%CASSANDRA_HOME%\lib\jamm-0.2.5.jar"^

- -Xms1G^

- -Xmx1G^

+ -javaagent:"%CASSANDRA_HOME%\lib\jamm-0.2.6.jar"^

+ -Xms2G^

+ -Xmx2G^

  -XX:+HeapDumpOnOutOfMemoryError^

  -XX:+UseParNewGC^

  -XX:+UseConcMarkSweepGC^

@@ -44,8 +67,7 @@
  -Dcom.sun.management.jmxremote.port=7199^

  -Dcom.sun.management.jmxremote.ssl=false^

  -Dcom.sun.management.jmxremote.authenticate=false^

- -Dlog4j.configuration=log4j-server.properties^

- -Dlog4j.defaultInitOverride=true

+ -Dlogback.configurationFile=logback.xml

 

 REM ***** CLASSPATH library setting *****

 

@@ -64,6 +86,8 @@
 REM Include the build\classes\main directory so it works in development

 set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";"%CASSANDRA_HOME%\build\classes\thrift"

 set CASSANDRA_PARAMS=-Dcassandra -Dcassandra-foreground=yes

+set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% "-Dcassandra.logdir=%CASSANDRA_HOME%\logs"

+set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% "-Dcassandra.storagedir=%CASSANDRA_HOME%\data"

 if /i "%ARG%" == "INSTALL" goto doInstallOperation

 if /i "%ARG%" == "UNINSTALL" goto doInstallOperation

 goto runDaemon

@@ -103,7 +127,7 @@
  --StopMode=jvm --StopClass=%CASSANDRA_MAIN%  --StopMethod=stop ^

  ++JvmOptions=%JAVA_OPTS_DELM% ++JvmOptions=-DCassandra ^

  --PidFile pid.txt

- 

+

 echo Installation of %SERVICE_JVM% is complete

 goto finally

 

diff --git a/bin/sstablesplit.bat b/bin/cassandra.in.bat
similarity index 71%
rename from bin/sstablesplit.bat
rename to bin/cassandra.in.bat
index ef88670..e3304e7 100644
--- a/bin/sstablesplit.bat
+++ b/bin/cassandra.in.bat
@@ -1,61 +1,50 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneSplitter
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM

+@REM  Licensed to the Apache Software Foundation (ASF) under one or more

+@REM  contributor license agreements.  See the NOTICE file distributed with

+@REM  this work for additional information regarding copyright ownership.

+@REM  The ASF licenses this file to You under the Apache License, Version 2.0

+@REM  (the "License"); you may not use this file except in compliance with

+@REM  the License.  You may obtain a copy of the License at

+@REM

+@REM      http://www.apache.org/licenses/LICENSE-2.0

+@REM

+@REM  Unless required by applicable law or agreed to in writing, software

+@REM  distributed under the License is distributed on an "AS IS" BASIS,

+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+@REM  See the License for the specific language governing permissions and

+@REM  limitations under the License.

+

+@echo off

+pushd %~dp0..

+if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%

+popd

+

+if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"

+

+REM the default location for commitlogs, sstables, and saved caches

+REM if not set in cassandra.yaml

+set cassandra_storagedir="%CASSANDRA_HOME%\data"

+

+REM JAVA_HOME can optionally be set here

+REM set JAVA_HOME="<directory>"

+

+REM ***** CLASSPATH library setting *****

+

+REM Ensure that any user defined CLASSPATH variables are not used on startup

+set CLASSPATH="%CASSANDRA_HOME%\conf"

+

+REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.

+for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

+goto :okClasspath

+

+:append

+set CLASSPATH=%CLASSPATH%;%1

+goto :eof

+

+:okClasspath

+

+REM Include the build\classes\main directory so it works in development

+set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"

+

+REM Add the default storage location.  Can be overridden in conf\cassandra.yaml

+set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% "-Dcassandra.storagedir=%CASSANDRA_HOME%\data"

diff --git a/bin/cassandra.in.sh b/bin/cassandra.in.sh
index 29e0d0e..50e6eaf 100644
--- a/bin/cassandra.in.sh
+++ b/bin/cassandra.in.sh
@@ -30,6 +30,10 @@
 cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/thrift"
 #cassandra_bin="$cassandra_home/build/cassandra.jar"
 
+# the default location for commitlogs, sstables, and saved caches
+# if not set in cassandra.yaml
+cassandra_storagedir="$CASSANDRA_HOME/data"
+
 # JAVA_HOME can optionally be set here
 #JAVA_HOME=/usr/local/jdk6
 
@@ -44,5 +48,5 @@
 if [ "$JVM_VENDOR" != "OpenJDK" -o "$JVM_VERSION" \> "1.6.0" ] \
       || [ "$JVM_VERSION" = "1.6.0" -a "$JVM_PATCH_VERSION" -ge 23 ]
 then
-    JAVA_AGENT="$JAVA_AGENT -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.5.jar"
-fi
\ No newline at end of file
+    JAVA_AGENT="$JAVA_AGENT -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.6.jar"
+fi
diff --git a/bin/cassandra.ps1 b/bin/cassandra.ps1
new file mode 100644
index 0000000..33ff97a
--- /dev/null
+++ b/bin/cassandra.ps1
@@ -0,0 +1,308 @@
+#

+# Licensed to the Apache Software Foundation (ASF) under one or more

+# contributor license agreements.  See the NOTICE file distributed with

+# this work for additional information regarding copyright ownership.

+# The ASF licenses this file to You under the Apache License, Version 2.0

+# (the "License"); you may not use this file except in compliance with

+# the License.  You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+param (

+    [switch]$install,

+    [switch]$uninstall,

+    [switch]$help,

+    [switch]$v,

+    [switch]$s,

+    [switch]$f,

+    [string]$p,

+    [string]$H,

+    [string]$E

+)

+

+$pidfile = "pid.txt"

+

+#-----------------------------------------------------------------------------

+Function ValidateArguments

+{

+    if ($install -and $uninstall)

+    {

+        exit

+    }

+    if ($help)

+    {

+        PrintUsage

+    }

+}

+

+#-----------------------------------------------------------------------------

+Function PrintUsage

+{

+    echo @"

+usage: cassandra.ps1 [-f] [-h] [-p pidfile] [-H dumpfile] [-D arg] [-E errorfile] [-install | -uninstall] [-help]

+    -f              Run cassandra in foreground

+    -install        install cassandra as a service

+    -uninstall      remove cassandra service

+    -p              pidfile tracked by server and removed on close (defaults to pid.txt)

+    -H              change JVM HeapDumpPath

+    -D              items to append to JVM_OPTS

+    -E              change JVM ErrorFile

+    -v              Print cassandra version and exit

+    -s              Show detailed jvm environment information during launch

+    -help           print this message

+

+    NOTE: installing cassandra as a service requires Commons Daemon Service Runner

+        available at http://commons.apache.org/proper/commons-daemon/"

+"@

+    exit

+}

+

+#-----------------------------------------------------------------------------

+# Note: throughout these scripts we're replacing \ with /.  This allows clean

+# operation on both command-prompt and cygwin-based environments.

+Function Main

+{

+    ValidateArguments

+

+    # support direct run of .ps1 file w/out batch file

+    if ($env:CASSANDRA_HOME -eq $null)

+    {

+        $scriptDir = Split-Path $script:MyInvocation.MyCommand.Path

+        $env:CASSANDRA_HOME = (Get-Item $scriptDir).parent.FullName

+    }

+    . "$env:CASSANDRA_HOME\bin\source-conf.ps1"

+

+    $conf = Find-Conf

+    if ($s)

+    {

+        echo "Sourcing cassandra config file: $conf"

+    }

+    . $conf

+

+    SetCassandraEnvironment

+    if ($v)

+    {

+        PrintVersion

+        exit

+    }

+    $pidfile = "$env:CASSANDRA_HOME\$pidfile"

+

+    $logdir = "$env:CASSANDRA_HOME/logs"

+    $storagedir = "$env:CASSANDRA_HOME/data"

+    $env:CASSANDRA_PARAMS = $env:CASSANDRA_PARAMS + " -Dcassandra.logdir=""$logdir"" -Dcassandra.storagedir=""$storagedir"""

+

+    # Other command line params

+    if ($H)

+    {

+        $env:JVM_OPTS = $env:JVM_OPTS + " -XX:HeapDumpPath=$H"

+    }

+    if ($E)

+    {

+        $env:JVM_OPTS = $env:JVM_OPTS + " -XX:ErrorFile=$E"

+    }

+    if ($p)

+    {

+        $pidfile = "$p"

+        $env:CASSANDRA_PARAMS = $env:CASSANDRA_PARAMS + ' -Dcassandra-pidfile="' + "$pidfile" + '"'

+    }

+

+    # Parse -D JVM_OPTS

+    for ($i = 0; $i -lt $script:args.Length; ++$i)

+    {

+        if ($script:args[$i].Substring(0,2) -eq "-D")

+        {

+            $param = $script:args[$i].Substring(2)

+            $env:JVM_OPTS = "$env:JVM_OPTS -D$param"

+        }

+    }

+

+    if ($install -or $uninstall)

+    {

+        HandleInstallation

+    }

+    else

+    {

+        RunCassandra($f)

+    }

+}

+

+#-----------------------------------------------------------------------------

+Function HandleInstallation

+{

+    $SERVICE_JVM = """cassandra"""

+    $PATH_PRUNSRV = "$env:CASSANDRA_HOME\bin\daemon"

+    $PR_LOGPATH = $serverPath

+

+    if (-Not (Test-Path $PATH_PRUNSRV\prunsrv.exe))

+    {

+        Write-Warning "Cannot find $PATH_PRUNSRV\prunsrv.exe.  Please download package from http://www.apache.org/dist/commons/daemon/binaries/windows/ to install as a service."

+        Break

+    }

+

+    If (-NOT ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator"))

+    {

+        Write-Warning "Cannot perform installation without admin credentials.  Please re-run as administrator."

+        Break

+    }

+    if (!$env:PRUNSRV)

+    {

+        $env:PRUNSRV="$PATH_PRUNSRV\prunsrv"

+    }

+

+    $regPath = "HKLM:\SYSTEM\CurrentControlSet\services\Tcpip\Parameters\"

+

+    echo "Attempting to delete existing $SERVICE_JVM service..."

+    Start-Sleep -s 2

+    $proc = Start-Process -FilePath "$env:PRUNSRV" -ArgumentList "//DS//$SERVICE_JVM" -PassThru -WindowStyle Hidden

+

+    echo "Reverting to default TCP keepalive settings (2 hour timeout)"

+    Remove-ItemProperty -Path $regPath -Name KeepAliveTime -EA SilentlyContinue

+

+    # Quit out if this is uninstall only

+    if ($uninstall)

+    {

+        return

+    }

+

+    echo "Installing [$SERVICE_JVM]."

+    Start-Sleep -s 2

+    $proc = Start-Process -FilePath "$env:PRUNSRV" -ArgumentList "//IS//$SERVICE_JVM" -PassThru -WindowStyle Hidden

+

+    echo "Setting launch parameters for [$SERVICE_JVM]"

+    Start-Sleep -s 2

+

+    # Change delim from " -" to ";-" in JVM_OPTS for prunsrv

+    $env:JVM_OPTS = $env:JVM_OPTS -replace " -", ";-"

+    $env:JVM_OPTS = $env:JVM_OPTS -replace " -", ";-"

+

+    # Strip off leading ; if it's there

+    $env:JVM_OPTS = $env:JVM_OPTS.TrimStart(";")

+

+    # Broken multi-line for convenience - glued back together in a bit

+    $args = @"

+//US//$SERVICE_JVM

+ --Jvm=auto --StdOutput auto --StdError auto

+ --Classpath=$env:CLASSPATH

+ --StartMode=jvm --StartClass=$env:CASSANDRA_MAIN --StartMethod=main

+ --StopMode=jvm --StopClass=$env:CASSANDRA_MAIN  --StopMethod=stop

+ ++JvmOptions=$env:JVM_OPTS ++JvmOptions=-DCassandra

+ --PidFile "$pidfile"

+"@

+    $args = $args -replace [Environment]::NewLine, ""

+    $proc = Start-Process -FilePath "$env:PRUNSRV" -ArgumentList $args -PassThru -WindowStyle Hidden

+

+    echo "Setting KeepAliveTimer to 5 minutes for TCP keepalive"

+    Set-ItemProperty -Path $regPath -Name KeepAliveTime -Value 300000

+

+    echo "Installation of [$SERVICE_JVM] is complete"

+}

+

+#-----------------------------------------------------------------------------

+Function PrintVersion()

+{

+    Write-Host "Cassandra Version: " -NoNewLine

+    $pinfo = New-Object System.Diagnostics.ProcessStartInfo

+    $pinfo.FileName = "$env:JAVA_BIN"

+    $pinfo.UseShellExecute = $false

+    $pinfo.Arguments = "-cp $env:CLASSPATH org.apache.cassandra.tools.GetVersion"

+    $p = New-Object System.Diagnostics.Process

+    $p.StartInfo = $pinfo

+    $p.Start() | Out-Null

+    $p.WaitForExit()

+}

+

+#-----------------------------------------------------------------------------

+Function RunCassandra([string]$foreground)

+{

+    echo "Starting cassandra server"

+    $cmd = @"

+$env:JAVA_BIN

+"@

+    $arg1 = $env:CASSANDRA_PARAMS

+    $arg2 = $env:JVM_OPTS

+    $arg3 = "-cp $env:CLASSPATH"

+    $arg4 = @"

+"$env:CASSANDRA_MAIN"

+"@

+

+    $proc = $null

+

+    if ($s)

+    {

+        echo "Running cassandra with: [$cmd $arg1 $arg2 $arg3 $arg4]"

+    }

+

+    if ($foreground -ne "False")

+    {

+        $cygwin = $false

+        try

+        {

+            $uname = uname -o

+            if ($uname.CompareTo("Cygwin") -eq 0)

+            {

+                $cygwin = $true

+            }

+        }

+        catch

+        {

+            # Failed at uname call, not in cygwin

+        }

+

+        if ($cygwin)

+        {

+            # if running on cygwin, we cannot capture ctrl+c signals as mintty traps them and then

+            # SIGKILLs processes, so we'll need to record our $pidfile file for future

+            # stop-server usage

+            if (!$p)

+            {

+                echo "Detected cygwin runtime environment.  Adding -Dcassandra-pidfile=$pidfile to JVM params as control+c trapping on mintty is inconsistent"

+                $arg2 = $arg2 + " -Dcassandra-pidfile=$pidfile"

+            }

+        }

+

+        $arg2 = $arg2 + " -Dcassandra-foreground=yes"

+

+        $pinfo = New-Object System.Diagnostics.ProcessStartInfo

+        $pinfo.FileName = "$env:JAVA_BIN"

+        $pinfo.RedirectStandardInput = $true

+        $pinfo.UseShellExecute = $false

+        $pinfo.Arguments = $arg1,$arg2,$arg3,$arg4

+        $p = New-Object System.Diagnostics.Process

+        $p.StartInfo = $pinfo

+        $p.Start() | Out-Null

+        echo $p.Id > $pidfile

+        $p.WaitForExit()

+    }

+    else

+    {

+        $proc = Start-Process -FilePath "$cmd" -ArgumentList $arg1,$arg2,$arg3,$arg4 -PassThru -WindowStyle Hidden

+

+        $exitCode = $?

+

+        try

+        {

+            echo $proc.Id > $pidfile

+        }

+        catch

+        {

+            echo @"

+WARNING! Failed to write pidfile to $pidfile.  stop-server.bat and

+    startup protection will not be available.

+"@

+            exit 1

+        }

+

+        if (-Not $exitCode)

+        {

+            exit 1

+        }

+    }

+}

+

+#-----------------------------------------------------------------------------

+Main

diff --git a/bin/cqlsh b/bin/cqlsh
index c99b98c..763a828 100755
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -30,15 +30,14 @@
 ":"""
 
 from __future__ import with_statement
+from uuid import UUID
 
 description = "CQL Shell for Apache Cassandra"
-version = "4.1.1"
+version = "5.0.1"
 
 from StringIO import StringIO
-from itertools import groupby
 from contextlib import contextmanager, closing
 from glob import glob
-from uuid import UUID
 
 import cmd
 import sys
@@ -64,8 +63,7 @@
 except ImportError:
     pass
 
-CQL_LIB_PREFIX = 'cql-internal-only-'
-THRIFT_LIB_PREFIX = 'thrift-python-internal-only-'
+CQL_LIB_PREFIX = 'cassandra-driver-internal-only-'
 
 CASSANDRA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
 
@@ -88,24 +86,30 @@
 cql_zip = find_zip(CQL_LIB_PREFIX)
 if cql_zip:
     ver = os.path.splitext(os.path.basename(cql_zip))[0][len(CQL_LIB_PREFIX):]
-    sys.path.insert(0, os.path.join(cql_zip, 'cql-' + ver))
-thrift_zip = find_zip(THRIFT_LIB_PREFIX)
-if thrift_zip:
-    sys.path.insert(0, thrift_zip)
+    sys.path.insert(0, os.path.join(cql_zip, 'cassandra-driver-' + ver))
 
+third_parties = ('futures-', 'six-')
+
+for lib in third_parties:
+    lib_zip = find_zip(lib)
+    if lib_zip:
+        sys.path.insert(0, lib_zip)
+
+warnings.filterwarnings("ignore", r".*blist.*")
 try:
-    import cql
+    import cassandra
 except ImportError, e:
-    sys.exit("\nPython CQL driver not installed, or not on PYTHONPATH.\n"
-             'You might try "easy_install cql".\n\n'
+    sys.exit("\nPython Cassandra driver not installed, or not on PYTHONPATH.\n"
+             'You might try "pip install cassandra-driver".\n\n'
              'Python: %s\n'
              'Module load path: %r\n\n'
              'Error: %s\n' % (sys.executable, sys.path, e))
 
-import cql.decoders
-from cql.cursor import _VOID_DESCRIPTION
-from cql.cqltypes import (cql_types, cql_typename, lookup_casstype, lookup_cqltype,
-                          CassandraType, ReversedType, CompositeType)
+from cassandra.cluster import Cluster, PagedResult
+from cassandra.query import SimpleStatement, ordered_dict_factory
+from cassandra.policies import WhiteListRoundRobinPolicy
+from cassandra.metadata import protect_name, protect_names, protect_value
+from cassandra.auth import PlainTextAuthProvider
 
 # cqlsh should run correctly when run out of a Cassandra source tree,
 # out of an unpacked Cassandra tarball, and after a proper package install.
@@ -113,37 +117,21 @@
 if os.path.isdir(cqlshlibdir):
     sys.path.insert(0, cqlshlibdir)
 
-from cqlshlib import cqlhandling, cql3handling, pylexotron
-from cqlshlib.displaying import (RED, BLUE, ANSI_RESET, COLUMN_NAME_COLORS,
+from cqlshlib import cqlhandling, cql3handling, pylexotron, sslhandling, async_insert, meter
+from cqlshlib.displaying import (RED, BLUE, CYAN, ANSI_RESET, COLUMN_NAME_COLORS,
                                  FormattedValue, colorme)
-from cqlshlib.formatting import format_by_type
+from cqlshlib.formatting import format_by_type, formatter_for, format_value_utype
 from cqlshlib.util import trim_if_present
-from cqlshlib.tracing import print_trace_session
+from cqlshlib.tracing import print_trace_session, print_trace
 
-HISTORY_DIR = os.path.expanduser(os.path.join('~', '.cassandra'))
-CONFIG_FILE = os.path.join(HISTORY_DIR, 'cqlshrc')
-HISTORY = os.path.join(HISTORY_DIR, 'cqlsh_history')
-if not os.path.exists(HISTORY_DIR):
-    try:
-        os.mkdir(HISTORY_DIR)
-    except OSError:
-        print '\nWarning: Cannot create directory at `%s`. Command history will not be saved.\n' % HISTORY_DIR
-
-OLD_CONFIG_FILE = os.path.expanduser(os.path.join('~', '.cqlshrc'))
-if os.path.exists(OLD_CONFIG_FILE):
-    os.rename(OLD_CONFIG_FILE, CONFIG_FILE)
-OLD_HISTORY = os.path.expanduser(os.path.join('~', '.cqlsh_history'))
-if os.path.exists(OLD_HISTORY):
-    os.rename(OLD_HISTORY, HISTORY)
-
-DEFAULT_HOST = 'localhost'
-DEFAULT_PORT = 9160
-DEFAULT_CQLVER = '3.1.1'
-DEFAULT_TRANSPORT_FACTORY = 'cqlshlib.tfactory.regular_transport_factory'
+DEFAULT_HOST = '127.0.0.1'
+DEFAULT_PORT = 9042
+DEFAULT_CQLVER = '3.2.0'
+DEFAULT_PROTOCOL_VERSION = 3
 
 DEFAULT_TIME_FORMAT = '%Y-%m-%d %H:%M:%S%z'
 DEFAULT_FLOAT_PRECISION = 5
-DEFAULT_SELECT_LIMIT = 10000
+DEFAULT_MAX_TRACE_WAIT = 10
 
 if readline is not None and readline.__doc__ is not None and 'libedit' in readline.__doc__:
     DEFAULT_COMPLETEKEY = '\t'
@@ -162,26 +150,55 @@
                   help='Always use color output')
 parser.add_option("--no-color", action='store_false', dest='color',
                   help='Never use color output')
+parser.add_option('--ssl', action='store_true', help='Use SSL', default=False)
 parser.add_option("-u", "--username", help="Authenticate as user.")
 parser.add_option("-p", "--password", help="Authenticate using password.")
 parser.add_option('-k', '--keyspace', help='Authenticate to the given keyspace.')
 parser.add_option("-f", "--file", help="Execute commands from FILE, then exit")
-parser.add_option("-t", "--transport-factory",
-                  help="Use the provided Thrift transport factory function.")
 parser.add_option('--debug', action='store_true',
                   help='Show additional debugging information')
+parser.add_option("--cqlshrc", help="Specify an alternative cqlshrc file location.")
 parser.add_option('--cqlversion', default=DEFAULT_CQLVER,
                   help='Specify a particular CQL version (default: %default).'
                        ' Examples: "3.0.3", "3.1.0"')
 parser.add_option("-e", "--execute", help='Execute the statement and quit.')
 
-CQL_ERRORS = (cql.Error,)
-try:
-    from thrift.Thrift import TException
-except ImportError:
-    pass
+optvalues = optparse.Values()
+(options, arguments) = parser.parse_args(sys.argv[1:], values=optvalues)
+
+#BEGIN history/config definition
+HISTORY_DIR = os.path.expanduser(os.path.join('~', '.cassandra'))
+
+if hasattr(options, 'cqlshrc'):
+    CONFIG_FILE = options.cqlshrc
+    if not os.path.exists(CONFIG_FILE):
+        print '\nWarning: Specified cqlshrc location `%s` does not exist.  Using `%s` instead.\n' % (CONFIG_FILE, HISTORY_DIR)
+        CONFIG_FILE = os.path.join(HISTORY_DIR, 'cqlshrc')
 else:
-    CQL_ERRORS += (TException,)
+    CONFIG_FILE = os.path.join(HISTORY_DIR, 'cqlshrc')
+
+HISTORY = os.path.join(HISTORY_DIR, 'cqlsh_history')
+if not os.path.exists(HISTORY_DIR):
+    try:
+        os.mkdir(HISTORY_DIR)
+    except OSError:
+        print '\nWarning: Cannot create directory at `%s`. Command history will not be saved.\n' % HISTORY_DIR
+
+OLD_CONFIG_FILE = os.path.expanduser(os.path.join('~', '.cqlshrc'))
+if os.path.exists(OLD_CONFIG_FILE):
+    os.rename(OLD_CONFIG_FILE, CONFIG_FILE)
+OLD_HISTORY = os.path.expanduser(os.path.join('~', '.cqlsh_history'))
+if os.path.exists(OLD_HISTORY):
+    os.rename(OLD_HISTORY, HISTORY)
+#END history/config definition
+
+CQL_ERRORS = (
+    cassandra.AlreadyExists, cassandra.AuthenticationFailed, cassandra.InvalidRequest,
+    cassandra.Timeout, cassandra.Unauthorized, cassandra.OperationTimedOut,
+    cassandra.cluster.NoHostAvailable,
+    cassandra.connection.ConnectionBusy, cassandra.connection.ProtocolError, cassandra.connection.ConnectionException,
+    cassandra.protocol.ErrorMessage, cassandra.protocol.InternalError, cassandra.query.TraceUnavailable
+)
 
 debug_completion = bool(os.environ.get('CQLSH_DEBUG_COMPLETION', '') == 'YES')
 
@@ -200,6 +217,7 @@
     'debug',
     'tracing',
     'expand',
+    'paging',
     'exit',
     'quit'
 )
@@ -227,6 +245,7 @@
                    | <tracingCommand>
                    | <expandCommand>
                    | <exitCommand>
+                   | <pagingCommand>
                    ;
 
 <describeCommand> ::= ( "DESCRIBE" | "DESC" )
@@ -235,7 +254,9 @@
                                   | ( "COLUMNFAMILY" | "TABLE" ) cf=<columnFamilyName>
                                   | ( "COLUMNFAMILIES" | "TABLES" )
                                   | "FULL"? "SCHEMA"
-                                  | "CLUSTER" )
+                                  | "CLUSTER"
+                                  | "TYPES"
+                                  | "TYPE" ut=<userTypeName>)
                     ;
 
 <consistencyCommand> ::= "CONSISTENCY" ( level=<consistencyLevel> )?
@@ -288,6 +309,9 @@
 <expandCommand> ::= "EXPAND" ( switch=( "ON" | "OFF" ) )?
                    ;
 
+<pagingCommand> ::= "PAGING" ( switch=( "ON" | "OFF" ) )?
+                  ;
+
 <exitCommand> ::= "exit" | "quit"
                 ;
 
@@ -371,15 +395,15 @@
 class VersionNotSupported(Exception):
     pass
 
+class UserTypeNotFound(Exception):
+    pass
+
 class DecodeError(Exception):
     verb = 'decode'
 
-    def __init__(self, thebytes, err, expectedtype, colname=None):
+    def __init__(self, thebytes, err, colname=None):
         self.thebytes = thebytes
         self.err = err
-        if isinstance(expectedtype, type) and issubclass(expectedtype, CassandraType):
-            expectedtype = expectedtype.cql_parameterized_type()
-        self.expectedtype = expectedtype
         self.colname = colname
 
     def __str__(self):
@@ -389,8 +413,8 @@
         what = 'value %r' % (self.thebytes,)
         if self.colname is not None:
             what = 'value %r (for column %r)' % (self.thebytes, self.colname)
-        return 'Failed to %s %s as %s: %s' \
-               % (self.verb, what, self.expectedtype, self.err)
+        return 'Failed to %s %s : %s' \
+               % (self.verb, what, self.err)
 
     def __repr__(self):
         return '<%s %s>' % (self.__class__.__name__, self.message())
@@ -405,16 +429,14 @@
     vertuple = tuple(map(int, ver_parts[0].split('.')) + [ver_parts[1]])
     return ver, vertuple
 
-def format_value(val, typeclass, output_encoding, addcolor=False, time_format=None,
+def format_value(val, output_encoding, addcolor=False, time_format=None,
                  float_precision=None, colormap=None, nullval=None):
     if isinstance(val, DecodeError):
         if addcolor:
             return colorme(repr(val.thebytes), colormap, 'error')
         else:
             return FormattedValue(repr(val.thebytes))
-    if not issubclass(typeclass, CassandraType):
-        typeclass = lookup_casstype(typeclass)
-    return format_by_type(typeclass, val, output_encoding, colormap=colormap,
+    return format_by_type(type(val), val, output_encoding, colormap=colormap,
                           addcolor=addcolor, nullval=nullval, time_format=time_format,
                           float_precision=float_precision)
 
@@ -444,6 +466,26 @@
         words = desc[0] + ' and ' + words
     return words
 
+
+def auto_format_udts():
+    # when we see a new user defined type, set up the shell formatting for it
+    udt_apply_params = cassandra.cqltypes.UserType.apply_parameters
+    def new_apply_params(cls, *args, **kwargs):
+        udt_class = udt_apply_params(*args, **kwargs)
+        formatter_for(udt_class.typename)(format_value_utype)
+        return udt_class
+
+    cassandra.cqltypes.UserType.udt_apply_parameters = classmethod(new_apply_params)
+
+    make_udt_class = cassandra.cqltypes.UserType.make_udt_class
+    def new_make_udt_class(cls, *args, **kwargs):
+        udt_class = make_udt_class(*args, **kwargs)
+        formatter_for(udt_class.typename)(format_value_utype)
+        return udt_class
+
+    cassandra.cqltypes.UserType.make_udt_class = classmethod(new_make_udt_class)
+
+
 class Shell(cmd.Cmd):
     custom_prompt = os.getenv('CQLSH_PROMPT', '')
     if custom_prompt is not '':
@@ -452,44 +494,56 @@
     continue_prompt = "   ... "
     keyspace_prompt = custom_prompt + "cqlsh:%s> "
     keyspace_continue_prompt = "%s    ... "
-    num_retries = 4
     show_line_nums = False
     debug = False
     stop = False
     last_hist = None
     shunted_query_out = None
+    use_paging = True
     csv_dialect_defaults = dict(delimiter=',', doublequote=False,
                                 escapechar='\\', quotechar='"')
+    default_page_size = 100
 
-    def __init__(self, hostname, port, transport_factory, color=False,
+    def __init__(self, hostname, port, color=False,
                  username=None, password=None, encoding=None, stdin=None, tty=True,
                  completekey=DEFAULT_COMPLETEKEY, use_conn=None,
                  cqlver=DEFAULT_CQLVER, keyspace=None,
                  tracing_enabled=False, expand_enabled=False,
                  display_time_format=DEFAULT_TIME_FORMAT,
                  display_float_precision=DEFAULT_FLOAT_PRECISION,
-                 single_statement=None):
+                 max_trace_wait=DEFAULT_MAX_TRACE_WAIT,
+                 ssl=False,
+                 single_statement=None,
+                 client_timeout=10):
         cmd.Cmd.__init__(self, completekey=completekey)
         self.hostname = hostname
         self.port = port
-        self.transport_factory = transport_factory
-
-        if username and not password:
-            password = getpass.getpass()
-
+        self.auth_provider = None
+        if username:
+            if not password:
+                password = getpass.getpass()
+            self.auth_provider = PlainTextAuthProvider(username=username, password=password)
         self.username = username
-        self.password = password
         self.keyspace = keyspace
         self.tracing_enabled = tracing_enabled
         self.expand_enabled = expand_enabled
-        if use_conn is not None:
+        if use_conn:
             self.conn = use_conn
         else:
-            transport = transport_factory(hostname, port, os.environ, CONFIG_FILE)
-            self.conn = cql.connect(hostname, port, keyspace=keyspace, user=username, 
-                                    password=password, cql_version=cqlver, transport=transport)
+            self.conn = Cluster(contact_points=(self.hostname,), port=self.port, cql_version=cqlver,
+                                protocol_version=DEFAULT_PROTOCOL_VERSION,
+                                auth_provider=self.auth_provider,
+                                ssl_options=sslhandling.ssl_settings(hostname, CONFIG_FILE) if ssl else None,
+                                load_balancing_policy=WhiteListRoundRobinPolicy([self.hostname]))
+        self.owns_connection = not use_conn
         self.set_expanded_cql_version(cqlver)
-        self.cursor = self.conn.cursor()
+
+        if keyspace:
+            self.session = self.conn.connect(keyspace)
+        else:
+            self.session = self.conn.connect()
+        self.session.default_timeout = client_timeout
+        self.session.row_factory = ordered_dict_factory
         self.get_connection_versions()
 
         self.current_keyspace = keyspace
@@ -497,6 +551,8 @@
         self.color = color
         self.display_time_format = display_time_format
         self.display_float_precision = display_float_precision
+        self.max_trace_wait = max_trace_wait
+        self.session.max_trace_wait = max_trace_wait
         if encoding is None:
             encoding = locale.getpreferredencoding()
         self.encoding = encoding
@@ -518,49 +574,46 @@
             self.show_line_nums = True
         self.stdin = stdin
         self.query_out = sys.stdout
+        self.consistency_level = cassandra.ConsistencyLevel.ONE
+        # the python driver returns BLOBs as string, but we expect them as bytearrays
+        cassandra.cqltypes.BytesType.deserialize = staticmethod(lambda byts, protocol_version: bytearray(byts))
+        cassandra.cqltypes.CassandraType.support_empty_values = True
+
+        auto_format_udts()
+
         self.empty_lines = 0
         self.statement_error = False
         self.single_statement = single_statement
-        # see CASSANDRA-7399
-        cql.cqltypes.CompositeType.cql_parameterized_type = classmethod(lambda cls: "'%s'" % cls.cass_parameterized_type_with(cls.subtypes, True))
 
     def set_expanded_cql_version(self, ver):
         ver, vertuple = full_cql_version(ver)
-        self.set_cql_version(ver)
         self.cql_version = ver
         self.cql_ver_tuple = vertuple
 
     def cqlver_atleast(self, major, minor=0, patch=0):
         return self.cql_ver_tuple[:3] >= (major, minor, patch)
 
-    def cassandraver_atleast(self, major, minor=0, patch=0):
-        return self.cass_ver_tuple[:3] >= (major, minor, patch)
-
-    def myformat_value(self, val, casstype, **kwargs):
+    def myformat_value(self, val, **kwargs):
         if isinstance(val, DecodeError):
             self.decoding_errors.append(val)
         try:
-            return format_value(val, casstype, self.output_codec.name,
+            return format_value(val, self.output_codec.name,
                                 addcolor=self.color, time_format=self.display_time_format,
                                 float_precision=self.display_float_precision, **kwargs)
         except Exception, e:
-            err = FormatError(val, e, casstype)
+            err = FormatError(val, e)
             self.decoding_errors.append(err)
-            return format_value(err, None, self.output_codec.name, addcolor=self.color)
+            return format_value(err, self.output_codec.name, addcolor=self.color)
 
-    def myformat_colname(self, name, nametype):
-        return self.myformat_value(name, nametype, colormap=COLUMN_NAME_COLORS)
-
-    # cql/cursor.py:Cursor.decode_row() function, modified to not turn '' into None.
-    def decode_row(self, cursor, row):
-        values = []
-        bytevals = cursor.columnvalues(row)
-        for val, vtype, nameinfo in zip(bytevals, cursor.column_types, cursor.name_info):
-            if val == '':
-                values.append(val)
-            else:
-                values.append(cursor.decoder.decode_value(val, vtype, nameinfo[0]))
-        return values
+    def myformat_colname(self, name, table_meta=None):
+        column_colors = COLUMN_NAME_COLORS.copy()
+        # check column role and color appropriately
+        if table_meta:
+            if name in [col.name for col in table_meta.partition_key]:
+                column_colors.default_factory = lambda : RED
+            elif name in [col.name for col in table_meta.clustering_key]:
+                column_colors.default_factory = lambda : CYAN
+        return self.myformat_value(name, colormap=column_colors)
 
     def report_connection(self):
         self.show_host()
@@ -578,131 +631,107 @@
         # system.Versions['cql'] apparently does not reflect changes with
         # set_cql_version.
         vers['cql'] = self.cql_version
-        print "[cqlsh %(shver)s | Cassandra %(build)s | CQL spec %(cql)s | Thrift protocol %(thrift)s]" % vers
+        print "[cqlsh %(shver)s | Cassandra %(build)s | CQL spec %(cql)s | Native protocol v%(protocol)s]" % vers
 
     def show_session(self, sessionid):
-        print_trace_session(self, self.cursor, sessionid)
+        print_trace_session(self, self.session, sessionid)
 
     def get_connection_versions(self):
-        self.cursor.execute("select * from system.local where key = 'local'")
-        result = self.fetchdict()
+        result, = self.session.execute("select * from system.local where key = 'local'")
         vers = {
             'build': result['release_version'],
-            'thrift': result['thrift_version'],
+            'protocol': result['native_protocol_version'],
             'cql': result['cql_version'],
         }
         self.connection_versions = vers
-        self.cass_ver_tuple = tuple(map(int, vers['build'].split('-', 1)[0].split('.')[:3]))
-
-    def fetchdict(self):
-        row = self.cursor.fetchone()
-        if row is None:
-            return None
-        desc = self.cursor.description
-        return dict(zip([d[0] for d in desc], row))
-
-    def fetchdict_all(self):
-        dicts = []
-        for row in self.cursor:
-            desc = self.cursor.description
-            dicts.append(dict(zip([d[0] for d in desc], row)))
-        return dicts
 
     def get_keyspace_names(self):
-        return [k.name for k in self.get_keyspaces()]
+        return map(str, self.conn.metadata.keyspaces.keys())
 
     def get_columnfamily_names(self, ksname=None):
         if ksname is None:
             ksname = self.current_keyspace
-        cf_q = """select columnfamily_name from system.schema_columnfamilies
-                   where keyspace_name=:ks"""
-        self.cursor.execute(cf_q,
-                            {'ks': self.cql_unprotect_name(ksname)},
-                            consistency_level='ONE')
-        return [str(row[0]) for row in self.cursor.fetchall()]
+
+        return map(str, self.get_keyspace_meta(ksname).tables.keys())
 
     def get_index_names(self, ksname=None):
         idxnames = []
         for cfname in self.get_columnfamily_names(ksname=ksname):
-            for col in self.get_columnfamily_layout(ksname, cfname).columns:
-                if col.index_name is not None:
-                    idxnames.append(col.index_name)
+            for col in self.get_table_meta(ksname, cfname).columns.values():
+                if col.index:
+                    idxnames.append(col.index.name)
         return idxnames
 
     def get_column_names(self, ksname, cfname):
         if ksname is None:
             ksname = self.current_keyspace
-        layout = self.get_columnfamily_layout(ksname, cfname)
-        return [col.name for col in layout.columns]
+        layout = self.get_table_meta(ksname, cfname)
+        return [str(col) for col in layout.columns]
 
-    # ===== thrift-dependent parts =====
+    def get_usertype_names(self, ksname=None):
+        if ksname is None:
+            ksname = self.current_keyspace
+
+        return self.get_keyspace_meta(ksname).user_types.keys()
+
+    def get_usertype_layout(self, ksname, typename):
+        if ksname is None:
+            ksname = self.current_keyspace
+
+        ks_meta = self.get_keyspace_meta(ksname)
+
+        try:
+            user_type = ks_meta.user_types[typename]
+        except KeyError:
+            raise UserTypeNotFound("User type %r not found" % typename)
+
+        return [(field_name, field_type.cql_parameterized_type())
+                for field_name, field_type in zip(user_type.field_names, user_type.field_types)]
 
     def get_cluster_name(self):
-        return self.make_hacktastic_thrift_call('describe_cluster_name')
+        return self.conn.metadata.cluster_name
 
     def get_partitioner(self):
-        return self.make_hacktastic_thrift_call('describe_partitioner')
+        return self.conn.metadata.partitioner
 
-    def get_snitch(self):
-        return self.make_hacktastic_thrift_call('describe_snitch')
+    def get_keyspace_meta(self, ksname):
+        if not ksname in self.conn.metadata.keyspaces:
+            raise KeyspaceNotFound('Keyspace %r not found.' % ksname)
+        return self.conn.metadata.keyspaces[ksname]
 
-    def get_thrift_version(self):
-        return self.make_hacktastic_thrift_call('describe_version')
+    def get_keyspaces(self):
+        return self.conn.metadata.keyspaces.values()
 
     def get_ring(self):
         if self.current_keyspace is None or self.current_keyspace == 'system':
             raise NoKeyspaceError("Ring view requires a current non-system keyspace")
-        return self.make_hacktastic_thrift_call('describe_ring', self.current_keyspace)
+        self.conn.metadata.token_map.rebuild_keyspace(self.current_keyspace, build_if_absent=True)
+        return self.conn.metadata.token_map.tokens_to_hosts_by_ks[self.current_keyspace]
 
-    def get_keyspace(self, ksname):
-        try:
-            return self.make_hacktastic_thrift_call('describe_keyspace', ksname)
-        except cql.cassandra.ttypes.NotFoundException:
-            raise KeyspaceNotFound('Keyspace %r not found.' % ksname)
-
-    def get_keyspaces(self):
-        return self.make_hacktastic_thrift_call('describe_keyspaces')
-
-    def get_schema_versions(self):
-        return self.make_hacktastic_thrift_call('describe_schema_versions')
-
-    def set_cql_version(self, ver):
-        try:
-            return self.make_hacktastic_thrift_call('set_cql_version', ver)
-        except cql.cassandra.ttypes.InvalidRequestException, e:
-            raise VersionNotSupported(e.why)
-
-    def trace_next_query(self):
-        return self.make_hacktastic_thrift_call('trace_next_query')
-
-    def make_hacktastic_thrift_call(self, call, *args):
-        client = self.conn.client
-        return getattr(client, call)(*args)
-
-    # ===== end thrift-dependent parts =====
-
-    # ===== cql3-dependent parts =====
-
-    def get_columnfamily_layout(self, ksname, cfname):
+    def get_table_meta(self, ksname, tablename):
         if ksname is None:
             ksname = self.current_keyspace
-        cf_q = """select * from system.schema_columnfamilies
-                   where keyspace_name=:ks and columnfamily_name=:cf"""
-        col_q = """select * from system.schema_columns
-                    where keyspace_name=:ks and columnfamily_name=:cf"""
-        self.cursor.execute(cf_q,
-                            {'ks': ksname, 'cf': cfname},
-                            consistency_level='ONE')
-        layout = self.fetchdict()
-        if layout is None:
-            raise ColumnFamilyNotFound("Column family %r not found" % cfname)
-        self.cursor.execute(col_q,
-                            {'ks': ksname, 'cf': cfname},
-                            consistency_level='ONE')
-        cols = self.fetchdict_all()
-        return cql3handling.CqlTableDef.from_layout(layout, cols)
+        ksmeta = self.get_keyspace_meta(ksname)
 
-    # ===== end cql3-dependent parts =====
+        if tablename not in ksmeta.tables:
+            raise ColumnFamilyNotFound("Column family %r not found" % tablename)
+
+        return ksmeta.tables[tablename]
+
+    def get_usertypes_meta(self):
+        data = self.session.execute("select * from system.schema_usertypes")
+        if not data:
+            return cql3handling.UserTypesMeta({})
+
+        return cql3handling.UserTypesMeta.from_layout(data)
+
+    def get_trigger_names(self, ksname=None):
+        if ksname is None:
+            ksname = self.current_keyspace
+
+        return [trigger.name
+                for table in self.get_keyspace_meta(ksname).tables.values()
+                for trigger in table.triggers.values()]
 
     def reset_statement(self):
         self.reset_prompt()
@@ -711,9 +740,9 @@
 
     def reset_prompt(self):
         if self.current_keyspace is None:
-            self.set_prompt(self.default_prompt)
+            self.set_prompt(self.default_prompt, True)
         else:
-            self.set_prompt(self.keyspace_prompt % self.current_keyspace)
+            self.set_prompt(self.keyspace_prompt % self.current_keyspace, True)
 
     def set_continue_prompt(self):
         if self.empty_lines >=3:
@@ -734,6 +763,8 @@
             try:
                 import readline
             except ImportError:
+                if platform.system() == 'Windows':
+                    print "WARNING: pyreadline dependency missing.  Install to enable tab completion."
                 pass
             else:
                 old_completer = readline.get_completer()
@@ -773,6 +804,7 @@
                 return
             yield newline
 
+
     def cmdloop(self):
         """
         Adapted from cmd.Cmd's version, because there is literally no way with
@@ -792,7 +824,7 @@
                         self.reset_statement()
                 except EOFError:
                     self.handle_eof()
-                except cql.Error, cqlerr:
+                except CQL_ERRORS, cqlerr:
                     self.printerr(str(cqlerr))
                 except KeyboardInterrupt:
                     self.reset_statement()
@@ -883,105 +915,91 @@
 
     def do_use(self, parsed):
         ksname = parsed.get_binding('ksname')
-        if self.perform_statement_untraced(parsed.extract_orig()):
+        if self.perform_simple_statement(SimpleStatement(parsed.extract_orig())):
             if ksname[0] == '"' and ksname[-1] == '"':
                 self.current_keyspace = self.cql_unprotect_name(ksname)
             else:
                 self.current_keyspace = ksname.lower()
 
     def do_select(self, parsed):
-        ksname = parsed.get_binding('ksname')
-        if ksname is not None:
-            ksname = self.cql_unprotect_name(ksname)
-        cfname = self.cql_unprotect_name(parsed.get_binding('cfname'))
         tracing_was_enabled = self.tracing_enabled
+        ksname = parsed.get_binding('ksname')
         stop_tracing = ksname == 'system_traces' or (ksname is None and self.current_keyspace == 'system_traces')
         self.tracing_enabled = self.tracing_enabled and not stop_tracing
         statement = parsed.extract_orig()
-        with_default_limit = parsed.get_binding('limit') is None
-        if with_default_limit:
-            statement = "%s LIMIT %d;" % (statement[:-1], DEFAULT_SELECT_LIMIT)
-        self.perform_statement(statement,
-                               decoder=ErrorHandlingSchemaDecoder,
-                               with_default_limit=with_default_limit)
+        self.perform_statement(statement)
         self.tracing_enabled = tracing_was_enabled
 
-    def perform_statement(self, statement, decoder=None, with_default_limit=False):
+    def perform_statement(self, statement):
+        stmt = SimpleStatement(statement, consistency_level=self.consistency_level, fetch_size=self.default_page_size if self.use_paging else None)
+        result = self.perform_simple_statement(stmt)
         if self.tracing_enabled:
-            session_id = UUID(bytes=self.trace_next_query())
-            result = self.perform_statement_untraced(statement,
-                                                     decoder=decoder,
-                                                     with_default_limit=with_default_limit)
-            print_trace_session(self, self.cursor, session_id)
-            return result
-        else:
-            return self.perform_statement_untraced(statement,
-                                                   decoder=decoder,
-                                                   with_default_limit=with_default_limit)
+            if stmt.trace:
+                print_trace(self, stmt.trace)
+            else:
+                msg = "Statement trace did not complete within %d seconds" % (self.session.max_trace_wait)
+                self.writeresult(msg, color=RED)
 
-    def perform_statement_untraced(self, statement, decoder=None, with_default_limit=False):
+        return result
+
+    def parse_for_table_meta(self, query_string):
+        parsed = cqlruleset.cql_parse(query_string)[1]
+        ks =  self.cql_unprotect_name(parsed.get_binding('ksname', None))
+        cf = self.cql_unprotect_name(parsed.get_binding('cfname'))
+        return self.get_table_meta(ks, cf)
+
+    def perform_simple_statement(self, statement):
         if not statement:
             return False
-        trynum = 1
+        rows = None
         while True:
             try:
-                self.cursor.execute(statement, decoder=decoder)
+                rows = self.session.execute(statement, trace=self.tracing_enabled)
                 break
-            except cql.IntegrityError, err:
-                self.printerr("Attempt #%d: %s" % (trynum, str(err)))
-                trynum += 1
-                if trynum > self.num_retries:
-                    return False
-                time.sleep(1*trynum)
-            except cql.ProgrammingError, err:
-                self.printerr(str(err))
-                return False
             except CQL_ERRORS, err:
                 self.printerr(str(err))
                 return False
-            except KeyboardInterrupt:
-                self.cursor.close()
-                self.conn.terminate_connection()
-                transport = self.transport_factory(self.hostname, self.port,
-                                                   os.environ, CONFIG_FILE)
-                self.conn = cql.connect(self.hostname, self.port, keyspace=self.keyspace,
-                                        user=self.username, password=self.password,
-                                        cql_version=self.cql_version, transport=transport)
-                self.cursor = self.conn.cursor()
-                return False                
             except Exception, err:
                 import traceback
                 self.printerr(traceback.format_exc())
                 return False
 
-        if statement[:6].lower() == 'select' or statement.lower().startswith("list"):
-            self.print_result(self.cursor, with_default_limit)
-        elif self.cursor.rowcount > 0:
+        if statement.query_string[:6].lower() == 'select' or statement.query_string.lower().startswith("list"):
+            self.print_result(rows, self.parse_for_table_meta(statement.query_string))
+        elif rows:
             # CAS INSERT/UPDATE
             self.writeresult("")
-            self.print_static_result(self.cursor)
+            self.print_static_result(rows, self.parse_for_table_meta(statement.query_string))
         self.flush_output()
         return True
 
-    def get_nametype(self, cursor, num):
-        """
-        Determine the Cassandra type of a column name from the current row of
-        query results on the given cursor. The column in question is given by
-        its zero-based ordinal number within the row.
-
-        This is necessary to differentiate some things like ascii vs. blob hex.
-        """
-
-        return cursor.name_info[num][1]
-
-    def print_result(self, cursor, with_default_limit):
+    def print_result(self, rows, table_meta):
         self.decoding_errors = []
 
         self.writeresult("")
-        if cursor.rowcount != 0:
-            self.print_static_result(cursor)
-        self.writeresult("(%d rows)" % cursor.rowcount)
-        self.writeresult("")
+        if isinstance(rows, PagedResult) and self.tty:
+            num_rows = 0
+            while True:
+                page = list(rows.current_response)
+                if not page:
+                    break
+                num_rows += len(page)
+                self.print_static_result(page, table_meta)
+                if not rows.response_future.has_more_pages:
+                    break
+                raw_input("---MORE---")
+
+                rows.response_future.start_fetching_next_page()
+                result = rows.response_future.result()
+                if rows.response_future.has_more_pages:
+                    rows.current_response = result.current_response
+                else:
+                    rows.current_response = iter(result)
+        else:
+            rows = list(rows or [])
+            num_rows = len(rows)
+            self.print_static_result(rows, table_meta)
+        self.writeresult("(%d rows)" % num_rows)
 
         if self.decoding_errors:
             for err in self.decoding_errors[:2]:
@@ -990,25 +1008,18 @@
                 self.writeresult('%d more decoding errors suppressed.'
                                  % (len(self.decoding_errors) - 2), color=RED)
 
-        if with_default_limit:
-            if (self.is_count_result(cursor) and self.get_count(cursor) == DEFAULT_SELECT_LIMIT) \
-                    or cursor.rowcount == DEFAULT_SELECT_LIMIT:
-                self.writeresult("Default LIMIT of %d was used. "
-                                 "Specify your own LIMIT clause to get more results."
-                                 % DEFAULT_SELECT_LIMIT, color=RED)
-                self.writeresult("")
+    def print_static_result(self, rows, table_meta):
+        if not rows:
+            # print header only
+            colnames = table_meta.columns.keys()  # full header
+            formatted_names = [self.myformat_colname(name, table_meta) for name in colnames]
+            self.print_formatted_result(formatted_names, None)
+            return
 
-    def is_count_result(self, cursor):
-        return cursor.description == [(u'count', 'LongType', None, None, None, None, True)]
+        colnames = rows[0].keys()
+        formatted_names = [self.myformat_colname(name, table_meta) for name in colnames]
+        formatted_values = [map(self.myformat_value, row.values()) for row in rows]
 
-    def get_count(self, cursor):
-        return lookup_casstype('LongType').deserialize(cursor.result[0][0].value)
-
-    def print_static_result(self, cursor):
-        colnames = [d[0] for d in cursor.description]
-        colnames_t = [(name, self.get_nametype(cursor, n)) for (n, name) in enumerate(colnames)]
-        formatted_names = [self.myformat_colname(name, nametype) for (name, nametype) in colnames_t]
-        formatted_values = [map(self.myformat_value, self.decode_row(cursor, row), cursor.column_types) for row in cursor.result]
         if self.expand_enabled:
             self.print_formatted_result_vertically(formatted_names, formatted_values)
         else:
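
A minimal standalone sketch of the paging flow above, assuming the DataStax python-driver (cassandra-driver 2.x) Session API, a hypothetical table ks.tbl, and that the result comes back as a PagedResult (i.e. more rows than fetch_size); cqlsh itself routes each page through print_static_result instead of printing raw rows:

    # Sketch only: drain query results one page at a time, pausing between pages.
    from cassandra.cluster import Cluster
    from cassandra.query import SimpleStatement

    session = Cluster(['127.0.0.1']).connect()
    rows = session.execute(SimpleStatement('SELECT k, v FROM ks.tbl', fetch_size=100))

    num_rows = 0
    while True:
        page = list(rows.current_response)          # rows buffered for this page
        if not page:
            break
        num_rows += len(page)
        for row in page:
            print row                               # cqlsh formats these instead
        if not rows.response_future.has_more_pages:
            break
        raw_input("---MORE---")                     # wait for the user between pages
        rows.response_future.start_fetching_next_page()
        result = rows.response_future.result()
        if rows.response_future.has_more_pages:
            rows.current_response = result.current_response
        else:
            rows.current_response = iter(result)
    print "(%d rows)" % num_rows
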
@@ -1017,15 +1028,21 @@
     def print_formatted_result(self, formatted_names, formatted_values):
         # determine column widths
         widths = [n.displaywidth for n in formatted_names]
-        for fmtrow in formatted_values:
-            for num, col in enumerate(fmtrow):
-                widths[num] = max(widths[num], col.displaywidth)
+        if formatted_values is not None:
+            for fmtrow in formatted_values:
+                for num, col in enumerate(fmtrow):
+                    widths[num] = max(widths[num], col.displaywidth)
 
         # print header
         header = ' | '.join(hdr.ljust(w, color=self.color) for (hdr, w) in zip(formatted_names, widths))
         self.writeresult(' ' + header.rstrip())
         self.writeresult('-%s-' % '-+-'.join('-' * w for w in widths))
 
+        # stop if there are no rows
+        if formatted_values is None:
+            self.writeresult("")
+            return
+
         # print row data
         for row in formatted_values:
             line = ' | '.join(col.rjust(w, color=self.color) for (col, w) in zip(row, widths))
@@ -1081,20 +1098,12 @@
         return cqlruleset.cql_complete(stuff_to_complete, text, cassandra_conn=self,
                                        debug=debug_completion, startsymbol='cqlshCommand')
 
-    def set_prompt(self, prompt):
+    def set_prompt(self, prompt, prepend_user=False):
+        if prepend_user and self.username:
+            self.prompt = "%s@%s" % (self.username, prompt)
+            return
         self.prompt = prompt
 
-    def cql_protect_name(self, name):
-        if isinstance(name, unicode):
-            name = name.encode('utf8')
-        return cqlruleset.maybe_escape_name(name)
-
-    def cql_protect_names(self, names):
-        return map(self.cql_protect_name, names)
-
-    def cql_protect_value(self, value):
-        return cqlruleset.escape_value(value)
-
     def cql_unprotect_name(self, namestr):
         if namestr is None:
             return
@@ -1105,25 +1114,8 @@
             return cqlruleset.dequote_value(valstr)
 
     def print_recreate_keyspace(self, ksdef, out):
-        stratclass = trim_if_present(ksdef.strategy_class, 'org.apache.cassandra.locator.')
-        ksname = self.cql_protect_name(ksdef.name)
-        out.write("CREATE KEYSPACE %s WITH replication = {\n" % ksname)
-        out.write("  'class': %s" % self.cql_protect_value(stratclass))
-        for opname, opval in ksdef.strategy_options.iteritems():
-            out.write(",\n  %s: %s" % (self.cql_protect_value(opname),
-                                       self.cql_protect_value(opval)))
-        out.write("\n}")
-        if not ksdef.durable_writes:
-            out.write(" AND durable_writes = 'false'")
-        out.write(';\n')
-
-        cfs = self.get_columnfamily_names(ksname)
-        if cfs:
-            out.write('\nUSE %s;\n' % ksname)
-            for cf in cfs:
-                out.write('\n')
-                # yes, cf might be looked up again. oh well.
-                self.print_recreate_columnfamily(ksdef.name, cf, out)
+        out.write(ksdef.export_as_string())
+        out.write("\n")
 
     def print_recreate_columnfamily(self, ksname, cfname, out):
         """
@@ -1132,128 +1124,17 @@
 
         Writes output to the given out stream.
         """
-        layout = self.get_columnfamily_layout(ksname, cfname)
-        cfname = self.cql_protect_name(layout.name)
-        out.write("CREATE TABLE %s (\n" % cfname)
-
-        for col in layout.columns:
-            colname = self.cql_protect_name(col.name)
-            coltype = col.cqltype
-
-            # Reversed types only matter for clustering order, not column definitions
-            if issubclass(coltype, ReversedType):
-                coltype = coltype.subtypes[0]
-
-            out.write("  %s %s" % (colname, coltype.cql_parameterized_type()))
-            if col.is_static():
-                out.write(" static")
-            out.write(",\n")
-
-        out.write("  PRIMARY KEY (")
-        partkeynames = self.cql_protect_names(layout.partition_key_columns)
-
-        # Changed to put parenthesis around one or more partition keys in CASSANDRA-7274
-        partkey = "(%s)" % ', '.join(partkeynames)
-
-        pk_parts = [partkey] + self.cql_protect_names(layout.clustering_key_columns)
-        out.write(', '.join(pk_parts) + ')')
-
-        out.write("\n)")
-        joiner = 'WITH'
-
-        if layout.is_compact_storage():
-            out.write(' WITH COMPACT STORAGE')
-            joiner = 'AND'
-
-        # check if we need a CLUSTERING ORDER BY clause
-        if layout.clustering_key_columns:
-            # get a list of clustering component types
-            if issubclass(layout.comparator, CompositeType):
-                clustering_types = layout.comparator.subtypes
-            else:
-                clustering_types = [layout.comparator]
-
-            # only write CLUSTERING ORDER clause of we have >= 1 DESC item
-            if any(issubclass(t, ReversedType) for t in clustering_types):
-                if layout.is_compact_storage():
-                    out.write(' AND\n ')
-                else:
-                    out.write(' WITH')
-                out.write(' CLUSTERING ORDER BY (')
-
-                clustering_names = self.cql_protect_names(layout.clustering_key_columns)
-
-                inner = []
-                for colname, coltype in zip(clustering_names, clustering_types):
-                    ordering = "DESC" if issubclass(coltype, ReversedType) else "ASC"
-                    inner.append("%s %s" % (colname, ordering))
-                out.write(", ".join(inner))
-
-                out.write(")")
-                joiner = "AND"
-
-        cf_opts = []
-        compaction_strategy = trim_if_present(getattr(layout, 'compaction_strategy_class'),
-                                              'org.apache.cassandra.db.compaction.')
-        for cql3option, layoutoption in cqlruleset.columnfamily_layout_options:
-            if layoutoption is None:
-                layoutoption = cql3option
-            optval = getattr(layout, layoutoption, None)
-            if optval is None:
-                if layoutoption == 'bloom_filter_fp_chance':
-                    if compaction_strategy == 'LeveledCompactionStrategy':
-                        optval = 0.1
-                    else:
-                        optval = 0.01
-                else:
-                    continue
-            elif layoutoption == 'compaction_strategy_class':
-                optval = compaction_strategy
-            cf_opts.append((cql3option, self.cql_protect_value(optval)))
-        for cql3option, layoutoption, _ in cqlruleset.columnfamily_layout_map_options:
-            if layoutoption is None:
-                layoutoption = cql3option
-            optmap = getattr(layout, layoutoption, {})
-            if layoutoption == 'compression_parameters':
-                compclass = optmap.get('sstable_compression')
-                if compclass is not None:
-                    optmap['sstable_compression'] = \
-                            trim_if_present(compclass, 'org.apache.cassandra.io.compress.')
-            if layoutoption == 'compaction_strategy_options':
-                optmap['class'] = compaction_strategy
-
-            cf_opts.append((cql3option, optmap))
-
-        if cf_opts:
-            for optname, optval in cf_opts:
-                if isinstance(optval, dict):
-                    optval = '{%s}' % ', '.join(['%s: %s' % (self.cql_protect_value(k),
-                                                             self.cql_protect_value(v))
-                                                 for (k, v) in optval.items()])
-                out.write(" %s\n  %s=%s" % (joiner, optname, optval))
-                joiner = 'AND'
-        out.write(";\n")
-
-        for col in [ c for c in layout.columns if c.index_name is not None ]:
-            out.write('\n')
-            if col.index_type != 'CUSTOM':
-                out.write('CREATE INDEX %s ON %s (%s);\n'
-                             % (col.index_name, cfname, self.cql_protect_name(col.name)))
-            else:
-                out.write("CREATE CUSTOM INDEX %s ON %s (%s) USING '%s';\n"
-                             % (col.index_name,
-                                cfname,
-                                self.cql_protect_name(col.name),
-                                col.index_options[u'class_name']))
+        out.write(self.get_table_meta(ksname, cfname).export_as_string())
+        out.write("\n")
 
     def describe_keyspaces(self):
         print
-        cmd.Cmd.columnize(self, self.get_keyspace_names())
+        cmd.Cmd.columnize(self, protect_names(self.get_keyspace_names()))
         print
 
     def describe_keyspace(self, ksname):
         print
-        self.print_recreate_keyspace(self.get_keyspace(ksname), sys.stdout)
+        self.print_recreate_keyspace(self.get_keyspace_meta(ksname), sys.stdout)
         print
 
     def describe_columnfamily(self, ksname, cfname):
@@ -1267,27 +1148,54 @@
         print
         if ksname is None:
             for k in self.get_keyspaces():
-                name = self.cql_protect_name(k.name)
+                name = protect_name(k.name)
                 print 'Keyspace %s' % (name,)
                 print '---------%s' % ('-' * len(name))
-                cmd.Cmd.columnize(self, self.get_columnfamily_names(k.name))
+                cmd.Cmd.columnize(self, protect_names(self.get_columnfamily_names(k.name)))
                 print
         else:
-            cmd.Cmd.columnize(self, self.get_columnfamily_names(ksname))
+            cmd.Cmd.columnize(self, protect_names(self.get_columnfamily_names(ksname)))
             print
 
+    def describe_usertypes(self, ksname):
+        print
+        if ksname is None:
+            for ksmeta in self.get_keyspaces():
+                name = protect_name(ksmeta.name)
+                print 'Keyspace %s' % (name,)
+                print '---------%s' % ('-' * len(name))
+                cmd.Cmd.columnize(self, protect_names(ksmeta.user_types.keys()))
+                print
+        else:
+            ksmeta = self.get_keyspace_meta(ksname)
+            cmd.Cmd.columnize(self, protect_names(ksmeta.user_types.keys()))
+            print
+
+    def describe_usertype(self, ksname, typename):
+        if ksname is None:
+            ksname = self.current_keyspace
+        print
+        ksmeta = self.get_keyspace_meta(ksname)
+        try:
+            usertype = ksmeta.user_types[typename]
+        except KeyError:
+            raise UserTypeNotFound("User type %r not found" % typename)
+        print usertype.as_cql_query(formatted=True)
+        print
+
     def describe_cluster(self):
         print '\nCluster: %s' % self.get_cluster_name()
         p = trim_if_present(self.get_partitioner(), 'org.apache.cassandra.dht.')
-        print 'Partitioner: %s' % p
-        snitch = trim_if_present(self.get_snitch(), 'org.apache.cassandra.locator.')
-        print 'Snitch: %s\n' % snitch
+        print 'Partitioner: %s\n' % p
+        # TODO: snitch?
+        #snitch = trim_if_present(self.get_snitch(), 'org.apache.cassandra.locator.')
+        #print 'Snitch: %s\n' % snitch
         if self.current_keyspace is not None \
         and self.current_keyspace != 'system':
             print "Range ownership:"
             ring = self.get_ring()
-            for entry in ring:
-                print ' %39s  [%s]' % (entry.start_token, ', '.join(entry.endpoints))
+            for entry in ring.items():
+                print ' %39s  [%s]' % (str(entry[0].value), ', '.join([host.address for host in entry[1]]))
             print
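
The ring printed above is assumed to be a plain dict mapping a token (a driver metadata object exposing .value) to the list of replica Host objects; the same loop with the tuple unpacked into named variables, for clarity (shell stands in for the Shell instance):

    # Sketch only: `ring` is assumed to map Token -> [Host, ...], as returned
    # by get_ring(); token.value and host.address come from driver metadata.
    ring = shell.get_ring()
    for token, replicas in ring.items():
        print ' %39s  [%s]' % (str(token.value),
                               ', '.join(host.address for host in replicas))
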
 
     def describe_schema(self, include_system=False):
@@ -1361,6 +1269,12 @@
             self.describe_columnfamily(ks, cf)
         elif what in ('columnfamilies', 'tables'):
             self.describe_columnfamilies(self.current_keyspace)
+        elif what == 'types':
+            self.describe_usertypes(self.current_keyspace)
+        elif what == 'type':
+            ks = self.cql_unprotect_name(parsed.get_binding('ksname', None))
+            ut = self.cql_unprotect_name(parsed.get_binding('utname'))
+            self.describe_usertype(ks, ut)
         elif what == 'cluster':
             self.describe_cluster()
         elif what == 'schema':
@@ -1463,61 +1377,70 @@
         try:
             if header:
                 linesource.next()
-            layout = self.get_columnfamily_layout(ks, cf)
-            rownum = -1
+            table_meta = self.get_table_meta(ks, cf)
             reader = csv.reader(linesource, **dialect_options)
-            for rownum, row in enumerate(reader):
-                if len(row) != len(columns):
-                    self.printerr("Record #%d (line %d) has the wrong number of fields "
-                                  "(%d instead of %d)."
-                                  % (rownum, reader.line_num, len(row), len(columns)))
-                    return rownum
-                if not self.do_import_row(columns, nullval, layout, row):
-                    self.printerr("Aborting import at record #%d (line %d). "
-                                  "Previously-inserted values still present."
-                                  % (rownum, reader.line_num))
-                    return rownum
+            from functools import partial
+            rownum, error = \
+                async_insert.insert_concurrent(self.session, enumerate(reader, start=1),
+                                               partial(
+                                                   self.create_insert_statement,
+                                                   columns, nullval,
+                                                   table_meta))
+            if error:
+                self.printerr(str(error[0]))
+                self.printerr("Aborting import at record #%d. "
+                              "Previously-inserted values still present."
+                               % error[1])
         finally:
             if do_close:
                 linesource.close()
             elif self.tty:
                 print
-        return rownum + 1
+        return rownum
 
-    def do_import_row(self, columns, nullval, layout, row):
+    def create_insert_statement(self, columns, nullval, table_meta, row):
+
+        if len(row) != len(columns):
+            raise ValueError(
+                "Record has the wrong number of fields (%d instead of %d)."
+                % (len(row), len(columns)))
+
         rowmap = {}
+        primary_key_columns = [col.name for col in table_meta.primary_key]
         for name, value in zip(columns, row):
-            type = layout.get_column(name).cqltype
-            if issubclass(type, ReversedType):
-                type = type.subtypes[0]
-            cqltype = type.cql_parameterized_type()
+            type = table_meta.columns[name].data_type
+            cqltype = table_meta.columns[name].typestring
 
             if value != nullval:
                 if cqltype in ('ascii', 'text', 'timestamp', 'inet'):
-                    rowmap[name] = self.cql_protect_value(value)
+                    rowmap[name] = protect_value(value)
                 else:
                     rowmap[name] = value
-            elif name in layout.clustering_key_columns and not type.empty_binary_ok:
-                rowmap[name] = 'blobAs%s(0x)' % cqltype.title()
+            elif name in primary_key_columns:
+                # By default, nullval is an empty string. See CASSANDRA-7792 for details.
+                message = "Cannot insert null value for primary key column '%s'." % (name,)
+                if nullval == '':
+                    message += " If you want to insert empty strings, consider using " \
+                               "the WITH NULL=<marker> option for COPY."
+                self.printerr(message)
+                return False
             else:
                 rowmap[name] = 'null'
-        return self.do_import_insert(layout, rowmap)
-
-    def do_import_insert(self, layout, rowmap):
         # would be nice to be able to use a prepared query here, but in order
         # to use that interface, we'd need to have all the input as native
         # values already, reading them from text just like the various
         # Cassandra cql types do. Better just to submit them all as intact
         # CQL string literals and let Cassandra do its thing.
         query = 'INSERT INTO %s.%s (%s) VALUES (%s)' % (
-            self.cql_protect_name(layout.keyspace_name),
-            self.cql_protect_name(layout.columnfamily_name),
-            ', '.join(self.cql_protect_names(rowmap.keys())),
+            protect_name(table_meta.keyspace.name),
+            protect_name(table_meta.name),
+            ', '.join(protect_names(rowmap.keys())),
             ', '.join(rowmap.values())
         )
         if self.debug:
             print 'Import using CQL: %s' % query
-        return self.perform_statement_untraced(query)
+        return SimpleStatement(query)
+
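
For illustration, a hedged sketch of driving the statement factory above by hand for a hypothetical ks.users(id int PRIMARY KEY, name text) table, without the concurrent import machinery; `shell` and `session` are assumed to be a connected Shell and its Session:

    # Sketch only: one INSERT per CSV row, using create_insert_statement above.
    import csv

    table_meta = shell.get_table_meta('ks', 'users')
    with open('users.csv', 'rb') as f:
        for row in csv.reader(f):
            # e.g. row == ['42', 'jane'] produces
            #   INSERT INTO ks.users (id, name) VALUES (42, 'jane')
            # ('name' is text, so its value is quoted via protect_value;
            #  'id' is numeric and passes through as-is)
            stmt = shell.create_insert_statement(['id', 'name'], '', table_meta, row)
            if stmt is False:          # an empty primary key value was rejected
                break
            session.execute(stmt)
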
 
     def perform_csv_export(self, ks, cf, columns, fname, opts):
         dialect_options = self.csv_dialect_defaults.copy()
@@ -1549,38 +1472,32 @@
             except IOError, e:
                 self.printerr("Can't open %r for writing: %s" % (fname, e))
                 return 0
+        wmeter = meter.Meter()
         try:
-            self.prep_export_dump(ks, cf, columns)
+
+            dump = self.prep_export_dump(ks, cf, columns)
             writer = csv.writer(csvdest, **dialect_options)
             if header:
-                writer.writerow([d[0] for d in self.cursor.description])
-            rows = 0
-            while True:
-                row = self.cursor.fetchone()
-                if row is None:
-                    break
-                fmt = lambda v, t: \
-                    format_value(v, t, output_encoding=encoding, nullval=nullval,
+                writer.writerow(columns)
+            for row in dump:
+                fmt = lambda v: \
+                    format_value(v, output_encoding=encoding, nullval=nullval,
                                  time_format=self.display_time_format,
                                  float_precision=self.display_float_precision).strval
-                writer.writerow(map(fmt, row, self.cursor.column_types))
-                rows += 1
+                writer.writerow(map(fmt, row.values()))
+                wmeter.mark_written()
+            wmeter.done()
         finally:
             if do_close:
                 csvdest.close()
-        return rows
+        return wmeter.num_finished()
 
     def prep_export_dump(self, ks, cf, columns):
         if columns is None:
             columns = self.get_column_names(ks, cf)
-        columnlist = ', '.join(self.cql_protect_names(columns))
-        # this limit is pretty awful. would be better to use row-key-paging, so
-        # that the dump could be pretty easily aborted if necessary, but that
-        # can be kind of tricky with cql3. Punt for now, until the real cursor
-        # API is added in CASSANDRA-4415.
-        query = 'SELECT %s FROM %s.%s LIMIT 99999999' \
-                % (columnlist, self.cql_protect_name(ks), self.cql_protect_name(cf))
-        self.cursor.execute(query)
+        columnlist = ', '.join(protect_names(columns))
+        query = 'SELECT %s FROM %s.%s' % (columnlist, protect_name(ks), protect_name(cf))
+        return self.session.execute(query)
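
A rough sketch of the export path, assuming a connected Session whose row_factory returns dict-like rows (as cqlsh configures it), the format_value helper used above (its cqlshlib location is assumed), and a hypothetical ks.users table; the real code adds dialect options, a progress meter and error handling:

    # Sketch only: stream a SELECT straight into a CSV file.
    import csv
    from cqlshlib.formatting import format_value   # helper used above; import path assumed

    rows = session.execute('SELECT id, name FROM ks.users')   # driver pages as needed
    with open('users.csv', 'wb') as csvdest:
        writer = csv.writer(csvdest)
        writer.writerow(['id', 'name'])                        # optional header row
        for row in rows:
            writer.writerow([format_value(v, output_encoding='utf8', nullval='null',
                                          time_format='%Y-%m-%d %H:%M:%S%z',
                                          float_precision=5).strval
                             for v in row.values()])
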
 
     def do_show(self, parsed):
         """
@@ -1611,7 +1528,7 @@
             self.show_host()
         elif showwhat.startswith('session'):
             session_id = parsed.get_binding('sessionid').lower()
-            self.show_session(session_id)
+            self.show_session(UUID(session_id))
         else:
             self.printerr('Wait, how do I show %r?' % (showwhat,))
 
@@ -1642,11 +1559,12 @@
         except IOError, e:
             self.printerr('Could not open %r: %s' % (fname, e))
             return
-        subshell = Shell(self.hostname, self.port, self.transport_factory,
+        subshell = Shell(self.hostname, self.port,
                          color=self.color, encoding=self.encoding, stdin=f,
                          tty=False, use_conn=self.conn, cqlver=self.cql_version,
                          display_time_format=self.display_time_format,
-                         display_float_precision=self.display_float_precision)
+                         display_float_precision=self.display_float_precision,
+                         max_trace_wait=self.max_trace_wait)
         subshell.cmdloop()
         f.close()
 
@@ -1731,29 +1649,7 @@
 
           TRACING with no arguments shows the current tracing status.
         """
-        switch = parsed.get_binding('switch')
-        if switch is None:
-            if self.tracing_enabled:
-                print "Tracing is currently enabled. Use TRACING OFF to disable"
-            else:
-                print "Tracing is currently disabled. Use TRACING ON to enable."
-            return
-
-        if switch.upper() == 'ON':
-            if self.tracing_enabled:
-                self.printerr('Tracing is already enabled. '
-                              'Use TRACING OFF to disable.')
-                return
-            self.tracing_enabled = True
-            print 'Now tracing requests.'
-            return
-
-        if switch.upper() == 'OFF':
-            if not self.tracing_enabled:
-                self.printerr('Tracing is not enabled.')
-                return
-            self.tracing_enabled = False
-            print 'Disabled tracing.'
+        self.tracing_enabled = SwitchCommand("TRACING", "Tracing").execute(self.tracing_enabled, parsed, self.printerr)
 
     def do_expand(self, parsed):
         """
@@ -1773,29 +1669,7 @@
 
           EXPAND with no arguments shows the current value of expand setting.
         """
-        switch = parsed.get_binding('switch')
-        if switch is None:
-            if self.expand_enabled:
-                print "Expanded output is currently enabled. Use EXPAND OFF to disable"
-            else:
-                print "Expanded output is currently disabled. Use EXPAND ON to enable."
-            return
-
-        if switch.upper() == 'ON':
-            if self.expand_enabled:
-                self.printerr('Expanded output is already enabled. '
-                              'Use EXPAND OFF to disable.')
-                return
-            self.expand_enabled = True
-            print 'Now printing expanded output'
-            return
-
-        if switch.upper() == 'OFF':
-            if not self.expand_enabled:
-                self.printerr('Expanded output is not enabled.')
-                return
-            self.expand_enabled = False
-            print 'Disabled expanded output.'
+        self.expand_enabled = SwitchCommand("EXPAND", "Expanded output").execute(self.expand_enabled, parsed, self.printerr)
 
     def do_consistency(self, parsed):
         """
@@ -1817,10 +1691,10 @@
         """
         level = parsed.get_binding('level')
         if level is None:
-            print 'Current consistency level is %s.' % (self.cursor.consistency_level,)
+            print 'Current consistency level is %s.' % (self.consistency_level)
             return
 
-        self.cursor.consistency_level = level.upper()
+        self.consistency_level = cassandra.ConsistencyLevel.name_to_value[level.upper()]
         print 'Consistency level set to %s.' % (level.upper(),)
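
The level name is mapped straight onto the driver's constants; a tiny sketch, assuming the cassandra-driver ConsistencyLevel lookup tables:

    # Sketch only: translating between level names and driver constants.
    import cassandra

    level = 'quorum'
    cl = cassandra.ConsistencyLevel.name_to_value[level.upper()]
    print 'Consistency level set to %s.' % (level.upper(),)
    # the reverse mapping is handy when reporting the current setting:
    print cassandra.ConsistencyLevel.value_to_name[cl]    # QUORUM
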
 
     def do_exit(self, parsed=None):
@@ -1830,6 +1704,8 @@
         Exits cqlsh.
         """
         self.stop = True
+        if self.owns_connection:
+            self.conn.shutdown()
     do_quit = do_exit
 
     def do_debug(self, parsed):
@@ -1869,6 +1745,26 @@
             else:
                 self.printerr("*** No help on %s" % (t,))
 
+    def do_paging(self, parsed):
+        """
+        PAGING [cqlsh]
+
+          Enables or disables query paging.
+
+        PAGING ON
+
+          Enables query paging for all further queries.
+
+        PAGING OFF
+
+          Disables paging.
+
+        PAGING
+
+          PAGING with no arguments shows the current query paging status.
+        """
+        self.use_paging = SwitchCommand("PAGING", "Query paging").execute(self.use_paging, parsed, self.printerr)
+
     def applycolor(self, text, color=None):
         if not color or not self.color:
             return text
@@ -1890,12 +1786,41 @@
             text = '%s:%d:%s' % (self.stdin.name, self.lineno, text)
         self.writeresult(text, color, newline=newline, out=sys.stderr)
 
-class ErrorHandlingSchemaDecoder(cql.decoders.SchemaDecoder):
-    def name_decode_error(self, err, namebytes, expectedtype):
-        return DecodeError(namebytes, err, expectedtype)
 
-    def value_decode_error(self, err, namebytes, valuebytes, expectedtype):
-        return DecodeError(valuebytes, err, expectedtype, colname=namebytes)
+class SwitchCommand(object):
+    command = None
+    description = None
+
+    def __init__(self, command, desc):
+        self.command = command
+        self.description = desc
+
+    def execute(self, state, parsed, printerr):
+        switch = parsed.get_binding('switch')
+        if switch is None:
+            if state:
+                print "%s is currently enabled. Use %s OFF to disable" \
+                      % (self.description, self.command)
+            else:
+                print "%s is currently disabled. Use %s ON to enable." \
+                      % (self.description, self.command)
+            return state
+
+        if switch.upper() == 'ON':
+            if state:
+                printerr('%s is already enabled. Use %s OFF to disable.'
+                         % (self.description, self.command))
+                return state
+            print 'Now %s is enabled.' % (self.description,)
+            return True
+
+        if switch.upper() == 'OFF':
+            if not state:
+                printerr('%s is not enabled.' % (self.description,))
+                return state
+            print 'Disabled %s.' % (self.description,)
+            return False
+
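
SwitchCommand centralises the three ON/OFF/status cases that TRACING, EXPAND and PAGING previously duplicated; a small usage sketch with a stand-in for the parsed statement (the real one comes from the cqlsh grammar):

    # Sketch only: FakeParsed is a hypothetical stand-in; only get_binding matters.
    class FakeParsed(object):
        def __init__(self, switch):
            self.switch = switch
        def get_binding(self, name, default=None):
            return self.switch if name == 'switch' else default

    def printerr(msg):
        print msg

    state = False
    state = SwitchCommand("TRACING", "Tracing").execute(state, FakeParsed('ON'), printerr)
    assert state is True
    state = SwitchCommand("TRACING", "Tracing").execute(state, FakeParsed(None), printerr)   # reports only
    state = SwitchCommand("TRACING", "Tracing").execute(state, FakeParsed('OFF'), printerr)
    assert state is False
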
 
 def option_with_default(cparser_getter, section, option, default=None):
     try:
@@ -1930,21 +1855,6 @@
         pass
     return True
 
-def load_factory(name):
-    """
-    Attempts to load a transport factory function given its fully qualified
-    name, e.g. "cqlshlib.tfactory.regular_transport_factory"
-    """
-    parts = name.split('.')
-    module = ".".join(parts[:-1])
-    try:
-        t = __import__(module)
-        for part in parts[1:]:
-            t = getattr(t, part)
-        return t
-    except (ImportError, AttributeError):
-        sys.exit("Can't locate transport factory function %s" % name)
-
 def read_options(cmdlineargs, environment):
     configs = ConfigParser.SafeConfigParser()
     configs.read(CONFIG_FILE)
@@ -1953,8 +1863,6 @@
     optvalues.username = option_with_default(configs.get, 'authentication', 'username')
     optvalues.password = option_with_default(configs.get, 'authentication', 'password')
     optvalues.keyspace = option_with_default(configs.get, 'authentication', 'keyspace')
-    optvalues.transport_factory = option_with_default(configs.get, 'connection', 'factory',
-                                                      DEFAULT_TRANSPORT_FACTORY)
     optvalues.completekey = option_with_default(configs.get, 'ui', 'completekey',
                                                 DEFAULT_COMPLETEKEY)
     optvalues.color = option_with_default(configs.getboolean, 'ui', 'color')
@@ -1962,8 +1870,12 @@
                                                     DEFAULT_TIME_FORMAT)
     optvalues.float_precision = option_with_default(configs.getint, 'ui', 'float_precision',
                                                     DEFAULT_FLOAT_PRECISION)
+    optvalues.max_trace_wait = option_with_default(configs.getfloat, 'tracing', 'max_trace_wait',
+                                                   DEFAULT_MAX_TRACE_WAIT)
     optvalues.debug = False
     optvalues.file = None
+    optvalues.ssl = False
+
     optvalues.tty = sys.stdin.isatty()
     optvalues.cqlversion = option_with_default(configs.get, 'cql', 'version', DEFAULT_CQLVER)
     optvalues.execute = None
@@ -1972,6 +1884,11 @@
 
     hostname = option_with_default(configs.get, 'connection', 'hostname', DEFAULT_HOST)
     port = option_with_default(configs.get, 'connection', 'port', DEFAULT_PORT)
+    options.client_timeout = option_with_default(configs.get, 'connection', 'client_timeout', '10')
+    if options.client_timeout.lower() == 'none':
+        options.client_timeout = None
+    else:
+        options.client_timeout = int(options.client_timeout)
 
     hostname = environment.get('CQLSH_HOST', hostname)
     port = environment.get('CQLSH_PORT', port)
@@ -1987,8 +1904,6 @@
     if options.execute and not options.execute.endswith(';'):
         options.execute += ';'
 
-    options.transport_factory = load_factory(options.transport_factory)
-
     if optvalues.color in (True, False):
         options.color = optvalues.color
     else:
@@ -2054,14 +1969,11 @@
             sys.exit("Can't open %r: %s" % (options.file, e))
 
     if options.debug:
-        import thrift
-        sys.stderr.write("Using CQL driver: %s\n" % (cql,))
-        sys.stderr.write("Using thrift lib: %s\n" % (thrift,))
+        sys.stderr.write("Using CQL driver: %s\n" % (cassandra,))
 
     try:
         shell = Shell(hostname,
                       port,
-                      options.transport_factory,
                       color=options.color,
                       username=options.username,
                       password=options.password,
@@ -2072,7 +1984,10 @@
                       keyspace=options.keyspace,
                       display_time_format=options.time_format,
                       display_float_precision=options.float_precision,
-                      single_statement=options.execute)
+                      max_trace_wait=options.max_trace_wait,
+                      ssl=options.ssl,
+                      single_statement=options.execute,
+                      client_timeout=options.client_timeout)
     except KeyboardInterrupt:
         sys.exit('Connection aborted.')
     except CQL_ERRORS, e:
diff --git a/bin/cqlsh.bat b/bin/cqlsh.bat
index 68235b6..066b1d0 100644
--- a/bin/cqlsh.bat
+++ b/bin/cqlsh.bat
@@ -1,34 +1,34 @@
-@ECHO OFF
-@REM
-@REM Licensed to the Apache Software Foundation (ASF) under one or more
-@REM contributor license agreements. See the NOTICE file distributed with
-@REM this work for additional information regarding copyright ownership.
-@REM The ASF licenses this file to You under the Apache License, Version 2.0
-@REM (the "License"); you may not use this file except in compliance with
-@REM the License. You may obtain a copy of the License at
-@REM
-@REM http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM Unless required by applicable law or agreed to in writing, software
-@REM distributed under the License is distributed on an "AS IS" BASIS,
-@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM See the License for the specific language governing permissions and
-@REM limitations under the License.
-
-@echo off
-
-if "%OS%" == "Windows_NT" setlocal
-
-python -V >nul 2>&1
-if ERRORLEVEL 1 goto err
-
-python "%~dp0\cqlsh" %*
-goto finally
-
-:err
-echo Can't detect Python version!
-
-:finally
-
-ENDLOCAL
-
+@ECHO OFF
+@REM
+@REM Licensed to the Apache Software Foundation (ASF) under one or more
+@REM contributor license agreements. See the NOTICE file distributed with
+@REM this work for additional information regarding copyright ownership.
+@REM The ASF licenses this file to You under the Apache License, Version 2.0
+@REM (the "License"); you may not use this file except in compliance with
+@REM the License. You may obtain a copy of the License at
+@REM
+@REM http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM Unless required by applicable law or agreed to in writing, software
+@REM distributed under the License is distributed on an "AS IS" BASIS,
+@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM See the License for the specific language governing permissions and
+@REM limitations under the License.
+
+@echo off
+
+if "%OS%" == "Windows_NT" setlocal
+
+python -V >nul 2>&1
+if ERRORLEVEL 1 goto err
+
+python "%~dp0\cqlsh" %*
+goto finally
+
+:err
+echo Can't detect Python version!
+
+:finally
+
+ENDLOCAL
+
diff --git a/bin/debug-cql b/bin/debug-cql
index aa6630f..b4ebb82 100755
--- a/bin/debug-cql
+++ b/bin/debug-cql
@@ -57,7 +57,7 @@
 esac
 
 class="org.apache.cassandra.transport.Client"
-cassandra_parms="-Dlog4j.configuration=log4j-tools.properties"
+cassandra_parms="-Dlogback.configurationFile=logback-tools.xml"
 "$JAVA" $JVM_OPTS $cassandra_parms  -cp "$CLASSPATH" "$class" $1 $2
 
 exit $?
diff --git a/bin/debug-cql.bat b/bin/debug-cql.bat
index d936f0a..121e0c1 100644
--- a/bin/debug-cql.bat
+++ b/bin/debug-cql.bat
@@ -1,61 +1,39 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.transport.Client
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd "%~dp0"
+call cassandra.in.bat
+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.transport.Client
+if NOT DEFINED JAVA_HOME goto :err
+
+REM ***** JAVA options *****
+set JAVA_OPTS=^
+ -Dlogback.configurationFile=logback-tools.xml
+
+set TOOLS_PARAMS=
+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
+goto finally
+
+:err
+echo JAVA_HOME environment variable must be set!
+pause
+
+:finally
+
+ENDLOCAL
diff --git a/bin/nodetool b/bin/nodetool
index c5ee066..e81c59b 100755
--- a/bin/nodetool
+++ b/bin/nodetool
@@ -88,8 +88,9 @@
 
 "$JAVA" $JAVA_AGENT -cp "$CLASSPATH" \
       -Xmx128m \
-      -Dlog4j.configuration=log4j-tools.properties \
+      -Dcassandra.storagedir="$cassandra_storagedir" \
+      -Dlogback.configurationFile=logback-tools.xml \
       -Dstorage-config="$CASSANDRA_CONF" \
-      org.apache.cassandra.tools.NodeCmd -p $JMX_PORT $ARGS
+      org.apache.cassandra.tools.NodeTool -p $JMX_PORT $ARGS
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/nodetool.bat b/bin/nodetool.bat
index 2f74687..ec64db0 100644
--- a/bin/nodetool.bat
+++ b/bin/nodetool.bat
@@ -17,28 +17,14 @@
 @echo off
 if "%OS%" == "Windows_NT" setlocal
 
+pushd "%~dp0"
+call cassandra.in.bat
+
 if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
 if NOT DEFINED JAVA_HOME goto :err
 
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";"%CASSANDRA_HOME%\build\classes\thrift"
-goto runNodeTool
-
-:runNodeTool
 echo Starting NodeTool
-"%JAVA_HOME%\bin\java" -cp %CASSANDRA_CLASSPATH% -Dlog4j.configuration=log4j-tools.properties org.apache.cassandra.tools.NodeCmd %*
+"%JAVA_HOME%\bin\java" -cp %CASSANDRA_CLASSPATH% -Dlogback.configurationFile=logback-tools.xml org.apache.cassandra.tools.NodeTool %*
 goto finally
 
 :err
diff --git a/bin/source-conf.ps1 b/bin/source-conf.ps1
new file mode 100644
index 0000000..9828787
--- /dev/null
+++ b/bin/source-conf.ps1
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Function Find-Conf
+{
+    $file = "";
+    # Order of preference on grabbing environment settings:
+    #   1:  %CASSANDRA_INCLUDE%
+    #   2a: %USERPROFILE%/cassandra-env.ps1 (cmd-prompt)
+    #   2b: $HOME/cassandra-env.ps1 (cygwin)
+    #   3:  %CASSANDRA_HOME%/conf/cassandra-env.ps1
+    #   4:  Relative to current working directory (../conf)
+    if (Test-Path Env:\CASSANDRA_INCLUDE)
+    {
+        $file = "$env:CASSANDRA_INCLUDE"
+    }
+    elseif (Test-Path "$env:USERPROFILE/cassandra-env.ps1")
+    {
+        $file = "$env:USERPROFILE/cassandra-env.ps1"
+    }
+    elseif (Test-Path "$env:HOME/cassandra-env.ps1")
+    {
+        $file = "$env:HOME/cassandra-env.ps1"
+    }
+    elseif (Test-Path Env:\CASSANDRA_HOME)
+    {
+        $file = "$env:CASSANDRA_HOME/conf/cassandra-env.ps1"
+    }
+    else
+    {
+        $file = [System.IO.Directory]::GetCurrentDirectory() + "/../conf/cassandra-env.ps1"
+    }
+    $file = $file -replace "\\", "/"
+
+    if (Test-Path $file)
+    {
+        return $file
+    }
+    else
+    {
+        echo "Error with environment file resolution.  Path: [$file] not found."
+        exit
+    }
+}
diff --git a/bin/sstablekeys b/bin/sstablekeys
index 9033881..8308b88 100755
--- a/bin/sstablekeys
+++ b/bin/sstablekeys
@@ -49,7 +49,8 @@
 fi
 
 "$JAVA" $JAVA_AGENT -cp "$CLASSPATH" -Dstorage-config="$CASSANDRA_CONF" \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.SSTableExport "$@" -e
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstablekeys.bat b/bin/sstablekeys.bat
index 6a483ba..0d0cf95 100644
--- a/bin/sstablekeys.bat
+++ b/bin/sstablekeys.bat
@@ -17,38 +17,18 @@
 @echo off
 if "%OS%" == "Windows_NT" setlocal
 
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
+pushd "%~dp0"
+call cassandra.in.bat
+
 if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableExport
 if NOT DEFINED JAVA_HOME goto :err
 
 REM ***** JAVA options *****
 set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
+ -Dlogback.configurationFile=logback-tools.xml
 
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
 set TOOLS_PARAMS=
 
-goto runTool
-
-:runTool
 "%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %1 -e
 goto finally
 
diff --git a/bin/sstableloader b/bin/sstableloader
index 85335d2..2ee4f51 100755
--- a/bin/sstableloader
+++ b/bin/sstableloader
@@ -44,7 +44,8 @@
 fi
 
 "$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" -Xmx256M \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.BulkLoader "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstableloader.bat b/bin/sstableloader.bat
index 5678693..13293f0 100644
--- a/bin/sstableloader.bat
+++ b/bin/sstableloader.bat
@@ -1,61 +1,41 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.BulkLoader
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd "%~dp0"
+call cassandra.in.bat
+
+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.BulkLoader
+if NOT DEFINED JAVA_HOME goto :err
+
+REM ***** JAVA options *****
+set JAVA_OPTS=^
+ -Dlogback.configurationFile=logback-tools.xml
+
+set TOOLS_PARAMS=
+
+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
+goto finally
+
+:err
+echo JAVA_HOME environment variable must be set!
+pause
+
+:finally
+
+ENDLOCAL
diff --git a/bin/sstablescrub b/bin/sstablescrub
index 31ecf02..a5f9a67 100755
--- a/bin/sstablescrub
+++ b/bin/sstablescrub
@@ -48,7 +48,8 @@
 fi
 
 "$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" -Xmx$MAX_HEAP_SIZE \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.StandaloneScrubber "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstablescrub.bat b/bin/sstablescrub.bat
index 06f4c84..62c140b 100644
--- a/bin/sstablescrub.bat
+++ b/bin/sstablescrub.bat
@@ -1,61 +1,41 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneScrubber
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd "%~dp0"
+call cassandra.in.bat
+
+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneScrubber
+if NOT DEFINED JAVA_HOME goto :err
+
+REM ***** JAVA options *****
+set JAVA_OPTS=^
+ -Dlogback.configurationFile=logback-tools.xml
+
+set TOOLS_PARAMS=
+
+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
+goto finally
+
+:err
+echo JAVA_HOME environment variable must be set!
+pause
+
+:finally
+
+ENDLOCAL
diff --git a/bin/sstableupgrade b/bin/sstableupgrade
index 5421786..6248ac8 100755
--- a/bin/sstableupgrade
+++ b/bin/sstableupgrade
@@ -48,7 +48,8 @@
 fi
 
 $JAVA $JAVA_AGENT -ea -cp $CLASSPATH -Xmx$MAX_HEAP_SIZE \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.StandaloneUpgrader "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstableupgrade.bat b/bin/sstableupgrade.bat
index 8c7892d..4025ae8 100644
--- a/bin/sstableupgrade.bat
+++ b/bin/sstableupgrade.bat
@@ -1,61 +1,41 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneUpgrader
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd "%~dp0"
+call cassandra.in.bat
+
+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneUpgrader
+if NOT DEFINED JAVA_HOME goto :err
+
+REM ***** JAVA options *****
+set JAVA_OPTS=^
+ -Dlogback.configurationFile=logback-tools.xml
+
+set TOOLS_PARAMS=
+
+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
+goto finally
+
+:err
+echo JAVA_HOME environment variable must be set!
+pause
+
+:finally
+
+ENDLOCAL
diff --git a/bin/stop-server.bat b/bin/stop-server.bat
new file mode 100644
index 0000000..66a55fd
--- /dev/null
+++ b/bin/stop-server.bat
@@ -0,0 +1,55 @@
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd %~dp0..
+if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%
+popd
+
+REM -----------------------------------------------------------------------------
+REM See if we have the capabilities of running the powershell scripts
+for /F "delims=" %%i in ('powershell Get-ExecutionPolicy') do set PERMISSION=%%i
+if "%PERMISSION%" == "Unrestricted" goto runPowerShell
+goto runLegacy
+
+REM -----------------------------------------------------------------------------
+:runPowerShell
+REM Need to generate a random title for this command-prompt to determine its pid.
+REM We detach and re-attach the console in stop-server.ps1 to send ctrl+c to the
+REM running cassandra process and need to re-attach here to print results.
+set /A rand=%random% %% (100000 - 1 + 1) + 1
+TITLE %rand%
+FOR /F "tokens=2 delims= " %%A IN ('TASKLIST /FI ^"WINDOWTITLE eq %rand%^" /NH') DO set PID=%%A
+
+REM Start with /B -> the control+c event we generate in stop-server.ps1 percolates
+REM up and hits this external batch file if we call powershell directly.
+start /B powershell /file "%CASSANDRA_HOME%/bin/stop-server.ps1" -batchpid %PID% %*
+goto finally
+
+REM -----------------------------------------------------------------------------
+:runLegacy
+echo WARNING! Powershell script execution unavailable.
+echo    Please use 'powershell Set-ExecutionPolicy Unrestricted'
+echo    on this user-account to run cassandra with fully featured
+echo    functionality on this platform.
+
+echo Cannot stop server without powershell access.
+goto finally
+
+:finally
+ENDLOCAL
diff --git a/bin/stop-server.ps1 b/bin/stop-server.ps1
new file mode 100644
index 0000000..0d125dc
--- /dev/null
+++ b/bin/stop-server.ps1
@@ -0,0 +1,185 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+param (
+    [string]$p,
+    [string]$batchpid,
+    [switch]$f,
+    [switch]$silent,
+    [switch]$help
+)
+
+#-----------------------------------------------------------------------------
+Function ValidateArguments
+{
+    if (!$p)
+    {
+        PrintUsage
+    }
+    if ($help)
+    {
+        PrintUsage
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function PrintUsage
+{
+    echo @"
+
+usage: stop-server.ps1 -p pidfile -f[-help]
+    -p      pidfile tracked by server and removed on close.
+    -s      Silent.  Don't print success/failure data.
+    -f      force kill.
+"@
+    exit
+}
+
+#-----------------------------------------------------------------------------
+Function KillProcess
+{
+    if (-Not (Test-Path $p))
+    {
+        if (-Not ($silent))
+        {
+            echo "Error - pidfile not found.  Aborting."
+        }
+        exit
+    }
+
+    $t = @"
+    using System;
+    using System.Diagnostics;
+    using System.IO;
+    using System.Runtime.InteropServices;
+    using System.Threading;
+
+    namespace PowerStopper
+    {
+        public static class Stopper
+        {
+            delegate bool ConsoleCtrlDelegate(CtrlTypes CtrlType);
+
+            [DllImport("kernel32.dll", SetLastError = true)]
+            static extern bool AttachConsole(uint dwProcessId);
+
+            [DllImport("kernel32.dll", SetLastError = true, ExactSpelling = true)]
+            static extern bool FreeConsole();
+
+            enum CtrlTypes : uint
+            {
+                CTRL_C_EVENT = 0,
+                CTRL_BREAK_EVENT,
+                CTRL_CLOSE_EVENT,
+                CTRL_LOGOFF_EVENT = 5,
+                CTRL_SHUTDOWN_EVENT
+            }
+
+            [DllImport("kernel32.dll")]
+            [return: MarshalAs(UnmanagedType.Bool)]
+            private static extern bool GenerateConsoleCtrlEvent(CtrlTypes dwCtrlEvent, uint dwProcessGroupId);
+
+            [DllImport("kernel32.dll")]
+            static extern bool SetConsoleCtrlHandler(ConsoleCtrlDelegate HandlerRoutine, bool Add);
+
+            // Our output gets swallowed on ms-dos as we can't re-attach our console to the output of the cmd
+            // running the batch file.
+            public static void StopProgram(int pidToKill, int consolePid, bool silent)
+            {
+                Process proc = null;
+                try
+                {
+                    proc = Process.GetProcessById(pidToKill);
+                }
+                catch (ArgumentException)
+                {
+                    if (!silent)
+                        System.Console.WriteLine("Process " + pidToKill + " not found.  Aborting.");
+                    return;
+                }
+
+                if (!FreeConsole())
+                {
+                    if (!silent)
+                        System.Console.WriteLine("Failed to FreeConsole to attach to running cassandra process.  Aborting.");
+                    return;
+                }
+
+                if (AttachConsole((uint)pidToKill))
+                {
+                    //Disable Ctrl-C handling for our program
+                    SetConsoleCtrlHandler(null, true);
+                    GenerateConsoleCtrlEvent(CtrlTypes.CTRL_C_EVENT, 0);
+
+                    // Must wait here. If we don't and re-enable Ctrl-C
+                    // handling below too fast, we might terminate ourselves.
+                    proc.WaitForExit(2000);
+                    FreeConsole();
+
+                    // Re-attach to current console to write output
+                    if (consolePid >= 0)
+                        AttachConsole((uint)consolePid);
+
+                    // Re-enable Ctrl-C handling or any subsequently started
+                    // programs will inherit the disabled state.
+                    SetConsoleCtrlHandler(null, false);
+
+                    if (!silent)
+                        System.Console.WriteLine("Successfully sent ctrl+c to process with id: " + pidToKill + ".");
+                }
+                else
+                {
+                    if (!silent)
+                    {
+                        string errorMsg = new System.ComponentModel.Win32Exception(Marshal.GetLastWin32Error()).Message;
+                        System.Console.WriteLine("Error attaching to pid: " + pidToKill + ": " + Marshal.GetLastWin32Error() + " - " + errorMsg);
+                    }
+                }
+            }
+        }
+    }
+"@
+    # cygwin assumes environment variables are case sensitive which causes problems when
+    # the type dictionary references 'tmp' or 'temp' and throws a System.ArgumentException
+    $oldTmp = $env:TMP
+    $oldTemp = $env:Temp
+    $env:TMP=''
+    $env:TEMP=''
+    Add-Type -TypeDefinition $t
+    $env:TMP = $oldTmp
+    $env:TEMP = $oldTemp
+
+    $pidToKill = Get-Content $p
+    # If run in cygwin, we don't get the TITLE / pid combo in stop-server.bat but also don't need
+    # to worry about reattaching console output as it gets stderr/stdout even after the C#/C++
+    # FreeConsole calls.
+    if ($batchpid -eq "No")
+    {
+        $batchpid = -1
+    }
+
+    if ($f)
+    {
+        taskkill /f /pid $pidToKill
+    }
+    else
+    {
+        [PowerStopper.Stopper]::StopProgram($pidToKill, $batchpid, $silent)
+    }
+}
+
+#-----------------------------------------------------------------------------
+ValidateArguments
+KillProcess
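
Taken together, stop-server.bat discovers the pid of its own console window via the random TITLE/TASKLIST trick and hands off to stop-server.ps1, which P/Invokes FreeConsole/AttachConsole and GenerateConsoleCtrlEvent to deliver a clean ctrl+c to the running Cassandra JVM, falling back to taskkill when forced. A minimal invocation sketch, assuming the server was started with a pidfile at ..\pidfile (that path is an assumption, not part of this change):

    cd %CASSANDRA_HOME%\bin
    stop-server.bat -p ..\pidfile            REM graceful: sends ctrl+c to the server process
    stop-server.bat -p ..\pidfile -f         REM forced: taskkill /f on the pid read from the pidfile
    stop-server.bat -p ..\pidfile -silent    REM suppress success/failure output
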
diff --git a/build.xml b/build.xml
index 829c873..84b10ed 100644
--- a/build.xml
+++ b/build.xml
@@ -25,7 +25,7 @@
     <property name="debuglevel" value="source,lines,vars"/>
 
     <!-- default version and SCM information -->
-    <property name="base.version" value="2.0.10"/>
+    <property name="base.version" value="2.1.1"/>
     <property name="scm.connection" value="scm:git://git.apache.org/cassandra.git"/>
     <property name="scm.developerConnection" value="scm:git://git.apache.org/cassandra.git"/>
     <property name="scm.url" value="http://git-wip-us.apache.org/repos/asf?p=cassandra.git;a=tree"/>
@@ -37,6 +37,7 @@
     <property name="build.src.resources" value="${basedir}/src/resources"/>
     <property name="build.src.gen-java" value="${basedir}/src/gen-java"/>
     <property name="build.lib" value="${basedir}/lib"/>
+    <property name="build.tools.lib" value="${basedir}/tools/lib"/>
     <property name="build.dir" value="${basedir}/build"/>
     <property name="build.dir.lib" value="${basedir}/build/lib"/>
     <property name="build.test.dir" value="${build.dir}/test"/>
@@ -56,6 +57,7 @@
     <property name="test.data" value="${test.dir}/data"/>
     <property name="test.name" value="*Test"/>
     <property name="test.methods" value=""/>
+    <property name="test.runners" value="1"/>
     <property name="test.unit.src" value="${test.dir}/unit"/>
     <property name="test.long.src" value="${test.dir}/long"/>
     <property name="test.pig.src" value="${test.dir}/pig"/>
@@ -91,6 +93,9 @@
     <property name="test.timeout" value="60000" />
     <property name="test.long.timeout" value="600000" />
 
+    <!-- default for cql tests. Can be overridden by -Dcassandra.test.use_prepared=false -->
+    <property name="cassandra.test.use_prepared" value="true" />
+
     <!-- http://cobertura.sourceforge.net/ -->
     <property name="cobertura.version" value="2.0.3"/>
     <property name="cobertura.build.dir" value="${build.dir}/cobertura"/>
@@ -189,7 +194,7 @@
     <target name="gen-cli-grammar" depends="check-gen-cli-grammar" unless="cliUpToDate">
       <echo>Building Grammar ${build.src.java}/org/apache/cassandra/cli/Cli.g  ....</echo>
       <java classname="org.antlr.Tool"
-            classpath="${build.lib}/antlr-3.2.jar"
+        classpath="${build.dir.lib}/jars/antlr-3.5.2.jar;${build.lib}/antlr-runtime-3.5.2.jar;${build.lib}/stringtemplate-4.0.2.jar"
             fork="true"
             failonerror="true">
          <jvmarg value="-Xmx512M" />
@@ -216,7 +221,7 @@
     <target name="gen-cql2-grammar" depends="check-gen-cql2-grammar" unless="cql2current">
       <echo>Building Grammar ${build.src.java}/org/apache/cassandra/cql/Cql.g  ...</echo>
       <java classname="org.antlr.Tool"
-            classpath="${build.lib}/antlr-3.2.jar"
+            classpath="${build.dir.lib}/jars/antlr-3.5.2.jar;${build.lib}/antlr-runtime-3.5.2.jar;${build.lib}/stringtemplate-4.0.2.jar"
             fork="true"
             failonerror="true">
          <jvmarg value="-Xmx512M" />
@@ -228,7 +233,7 @@
     <target name="gen-cql3-grammar" depends="check-gen-cql3-grammar" unless="cql3current">
       <echo>Building Grammar ${build.src.java}/org/apache/cassandra/cql3/Cql.g  ...</echo>
       <java classname="org.antlr.Tool"
-            classpath="${build.lib}/antlr-3.2.jar"
+            classpath="${build.dir.lib}/jars/antlr-3.5.2.jar;${build.lib}/antlr-runtime-3.5.2.jar;${build.lib}/stringtemplate-4.0.2.jar"
             fork="true"
             failonerror="true">
          <jvmarg value="-Xmx512M" />
@@ -345,65 +350,86 @@
           <dependency groupId="org.xerial.snappy" artifactId="snappy-java" version="1.0.5"/>
           <dependency groupId="net.jpountz.lz4" artifactId="lz4" version="1.2.0"/>
           <dependency groupId="com.ning" artifactId="compress-lzf" version="0.8.4"/>
-          <dependency groupId="com.google.guava" artifactId="guava" version="15.0"/>
+          <dependency groupId="com.google.guava" artifactId="guava" version="16.0"/>
           <dependency groupId="commons-cli" artifactId="commons-cli" version="1.1"/>
           <dependency groupId="commons-codec" artifactId="commons-codec" version="1.2"/>
           <dependency groupId="org.apache.commons" artifactId="commons-lang3" version="3.1"/>
+          <dependency groupId="org.apache.commons" artifactId="commons-math3" version="3.2"/>
           <dependency groupId="com.googlecode.concurrentlinkedhashmap" artifactId="concurrentlinkedhashmap-lru" version="1.3"/>
-          <dependency groupId="org.antlr" artifactId="antlr" version="3.2"/>
+          <dependency groupId="org.antlr" artifactId="antlr" version="3.5.2"/>
+          <dependency groupId="org.antlr" artifactId="antlr-runtime" version="3.5.2"/>
+          <dependency groupId="org.antlr" artifactId="stringtemplate" version="4.0.2"/>
           <dependency groupId="org.slf4j" artifactId="slf4j-api" version="1.7.2"/>
-          <dependency groupId="org.slf4j" artifactId="slf4j-log4j12" version="1.7.2"/>
+          <dependency groupId="ch.qos.logback" artifactId="logback-core" version="1.1.2"/>
+          <dependency groupId="ch.qos.logback" artifactId="logback-classic" version="1.1.2"/>
           <dependency groupId="org.codehaus.jackson" artifactId="jackson-core-asl" version="1.9.2"/>
           <dependency groupId="org.codehaus.jackson" artifactId="jackson-mapper-asl" version="1.9.2"/>
           <dependency groupId="jline" artifactId="jline" version="1.0">
             <exclusion groupId="junit" artifactId="junit"/>
           </dependency>
           <dependency groupId="com.googlecode.json-simple" artifactId="json-simple" version="1.1"/>
-          <dependency groupId="com.github.stephenc.high-scale-lib" artifactId="high-scale-lib" version="1.1.2"/>
-          <dependency groupId="com.github.stephenc" artifactId="jamm" version="0.2.5"/>
-	   <dependency groupId="com.thinkaurelius.thrift" artifactId="thrift-server" version="0.3.7"/>
+          <dependency groupId="com.boundary" artifactId="high-scale-lib" version="1.0.6"/>
+          <dependency groupId="com.github.jbellis" artifactId="jamm" version="0.2.6"/>
+          <dependency groupId="com.thinkaurelius.thrift" artifactId="thrift-server" version="0.3.7">
+	      	<exclusion groupId="org.slf4j" artifactId="slf4j-log4j12"/>
+          </dependency>
           <dependency groupId="org.yaml" artifactId="snakeyaml" version="1.11"/>
           <dependency groupId="org.apache.thrift" artifactId="libthrift" version="0.9.1"/>
 
           <dependency groupId="junit" artifactId="junit" version="4.6" />
           <dependency groupId="commons-logging" artifactId="commons-logging" version="1.1.1"/>
-          <dependency groupId="org.apache.rat" artifactId="apache-rat" version="0.6">
+          <dependency groupId="org.apache.rat" artifactId="apache-rat" version="0.10">
              <exclusion groupId="commons-lang" artifactId="commons-lang"/>
           </dependency>
-          <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" version="1.0.3"/>
+          <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" version="1.0.3">
+          	<exclusion groupId="org.mortbay.jetty" artifactId="servlet-api"/>
+          </dependency>
           <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" version="1.0.3"/>
-          <dependency groupId="org.apache.pig" artifactId="pig" version="0.10.0"/>
-          <dependency groupId="net.java.dev.jna" artifactId="jna" version="3.2.7"/>
+          <dependency groupId="org.apache.pig" artifactId="pig" version="0.12.1"/>
+          <dependency groupId="net.java.dev.jna" artifactId="jna" version="4.0.0"/>
 
           <dependency groupId="net.sourceforge.cobertura" artifactId="cobertura" version="${cobertura.version}">
             <exclusion groupId="xerces" artifactId="xercesImpl"/>
           </dependency>
 
-          <dependency groupId="log4j" artifactId="log4j" version="1.2.16" />
           <dependency groupId="org.apache.cassandra" artifactId="cassandra-all" version="${version}" />
           <dependency groupId="org.apache.cassandra" artifactId="cassandra-thrift" version="${version}" />
           <dependency groupId="com.yammer.metrics" artifactId="metrics-core" version="2.2.0" />
           <dependency groupId="com.addthis.metrics" artifactId="reporter-config" version="2.1.0" />
-          <dependency groupId="edu.stanford.ppl" artifactId="snaptree" version="0.1" />
           <dependency groupId="org.mindrot" artifactId="jbcrypt" version="0.3m" />
-          <dependency groupId="io.netty" artifactId="netty" version="3.6.6.Final" />
+          <dependency groupId="io.airlift" artifactId="airline" version="0.6" />
+          <dependency groupId="io.netty" artifactId="netty-all" version="4.0.23.Final" />
+          <dependency groupId="com.google.code.findbugs" artifactId="jsr305" version="2.0.2" />
+          <dependency groupId="com.clearspring.analytics" artifactId="stream" version="2.5.2" />
           <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" version="2.0.5" />
           <dependency groupId="net.sf.supercsv" artifactId="super-csv" version="2.1.0" />
+	      <dependency groupId="net.ju-n.compile-command-annotations" artifactId="compile-command-annotations" version="1.2.0" />
         </dependencyManagement>
         <developer id="alakshman" name="Avinash Lakshman"/>
-        <developer id="antelder" name="Anthony Elder"/>
+        <developer id="aleksey" name="Aleksey Yeschenko"/>
+        <developer id="amorton" name="Aaron Morton"/>
+        <developer id="benedict" name="Benedict Elliott Smith"/>
         <developer id="brandonwilliams" name="Brandon Williams"/>
+        <developer id="dbrosius" name="David Brosius"/>
         <developer id="eevans" name="Eric Evans"/>
         <developer id="gdusbabek" name="Gary Dusbabek"/>
         <developer id="goffinet" name="Chris Goffinet"/>
         <developer id="jaakko" name="Laine Jaakko Olavi"/>
         <developer id="jake" name="T Jake Luciani"/>
+        <developer id="jasonbrown" name="Jason Brown"/>
         <developer id="jbellis" name="Jonathan Ellis"/>
+        <developer id="jmckenzie" name="Josh McKenzie"/>
         <developer id="johan" name="Johan Oskarsson"/>
         <developer id="junrao" name="Jun Rao"/>
-        <developer id="mriou" name="Matthieu Riou"/>
+        <developer id="marcuse" name="Marcus Eriksson"/>
+        <developer id="mishail" name="Mikhail Stepura"/>
         <developer id="pmalik" name="Prashant Malik"/>
+        <developer id="scode" name="Peter Schuller"/>
         <developer id="slebresne" name="Sylvain Lebresne"/>
+        <developer id="tylerhobbs" name="Tyler Hobbs"/>
+        <developer id="vijay" name="Vijay Parthasarathy"/>
+        <developer id="xedin" name="Pavel Yaskevich"/>
+        <developer id="yukim" name="Yuki Morishita"/>
       </artifact:pom>
 
       <!-- each dependency set then defines the subset of the dependencies for that dependency set -->
@@ -418,8 +444,10 @@
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-core"/>
       	<dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster"/>
         <dependency groupId="org.apache.pig" artifactId="pig"/>
+      	<dependency groupId="com.google.code.findbugs" artifactId="jsr305"/>
+        <dependency groupId="org.antlr" artifactId="antlr"/>
         <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core"/>
-        <dependency groupId="net.java.dev.jna" artifactId="jna"/>
+	<dependency groupId="net.ju-n.compile-command-annotations" artifactId="compile-command-annotations"/>
       </artifact:pom>
 
       <artifact:pom id="coverage-deps-pom"
@@ -435,6 +463,9 @@
         <parent groupId="org.apache.cassandra"
                 artifactId="cassandra-parent"
                 version="${version}"/>
+        <!-- do NOT remove this, it breaks pig-test -->
+        <dependency groupId="org.slf4j" artifactId="slf4j-log4j12" version="1.7.2"/>
+        <dependency groupId="joda-time" artifactId="joda-time" version="2.3" />
       </artifact:pom>
 
       <!-- now the pom's for artifacts being deployed to Maven Central -->
@@ -454,25 +485,27 @@
         <dependency groupId="commons-cli" artifactId="commons-cli"/>
         <dependency groupId="commons-codec" artifactId="commons-codec"/>
         <dependency groupId="org.apache.commons" artifactId="commons-lang3"/>
+        <dependency groupId="org.apache.commons" artifactId="commons-math3"/>
         <dependency groupId="com.googlecode.concurrentlinkedhashmap" artifactId="concurrentlinkedhashmap-lru"/>
         <dependency groupId="org.antlr" artifactId="antlr"/>
+        <dependency groupId="org.antlr" artifactId="antlr-runtime"/>
+        <dependency groupId="org.antlr" artifactId="stringtemplate" version="4.0.2"/>
         <dependency groupId="org.slf4j" artifactId="slf4j-api"/>
         <dependency groupId="org.codehaus.jackson" artifactId="jackson-core-asl"/>
         <dependency groupId="org.codehaus.jackson" artifactId="jackson-mapper-asl"/>
         <dependency groupId="jline" artifactId="jline"/>
         <dependency groupId="com.googlecode.json-simple" artifactId="json-simple"/>
-        <dependency groupId="com.github.stephenc.high-scale-lib" artifactId="high-scale-lib"/>
+        <dependency groupId="com.boundary" artifactId="high-scale-lib"/>
         <dependency groupId="org.yaml" artifactId="snakeyaml"/>
-        <dependency groupId="edu.stanford.ppl" artifactId="snaptree"/>
         <dependency groupId="org.mindrot" artifactId="jbcrypt"/>
         <dependency groupId="com.yammer.metrics" artifactId="metrics-core"/>
         <dependency groupId="com.addthis.metrics" artifactId="reporter-config"/>
         <dependency groupId="com.thinkaurelius.thrift" artifactId="thrift-server" version="0.3.7"/>
+        <dependency groupId="com.clearspring.analytics" artifactId="stream" version="2.5.2" />
         <dependency groupId="net.sf.supercsv" artifactId="super-csv" version="2.1.0" />
 
-        <dependency groupId="log4j" artifactId="log4j"/>
-        <!-- cassandra has a hard dependency on log4j, so force slf4j's log4j provider at runtime -->
-        <dependency groupId="org.slf4j" artifactId="slf4j-log4j12" scope="runtime"/>
+        <dependency groupId="ch.qos.logback" artifactId="logback-core"/>
+        <dependency groupId="ch.qos.logback" artifactId="logback-classic"/>
 
         <dependency groupId="org.apache.thrift" artifactId="libthrift"/>
         <dependency groupId="org.apache.cassandra" artifactId="cassandra-thrift"/>
@@ -484,11 +517,11 @@
       	<dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" optional="true"/>
 
         <!-- don't need jna to run, but nice to have -->
-        <dependency groupId="net.java.dev.jna" artifactId="jna" optional="true"/>
+        <dependency groupId="net.java.dev.jna" artifactId="jna" version="4.0.0"/>
         
         <!-- don't need jamm unless running a server in which case it needs to be a -javagent to be used anyway -->
-        <dependency groupId="com.github.stephenc" artifactId="jamm"/>
-        <dependency groupId="io.netty" artifactId="netty"/>
+        <dependency groupId="com.github.jbellis" artifactId="jamm"/>
+        <dependency groupId="io.netty" artifactId="netty-all"/>
       </artifact:pom>
       <artifact:pom id="thrift-pom"
                     artifactId="cassandra-thrift"
@@ -570,6 +603,25 @@
       </copy>
     </target>
 
+    <target name="maven-ant-tasks-retrieve-pig-test" depends="maven-ant-tasks-init">
+      <artifact:dependencies pomRefId="test-deps-pom"
+                             filesetId="test-dependency-jars"
+                             sourcesFilesetId="test-dependency-sources"
+                             cacheDependencyRefs="true"
+                             dependencyRefsBuildFile="${build.dir}/test-dependencies.xml">
+        <remoteRepository refid="apache"/>
+        <remoteRepository refid="central"/>
+        <remoteRepository refid="java.net2"/>
+      </artifact:dependencies>
+      <copy todir="${build.dir.lib}/jars">
+        <fileset refid="test-dependency-jars"/>
+        <mapper type="flatten"/>
+      </copy>
+      <copy todir="${build.dir.lib}/sources">
+        <fileset refid="test-dependency-sources"/>
+        <mapper type="flatten"/>
+      </copy>
+    </target>
 
     <!--
        Generate thrift code.  We have targets to build java because
@@ -636,7 +688,7 @@
           <pathelement location="${test.conf}"/>
         </classpath>
         <jvmarg value="-Dstorage-config=${test.conf}"/>
-        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.5.jar" />
+        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.6.jar" />
         <jvmarg value="-ea"/>
       </java>
     </target>
@@ -652,13 +704,17 @@
             name="build-project">
         <echo message="${ant.project.name}: ${ant.file}"/>
         <!-- Order matters! -->
-        <javac debug="true" debuglevel="${debuglevel}"
-               destdir="${build.classes.thrift}" includeantruntime="false" source="${source.version}" target="${target.version}">
+        <javac fork="true"
+               debug="true" debuglevel="${debuglevel}"
+               destdir="${build.classes.thrift}" includeantruntime="false" source="${source.version}" target="${target.version}"
+               memorymaximumsize="512M">
             <src path="${interface.thrift.dir}/gen-java"/>
             <classpath refid="cassandra.classpath"/>
         </javac>
-        <javac debug="true" debuglevel="${debuglevel}"
-               destdir="${build.classes.main}" includeantruntime="false" source="${source.version}" target="${target.version}">
+        <javac fork="true"
+               debug="true" debuglevel="${debuglevel}"
+               destdir="${build.classes.main}" includeantruntime="false" source="${source.version}" target="${target.version}"
+               memorymaximumsize="512M">
             <src path="${build.src.java}"/>
             <src path="${build.src.gen-java}"/>
             <classpath refid="cassandra.classpath"/>
@@ -668,6 +724,7 @@
         <copy todir="${build.classes.main}">
             <fileset dir="${build.src.resources}" />
         </copy>
+	<copy todir="${basedir}/conf" file="${build.classes.main}/META-INF/hotspot_compiler"/>
     </target>
 
     <!-- Stress build file -->
@@ -688,6 +745,9 @@
                     <fileset dir="${build.lib}">
                         <include name="**/*.jar" />
                     </fileset>
+                    <fileset dir="${build.tools.lib}">
+                        <include name="**/*.jar" />
+                    </fileset>
                 </path>
             </classpath>
         </javac>
@@ -863,6 +923,10 @@
     <target name="artifacts" depends="jar,javadoc"
             description="Create Cassandra release artifacts">
       <mkdir dir="${dist.dir}"/>
+      <!-- fix the control linefeed so that builds on windows work on linux -->
+      <fixcrlf srcdir="bin" includes="**/*" excludes="**/*.bat, **/*.ps1" eol="lf" eof="remove" />
+      <fixcrlf srcdir="conf" includes="**/*" excludes="**/*.bat, **/*.ps1" eol="lf" eof="remove" />
+      <fixcrlf srcdir="tools/bin" includes="**/*" excludes="**/*.bat, **/*.ps1" eol="lf" eof="remove" />
       <copy todir="${dist.dir}/lib">
         <fileset dir="${build.lib}"/>
         <fileset dir="${build.dir}">
@@ -899,10 +963,18 @@
       <copy todir="${dist.dir}/tools/bin">
         <fileset dir="${basedir}/tools/bin"/>
       </copy>
+      <copy todir="${dist.dir}/tools/">
+        <fileset dir="${basedir}/tools/">
+            <include name="*.yaml"/>
+	</fileset>
+      </copy>
       <copy todir="${dist.dir}/tools/lib">
         <fileset dir="${build.dir}/tools/lib/">
             <include name="*.jar" />
         </fileset>
+        <fileset dir="${build.tools.lib}">
+            <include name="*.jar" />
+        </fileset>
       </copy>
       <artifact:writepom pomRefId="dist-pom" 
             file="${build.dir}/${final.name}-dist.pom"/>
@@ -1018,11 +1090,13 @@
   <macrodef name="testmacro">
     <attribute name="suitename" />
     <attribute name="inputdir" />
-    <attribute name="timeout" />
+    <attribute name="timeout" default="${test.timeout}" />
     <attribute name="forkmode" default="perTest"/>
     <element name="optjvmargs" implicit="true" optional="true" />
     <attribute name="filter" default="**/${test.name}.java"/>
     <attribute name="exclude" default="" />
+    <attribute name="filelist" default="" />
+    <attribute name="poffset" default="0"/>
     <sequential>
       <echo message="running @{suitename} tests"/>
       <mkdir dir="${build.test.dir}/cassandra"/>
@@ -1032,12 +1106,14 @@
         <formatter type="xml" usefile="true"/>
         <formatter type="brief" usefile="false"/>
         <jvmarg value="-Dstorage-config=${test.conf}"/>
-        <jvmarg value="-Dlog4j.configuration=log4j-junit.properties" />
         <jvmarg value="-Djava.awt.headless=true"/>
-        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.5.jar" />
+        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.6.jar" />
         <jvmarg value="-ea"/>
         <jvmarg value="-Xss256k"/>
-        <optjvmargs/>
+        <jvmarg value="-Dcassandra.memtable_row_overhead_computation_step=100"/>
+        <jvmarg value="-Dcassandra.test.use_prepared=${cassandra.test.use_prepared}"/>
+	<jvmarg value="-Dcassandra.test.offsetseed=@{poffset}"/>        
+	<optjvmargs/>
         <classpath>
           <path refid="cassandra.classpath" />
           <pathelement location="${test.classes}"/>
@@ -1049,6 +1125,7 @@
         </classpath>
         <batchtest todir="${build.test.dir}/output">
             <fileset dir="@{inputdir}" includes="@{filter}" excludes="@{exclude}"/>
+            <filelist dir="@{inputdir}" files="@{filelist}"/>
         </batchtest>
       </junit>
       <fail message="Some @{suitename} test(s) failed.">
@@ -1080,8 +1157,8 @@
         <pathelement location="${build.dir}/${ant.project.name}-thrift-${version}.jar" />
         <pathelement location="${build.lib}/libthrift-0.9.0.jar" />
         <pathelement location="${build.lib}/slf4j-api-1.7.2.jar" />
-        <pathelement location="${build.lib}/slf4j-log4j12-1.7.2.jar" />
-        <pathelement location="${build.lib}/log4j-1.2.16.jar" />
+        <pathelement location="${build.lib}/logback-core-1.1.2.jar" />
+        <pathelement location="${build.lib}/logback-classic-1.1.2.jar" />
         <pathelement location="${build.lib}/jackson-core-asl-1.9.2.jar" />
         <pathelement location="${build.lib}/jackson-mapper-asl-1.9.2.jar" />
         <fileset dir="${build.dir.lib}">
@@ -1091,7 +1168,7 @@
     </junit>
   </target>
 
-  <target name="test" depends="build-test" description="Execute unit tests" >
+  <target name="testold" depends="build-test" description="Execute unit tests">
     <testmacro suitename="unit" inputdir="${test.unit.src}" exclude="**/pig/*.java" timeout="${test.timeout}">
       <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
       <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
@@ -1099,6 +1176,20 @@
       <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
       <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
     </testmacro>
+    <fileset dir="${test.unit.src}">
+        <exclude name="**/pig/*.java" />
+    </fileset>
+  </target>
+  
+  <target name="testlist">
+    <testmacro suitename="${testlist.name}" inputdir="${test.unit.src}" filelist="${test.file.list}" poffset="${testlist.offset}" exclude="**/*.java" timeout="${test.timeout}">
+      <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
+      <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
+      <jvmarg value="-Dmigration-sstable-root=${test.data}/migration-sstables"/>
+      <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
+      <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
+      <jvmarg value="-Dcassandra.config.loader=org.apache.cassandra.OffsetAwareConfigurationLoader"/>
+    </testmacro>
   </target>
 
   <target name="testsome" depends="build-test" description="Execute specific unit tests" >
@@ -1121,6 +1212,9 @@
       <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
       <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
     </testmacro>
+    <fileset dir="${test.unit.src}">
+        <exclude name="**/pig/*.java" />
+    </fileset>
   </target>
 
   <target name="msg-ser-gen-test" depends="build-test" description="Generates message serializations">
@@ -1157,7 +1251,78 @@
     </testmacro>
   </target>
 
-  <target name="pig-test" depends="build-test" description="Excute Pig tests">
+  <target name="cql-test" depends="build-test" description="Execute CQL tests">
+    <sequential>
+      <echo message="running CQL tests"/>
+      <mkdir dir="${build.test.dir}/cassandra"/>
+      <mkdir dir="${build.test.dir}/output"/>
+      <junit fork="on" forkmode="once" failureproperty="testfailed" maxmemory="1024m" timeout="${test.timeout}">
+        <formatter type="brief" usefile="false"/>
+        <jvmarg value="-Dstorage-config=${test.conf}"/>
+        <jvmarg value="-Djava.awt.headless=true"/>
+        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.6.jar" />
+        <jvmarg value="-ea"/>
+        <jvmarg value="-Xss256k"/>
+        <jvmarg value="-Dcassandra.memtable_row_overhead_computation_step=100"/>
+        <jvmarg value="-Dcassandra.test.use_prepared=${cassandra.test.use_prepared}"/>
+        <classpath>
+          <path refid="cassandra.classpath" />
+          <pathelement location="${test.classes}"/>
+          <path refid="cobertura.classpath"/>
+          <pathelement location="${test.conf}"/>
+          <fileset dir="${test.lib}">
+            <include name="**/*.jar" />
+          </fileset>
+        </classpath>
+        <batchtest todir="${build.test.dir}/output">
+            <fileset dir="${test.unit.src}" includes="**/cql3/*Test.java">
+                <contains text="CQLTester" casesensitive="yes"/>
+            </fileset>
+        </batchtest>
+      </junit>
+      <fail message="Some CQL test(s) failed.">
+        <condition>
+            <and>
+            <isset property="testfailed"/>
+            <not>
+              <isset property="ant.test.failure.ignore"/>
+            </not>
+          </and>
+        </condition>
+      </fail>
+    </sequential>
+  </target>
+
+  <target name="cql-test-some" depends="build-test" description="Execute specific CQL tests" >
+    <sequential>
+      <echo message="running ${test.methods} tests from ${test.name}"/>
+      <mkdir dir="${build.test.dir}/cassandra"/>
+      <mkdir dir="${build.test.dir}/output"/>
+      <junit fork="on" forkmode="once" failureproperty="testfailed" maxmemory="1024m" timeout="${test.timeout}">
+        <sysproperty key="net.sourceforge.cobertura.datafile" file="${cobertura.datafile}"/>
+        <formatter type="brief" usefile="false"/>
+        <jvmarg value="-Dstorage-config=${test.conf}"/>
+        <jvmarg value="-Djava.awt.headless=true"/>
+        <jvmarg value="-javaagent:${basedir}/lib/jamm-0.2.6.jar" />
+        <jvmarg value="-ea"/>
+        <jvmarg value="-Xss256k"/>
+        <jvmarg value="-Dcassandra.test.use_prepared=${cassandra.test.use_prepared}"/>
+        <jvmarg value="-Dcassandra.memtable_row_overhead_computation_step=100"/>
+        <classpath>
+          <path refid="cassandra.classpath" />
+          <pathelement location="${test.classes}"/>
+          <path refid="cobertura.classpath"/>
+          <pathelement location="${test.conf}"/>
+          <fileset dir="${test.lib}">
+            <include name="**/*.jar" />
+          </fileset>
+        </classpath>
+        <test name="org.apache.cassandra.cql3.${test.name}" methods="${test.methods}" todir="${build.test.dir}/output"/>
+      </junit>
+    </sequential>
+  </target>
+
+  <target name="pig-test" depends="build-test,maven-ant-tasks-retrieve-pig-test" description="Excute Pig tests">
     <testmacro suitename="pig" inputdir="${test.pig.src}" 
                timeout="1200000">
     </testmacro>
@@ -1177,7 +1342,7 @@
     <delete file="${cobertura.datafile}"/>
 
     <cobertura-instrument todir="${cobertura.classes.dir}" datafile="${cobertura.datafile}">
-      <ignore regex="org.apache.log4j.*"/>
+      <ignore regex="ch.qos.logback.*"/>
 
       <fileset dir="${build.classes.main}">
         <include name="**/*.class"/>
@@ -1186,7 +1351,7 @@
         <exclude name="**/test/*.class"/>
         <!-- cobertura modifies the serialVersionUID of classes. Some of our unit tests rely on backward
         wire compatability of these classes.  It was easier to exlude them from instrumentation than to
-        force their serialVersinUIDs. -->
+        force their serialVersionUIDs. -->
         <exclude name="**/*Token.class"/>
         <exclude name="${cobertura.excludes}"/>
       </fileset>
@@ -1213,7 +1378,7 @@
   </target>
 
   <target name="rat-check" depends="rat-init">
-    <rat:report xmlns:rat="antlib:org.apache.rat.anttasks"
+    <rat:report xmlns:rat="antlib:org.apache.rat.anttasks"  
                 reportFile="${build.dir}/rat-report.log">
       <fileset dir="."  excludesfile=".rat-excludes" />
     </rat:report>
@@ -1249,6 +1414,62 @@
     </create-javadoc>
    </target>
 
+  <!-- Split test classes into n buckets and run across processes -->
+  <target name="test" depends="build-test" description="Parallel Test Runner">
+    <path id="all-test-classes-path">
+      <fileset dir="${test.unit.src}" excludes="**/pig/*.java" includes="**/${test.name}.java" />   
+    </path>
+    <property name="all-test-classes" refid="all-test-classes-path"/>
+    <script language="javascript"> <![CDATA[
+	var Integer = java.lang.Integer;
+	sep = project.getProperty("path.separator");
+	all = project.getProperty("all-test-classes").split(sep);
+	dir = project.getProperty("test.unit.src");
+
+	numRunners = parseInt(project.getProperty("test.runners"));  	
+	
+	buckets = new Array(numRunners);
+	for (i = 0; i < all.length; i++) {
+	    bucketNum = i % numRunners;
+	    if (buckets[bucketNum] == undefined) 
+		buckets[bucketNum] = "";
+	    else
+		buckets[bucketNum] += ",";
+	
+	    buckets[bucketNum] += all[i];
+	}
+
+
+	var p = project.createTask('parallel');
+	p.setThreadCount(numRunners);
+
+  	for (i = 0; i < buckets.length; i++) {
+
+	    if (buckets[i] == undefined) continue;
+
+	    task = project.createTask( 'antcall' );
+
+	    task.setTarget("testlist");
+	    param = task.createParam();
+	    param.setName("test.file.list");
+	    param.setValue("" + buckets[i]);
+
+	    param = task.createParam();
+	    param.setName("testlist.name");
+	    param.setValue("test bucket "+i);	  
+
+	    param = task.createParam();
+	    param.setName("testlist.offset");
+	    param.setValue("" + i);
+
+	    p.addTask(task); 
+  	}
+	
+	p.perform();
+	  			    
+]]> </script>
+  </target>
+
   <!-- Generate Eclipse project description files -->
   <target name="generate-eclipse-files" depends="build" description="Generate eclipse files">
     <echo file=".project"><![CDATA[<?xml version="1.0" encoding="UTF-8"?>
@@ -1291,10 +1512,13 @@
  	 <fileset dir="build/lib/jars">
   	    <include name="**/*.jar" />
   	 </fileset>
+  	 <fileset dir="tools/lib">
+  	     <include name="**/*.jar" />
+  	 </fileset>
   	</path>
   	<property name="eclipse-project-libs" refid="eclipse-project-libs-path"/>
   	<script language="javascript"> <![CDATA[
-  		importClass(java.io.File);
+  		var File = java.io.File;
   		jars = project.getProperty("eclipse-project-libs").split(project.getProperty("path.separator"));
   		
   		cp = "";
diff --git a/conf/README.txt b/conf/README.txt
index c3fd98d..e44d4a3 100644
--- a/conf/README.txt
+++ b/conf/README.txt
@@ -2,10 +2,12 @@
 ============================
 
 cassandra.yaml: main Cassandra configuration file
-log4j-server.proprties: log4j configuration file for Cassandra server
+logback.xml: logback configuration file for Cassandra server
 
 
 Optional configuration files
 ============================
 
 cassandra-topology.properties: used by PropertyFileSnitch
+
+
diff --git a/conf/cassandra-env.ps1 b/conf/cassandra-env.ps1
new file mode 100644
index 0000000..47f4fa3
--- /dev/null
+++ b/conf/cassandra-env.ps1
@@ -0,0 +1,402 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: All param tuning can be done in the SetCassandraEnvironment Function below
+
+#-----------------------------------------------------------------------------
+Function SetCassandraHome()
+{
+    if (! $env:CASSANDRA_HOME)
+    {
+        $cwd = [System.IO.Directory]::GetCurrentDirectory()
+        $cwd = Split-Path $cwd -parent
+        $env:CASSANDRA_HOME = $cwd -replace "\\", "/"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function SetCassandraMain()
+{
+    if (! $env:CASSANDRA_MAIN)
+    {
+        $env:CASSANDRA_MAIN="org.apache.cassandra.service.CassandraDaemon"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function BuildClassPath
+{
+    $cp = """$env:CASSANDRA_HOME\conf"""
+    foreach ($file in Get-ChildItem "$env:CASSANDRA_HOME\lib\*.jar")
+    {
+        $file = $file -replace "\\", "/"
+        $cp = $cp + ";" + """$file"""
+    }
+
+    # Add build/classes/main so it works in development
+    $cp = $cp + ";" + """$env:CASSANDRA_HOME\build\classes\main"";""$env:CASSANDRA_HOME\build\classes\thrift"""
+    $env:CLASSPATH=$cp
+}
+
+#-----------------------------------------------------------------------------
+Function CalculateHeapSizes
+{
+    # Check if swapping is enabled on the host and warn if so - reference CASSANDRA-7316
+
+    $osInfo = Get-WmiObject -class "Win32_computersystem"
+    $autoPage = $osInfo.AutomaticManagedPageFile
+
+    if ($autoPage)
+    {
+        echo "*---------------------------------------------------------------------*"
+        echo "*---------------------------------------------------------------------*"
+        echo ""
+        echo "    WARNING!  Automatic page file configuration detected."
+        echo "    It is recommended that you disable swap when running Cassandra"
+        echo "    for performance and stability reasons."
+        echo ""
+        echo "*---------------------------------------------------------------------*"
+        echo "*---------------------------------------------------------------------*"
+    }
+    else
+    {
+        $pageFileInfo = Get-WmiObject -class "Win32_PageFileSetting" -EnableAllPrivileges
+        $pageFileCount = $PageFileInfo.Count
+        if ($pageFileInfo)
+        {
+            $files = @()
+            $sizes = @()
+            $hasSizes = $FALSE
+
+            # PageFileCount isn't populated and obj comes back as single if there's only 1
+            if ([string]::IsNullOrEmpty($PageFileCount))
+            {
+                $PageFileCount = 1
+                $files += $PageFileInfo.Name
+                if ($PageFileInfo.MaximumSize -ne 0)
+                {
+                    $hasSizes = $TRUE
+                    $sizes += $PageFileInfo.MaximumSize
+                }
+            }
+            else
+            {
+                for ($i = 0; $i -le $PageFileCount; $i++)
+                {
+                    $files += $PageFileInfo[$i].Name
+                    if ($PageFileInfo[$i].MaximumSize -ne 0)
+                    {
+                        $hasSizes = $TRUE
+                        $sizes += $PageFileInfo[$i].MaximumSize
+                    }
+                }
+            }
+
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+            echo ""
+            echo "    WARNING!  $PageFileCount swap file(s) detected"
+            for ($i = 0; $i -lt $PageFileCount; $i++)
+            {
+                $toPrint = "        Name: " + $files[$i]
+                if ($hasSizes)
+                {
+                    $toPrint = $toPrint + " Size: " + $sizes[$i]
+                    $toPrint = $toPrint -replace [Environment]::NewLine, ""
+                }
+                echo $toPrint
+            }
+            echo "    It is recommended that you disable swap when running Cassandra"
+            echo "    for performance and stability reasons."
+            echo ""
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+        }
+    }
+
+    # Validate that we need to run this function and that our config is good
+    if ($env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE)
+    {
+        return
+    }
+    if (($env:MAX_HEAP_SIZE -and !$env:HEAP_NEWSIZE) -or (!$env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE))
+    {
+        echo "please set or unset MAX_HEAP_SIZE and HEAP_NEWSIZE in pairs"
+        exit 1
+    }
+
+    $memObject = Get-WMIObject -class win32_physicalmemory
+    if ($memObject -eq $null)
+    {
+        echo "WARNING!  Could not determine system memory.  Defaulting to 2G heap, 512M newgen.  Manually override in conf/cassandra-env.ps1 for different heap values."
+        $env:MAX_HEAP_SIZE = "2048M"
+        $env:HEAP_NEWSIZE = "512M"
+        return
+    }
+
+    $memory = ($memObject | Measure-Object Capacity -Sum).sum
+    $memoryMB = [Math]::Truncate($memory / (1024*1024))
+
+    $cpu = gwmi Win32_ComputerSystem | Select-Object NumberOfLogicalProcessors
+    $systemCores = $cpu.NumberOfLogicalProcessors
+
+    # set max heap size based on the following
+    # max(min(1/2 ram, 1024MB), min(1/4 ram, 8GB))
+    # calculate 1/2 ram and cap to 1024MB
+    # calculate 1/4 ram and cap to 8192MB
+    # pick the max
+    $halfMem = [Math]::Truncate($memoryMB / 2)
+    $quarterMem = [Math]::Truncate($halfMem / 2)
+
+    if ($halfMem -gt 1024)
+    {
+        $halfMem = 1024
+    }
+    if ($quarterMem -gt 8192)
+    {
+        $quarterMem = 8192
+    }
+
+    $maxHeapMB = ""
+    if ($halfMem -gt $quarterMem)
+    {
+        $maxHeapMB = $halfMem
+    }
+    else
+    {
+        $maxHeapMB = $quarterMem
+    }
+    $env:MAX_HEAP_SIZE = [System.Convert]::ToString($maxHeapMB) + "M"
+
+    # Young gen: min(max_sensible_per_modern_cpu_core * num_cores, 1/4
+    $maxYGPerCore = 100
+    $maxYGTotal = $maxYGPerCore * $systemCores
+    $desiredYG = [Math]::Truncate($maxHeapMB / 4)
+
+    if ($desiredYG -gt $maxYGTotal)
+    {
+        $env:HEAP_NEWSIZE = [System.Convert]::ToString($maxYGTotal) + "M"
+    }
+    else
+    {
+        $env:HEAP_NEWSIZE = [System.Convert]::ToString($desiredYG) + "M"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function ParseJVMInfo
+{
+    # grab info about the JVM
+    $pinfo = New-Object System.Diagnostics.ProcessStartInfo
+    $pinfo.FileName = "$env:JAVA_BIN"
+    $pinfo.RedirectStandardError = $true
+    $pinfo.RedirectStandardOutput = $true
+    $pinfo.UseShellExecute = $false
+    $pinfo.Arguments = "-version"
+    $p = New-Object System.Diagnostics.Process
+    $p.StartInfo = $pinfo
+    $p.Start() | Out-Null
+    $p.WaitForExit()
+    $stderr = $p.StandardError.ReadToEnd()
+
+    $sa = $stderr.Split("""")
+    $env:JVM_VERSION = $sa[1]
+
+    if ($stderr.Contains("OpenJDK"))
+    {
+        $env:JVM_VENDOR = "OpenJDK"
+    }
+    elseif ($stderr.Contains("Java(TM)"))
+    {
+        $env:JVM_VENDOR = "Oracle"
+    }
+    else
+    {
+        $JVM_VENDOR = "other"
+    }
+
+    $pa = $sa[1].Split("_")
+    $env:JVM_PATCH_VERSION=$pa[1]
+
+    # get 64-bit vs. 32-bit
+    $pinfo.Arguments = "-d64 -version"
+    $pArch = New-Object System.Diagnostics.Process
+    $p.StartInfo = $pinfo
+    $p.Start() | Out-Null
+    $p.WaitForExit()
+    $stderr = $p.StandardError.ReadToEnd()
+
+    if ($stderr.Contains("Error"))
+    {
+        $env:JVM_ARCH = "32-bit"
+    }
+    else
+    {
+        $env:JVM_ARCH = "64-bit"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function SetCassandraEnvironment
+{
+    echo "Setting up Cassandra environment"
+    if (Test-Path Env:\JAVA_HOME)
+    {
+        $env:JAVA_BIN = "$env:JAVA_HOME\bin\java.exe"
+    }
+    elseif (Get-Command "java.exe")
+    {
+        $env:JAVA_BIN = "java.exe"
+    }
+    else
+    {
+        echo "ERROR!  No JAVA_HOME set and could not find java.exe in the path."
+        exit
+    }
+    SetCassandraHome
+    $env:CASSANDRA_CONF = "$env:CASSANDRA_HOME\conf"
+    $env:CASSANDRA_PARAMS="-Dcassandra -Dlogback.configurationFile=logback.xml"
+    SetCassandraMain
+    BuildClassPath
+
+    # Override these to set the amount of memory to allocate to the JVM at
+    # start-up. For production use you may wish to adjust this for your
+    # environment. MAX_HEAP_SIZE is the total amount of memory dedicated
+    # to the Java heap; HEAP_NEWSIZE refers to the size of the young
+    # generation. Both MAX_HEAP_SIZE and HEAP_NEWSIZE should be either set
+    # or not (if you set one, set the other).
+    #
+    # The main trade-off for the young generation is that the larger it
+    # is, the longer GC pause times will be. The shorter it is, the more
+    # expensive GC will be (usually).
+    #
+    # The example HEAP_NEWSIZE assumes a modern 8-core+ machine for decent
+    # times. If in doubt, and if you do not particularly want to tweak, go
+    # 100 MB per physical CPU core.
+
+    #$env:MAX_HEAP_SIZE="4096M"
+    #$env:HEAP_NEWSIZE="800M"
+    CalculateHeapSizes
+
+    ParseJVMInfo
+    # add the jamm javaagent
+    if (($env:JVM_VENDOR -ne "OpenJDK") -or ($env:JVM_VERSION.CompareTo("1.6.0") -eq 1) -or
+        (($env:JVM_VERSION -eq "1.6.0") -and ($env:JVM_PATCH_VERSION.CompareTo("22") -eq 1)))
+    {
+        $env:JVM_OPTS = "$env:JVM_OPTS -javaagent:""$env:CASSANDRA_HOME\lib\jamm-0.2.6.jar"""
+    }
+
+    # enable assertions.  disabling this in production will give a modest
+    # performance benefit (around 5%).
+    $env:JVM_OPTS = "$env:JVM_OPTS -ea"
+
+    # Specifies the default port over which Cassandra will be available for
+    # JMX connections.
+    $JMX_PORT="7199"
+
+    $env:JVM_OPTS = "$env:JVM_OPTS -Dlog4j.defaultInitOverride=true"
+
+    # some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSClassUnloadingEnabled"
+
+    # enable thread priorities, primarily so we can give periodic tasks
+    # a lower priority to avoid interfering with client workload
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseThreadPriorities"
+    # allows lowering thread priority without being root.  see
+    # http://tech.stolsvik.com/2010/01/linux-java-thread-priorities-workar
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:ThreadPriorityPolicy=42"
+
+    # min and max heap sizes should be set to the same value to avoid
+    # stop-the-world GC pauses during resize, and so that we can lock the
+    # heap in memory on startup to prevent any of it from being swapped
+    # out.
+    $env:JVM_OPTS="$env:JVM_OPTS -Xms$env:MAX_HEAP_SIZE"
+    $env:JVM_OPTS="$env:JVM_OPTS -Xmx$env:MAX_HEAP_SIZE"
+    $env:JVM_OPTS="$env:JVM_OPTS -Xmn$env:HEAP_NEWSIZE"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+HeapDumpOnOutOfMemoryError"
+
+    # Per-thread stack size.
+    $env:JVM_OPTS="$env:JVM_OPTS -Xss256k"
+
+    # Larger interned string table, for gossip's benefit (CASSANDRA-6410)
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:StringTableSize=1000003"
+
+    # GC tuning options
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseParNewGC"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseConcMarkSweepGC"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSParallelRemarkEnabled"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:SurvivorRatio=8"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:MaxTenuringThreshold=1"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:CMSInitiatingOccupancyFraction=75"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseCMSInitiatingOccupancyOnly"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseTLAB"
+    if (($env:JVM_VERSION.CompareTo("1.7") -eq 1) -and ($env:JVM_ARCH -eq "64-Bit"))
+    {
+        $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseCondCardMark"
+    }
+    if ( (($env:JVM_VERSION.CompareTo("1.7") -ge 0) -and ($env:JVM_PATCH_VERSION.CompareTo("60") -ge 0)) -or
+         ($env:JVM_VERSION.CompareTo("1.8") -ge 0))
+    {
+        $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSParallelInitialMarkEnabled -XX:+CMSEdenChunksRecordAlways"
+    }
+
+    # GC logging options -- uncomment to enable
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCDetails"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCDateStamps"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintHeapAtGC"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintTenuringDistribution"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCApplicationStoppedTime"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintPromotionFailure"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:PrintFLSStatistics=1"
+    # $env:JVM_OPTS="$env:JVM_OPTS -Xloggc:/var/log/cassandra/gc-`date +%s`.log"
+
+    # If you are using JDK 6u34 7u2 or later you can enable GC log rotation
+    # don't stick the date in the log name if rotation is on.
+    # $env:JVM_OPTS="$env:JVM_OPTS -Xloggc:/var/log/cassandra/gc.log"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseGCLogFileRotation"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:NumberOfGCLogFiles=10"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:GCLogFileSize=10M"
+
+    # Configure the following for JEMallocAllocator and if jemalloc is not available in the system
+    # library path (Example: /usr/local/lib/). Usually "make install" will do the right thing.
+    # set LD_LIBRARY_PATH=<JEMALLOC_HOME>/lib/
+    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.library.path=<JEMALLOC_HOME>/lib/"
+
+    # uncomment to have Cassandra JVM listen for remote debuggers/profilers on port 1414
+    # $env:JVM_OPTS="$env:JVM_OPTS -Xdebug -Xnoagent -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=1414"
+
+    # Prefer binding to IPv4 network intefaces (when net.ipv6.bindv6only=1). See
+    # http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6342561 (short version:
+    # comment out this entry to enable IPv6 support).
+    $env:JVM_OPTS="$env:JVM_OPTS -Djava.net.preferIPv4Stack=true"
+
+    # jmx: metrics and administration interface
+    #
+    # add this if you're having trouble connecting:
+    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.rmi.server.hostname=<public name>"
+    #
+    # see
+    # https://blogs.oracle.com/jmxetc/entry/troubleshooting_connection_problems_in_jconsole
+    # for more on configuring JMX through firewalls, etc. (Short version:
+    # get it working with no firewall first.)
+    $env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT"
+    $env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false"
+    $env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.authenticate=false"
+    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.password.file=/etc/cassandra/jmxremote.password"
+    $env:JVM_OPTS="$env:JVM_OPTS $JVM_EXTRA_OPTS"
+
+    $env:JVM_OPTS = "$env:JVM_OPTS -Dlog4j.configuration=log4j-server.properties"
+}
diff --git a/conf/cassandra-env.sh b/conf/cassandra-env.sh
index 3544426..875cbeb 100644
--- a/conf/cassandra-env.sh
+++ b/conf/cassandra-env.sh
@@ -95,7 +95,12 @@
 JVM_PATCH_VERSION=${jvmver#*_}
 
 if [ "$JVM_VERSION" \< "1.7" ] ; then
-    echo "Cassandra 2.0 and later require Java 7 or later."
+    echo "Cassandra 2.0 and later require Java 7u25 or later."
+    exit 1;
+fi
+
+if [ "$JVM_VERSION" \< "1.8" ] && [ "$JVM_PATCH_VERSION" -lt "25" ] ; then
+    echo "Cassandra 2.0 and later require Java 7u25 or later."
     exit 1;
 fi
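The guard above combines two tests: anything below 1.7 is rejected outright, a 1.7 JVM must additionally be at patch level 25 or higher, and a 1.8 JVM skips the patch check entirely. A standalone sketch with hypothetical values:

    JVM_VERSION=1.7.0        # hypothetical value parsed from `java -version`
    JVM_PATCH_VERSION=21     # hypothetical patch level (7u21)
    if [ "$JVM_VERSION" \< "1.7" ] ; then
        echo "Cassandra 2.0 and later require Java 7u25 or later."
        exit 1
    fi
    # "1.7.0" sorts below "1.8" and 21 < 25, so this 7u21 JVM is rejected here;
    # a 1.8.x JVM makes the first clause false and the patch level is never consulted.
    if [ "$JVM_VERSION" \< "1.8" ] && [ "$JVM_PATCH_VERSION" -lt "25" ] ; then
        echo "Cassandra 2.0 and later require Java 7u25 or later."
        exit 1
    fi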
 
@@ -168,7 +173,7 @@
 JVM_OPTS="$JVM_OPTS -ea"
 
 # add the jamm javaagent
-JVM_OPTS="$JVM_OPTS -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.5.jar"
+JVM_OPTS="$JVM_OPTS -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.6.jar"
 
 # some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
 JVM_OPTS="$JVM_OPTS -XX:+CMSClassUnloadingEnabled"
@@ -212,12 +217,10 @@
 JVM_OPTS="$JVM_OPTS -XX:CMSInitiatingOccupancyFraction=75"
 JVM_OPTS="$JVM_OPTS -XX:+UseCMSInitiatingOccupancyOnly"
 JVM_OPTS="$JVM_OPTS -XX:+UseTLAB"
+JVM_OPTS="$JVM_OPTS -XX:CompileCommandFile=$CASSANDRA_CONF/hotspot_compiler"
+JVM_OPTS="$JVM_OPTS -XX:CMSWaitDuration=10000"
 
 # note: bash evals '1.7.x' as > '1.7' so this is really a >= 1.7 jvm check
-if { [ "$JVM_VERSION" \> "1.7" ] && [ "$JVM_VERSION" \< "1.8.0" ] && [ "$JVM_PATCH_VERSION" -ge "60" ]; } || [ "$JVM_VERSION" \> "1.8" ] ; then
-    JVM_OPTS="$JVM_OPTS -XX:+CMSParallelInitialMarkEnabled -XX:+CMSEdenChunksRecordAlways"
-fi
-
 if [ "$JVM_ARCH" = "64-Bit" ] ; then
     JVM_OPTS="$JVM_OPTS -XX:+UseCondCardMark"
 fi
@@ -246,6 +249,9 @@
 # uncomment to have Cassandra JVM listen for remote debuggers/profilers on port 1414
 # JVM_OPTS="$JVM_OPTS -Xdebug -Xnoagent -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=1414"
 
+# uncomment to have Cassandra JVM log internal method compilation (developers only)
+# JVM_OPTS="$JVM_OPTS -XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation"
+
 # Prefer binding to IPv4 network intefaces (when net.ipv6.bindv6only=1). See
 # http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6342561 (short version:
 # comment out this entry to enable IPv6 support).
@@ -260,9 +266,20 @@
 # https://blogs.oracle.com/jmxetc/entry/troubleshooting_connection_problems_in_jconsole
 # for more on configuring JMX through firewalls, etc. (Short version:
 # get it working with no firewall first.)
+
+# To use mx4j, an HTML interface for JMX, add mx4j-tools.jar to the lib/
+# directory.
+# See http://wiki.apache.org/cassandra/Operations#Monitoring_with_MX4J
+# By default mx4j listens on 0.0.0.0:8081. Uncomment the following lines
+# to control its listen address and port.
+#MX4J_ADDRESS="-Dmx4jaddress=127.0.0.1"
+#MX4J_PORT="-Dmx4jport=8081"
+
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.rmi.port=$JMX_PORT"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.authenticate=false"
 #JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.password.file=/etc/cassandra/jmxremote.password"
+JVM_OPTS="$JVM_OPTS $MX4J_ADDRESS"
+JVM_OPTS="$JVM_OPTS $MX4J_PORT"
 JVM_OPTS="$JVM_OPTS $JVM_EXTRA_OPTS"
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 5eaffc2..c95c68c 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -30,10 +30,10 @@
 # that do not have vnodes enabled.
 # initial_token:
 
+# See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
 # of data centers to enable per-datacenter.
 # hinted_handoff_enabled: DC1,DC2
-# See http://wiki.apache.org/cassandra/HintedHandoff
 hinted_handoff_enabled: true
 # this defines the maximum amount of time a dead host will have hints
 # generated.  After it has been dead this long, new hints for it will not be
@@ -94,11 +94,14 @@
 # Directories where Cassandra should store data on disk.  Cassandra
 # will spread data evenly across them, subject to the granularity of
 # the configured compaction strategy.
-data_file_directories:
-    - /var/lib/cassandra/data
+# If not set, the default directory is $CASSANDRA_HOME/data/data.
+# data_file_directories:
+#     - /var/lib/cassandra/data
 
-# commit log
-commitlog_directory: /var/lib/cassandra/commitlog
+# commit log.  when running on magnetic HDD, this should be a
+# separate spindle than the data directories.
+# If not set, the default directory is $CASSANDRA_HOME/data/commitlog.
+# commitlog_directory: /var/lib/cassandra/commitlog
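With both directory settings commented out as above, Cassandra falls back to locations under its install directory rather than /var/lib/cassandra. A rough sketch of where the defaults land (install path assumed; the Debian package restores the /var/lib/cassandra paths via 001cassandra_yaml_dirs.dpatch further down):

    CASSANDRA_HOME=/opt/cassandra            # assumed install location
    ls "$CASSANDRA_HOME/data/data"           # data_file_directories default
    ls "$CASSANDRA_HOME/data/commitlog"      # commitlog_directory default
    ls "$CASSANDRA_HOME/data/saved_caches"   # saved_caches_directory default (see the later hunk)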
 
 # policy for data disk failures:
 # stop_paranoid: shut down gossip and Thrift even for single-sstable errors.
@@ -154,7 +157,7 @@
 row_cache_size_in_mb: 0
 
 # Duration in seconds after which Cassandra should
-# safe the row cache. Caches are saved to saved_caches_directory as specified
+# save the row cache. Caches are saved to saved_caches_directory as specified
 # in this configuration file.
 #
 # Saved caches greatly improve cold-start speeds, and is relatively cheap in
@@ -168,6 +171,32 @@
 # Disabled by default, meaning all keys are going to be saved
 # row_cache_keys_to_save: 100
 
+# Maximum size of the counter cache in memory.
+#
+# Counter cache helps to reduce counter locks' contention for hot counter cells.
+# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before
+# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration
+# of the lock hold, helping with hot counter cell updates, but will not allow skipping
+# the read entirely. Only the local (clock, count) tuple of a counter cell is kept
+# in memory, not the whole counter, so it's relatively cheap.
+#
+# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup.
+#
+# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache.
+# NOTE: if you perform counter deletes and rely on a low gc_grace_seconds, you should disable the counter cache.
+counter_cache_size_in_mb:
+
+# Duration in seconds after which Cassandra should
+# save the counter cache (keys only). Caches are saved to saved_caches_directory as
+# specified in this configuration file.
+#
+# Default is 7200 or 2 hours.
+counter_cache_save_period: 7200
+
+# Number of keys from the counter cache to save
+# Disabled by default, meaning all keys are going to be saved
+# counter_cache_keys_to_save: 100
+
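The counter cache configured by the three settings above behaves operationally like the key and row caches; for instance, it can be dropped at runtime with the matching nodetool verb (which also appears in the bash completion added later in this patch):

    nodetool invalidatecountercache    # discard the in-memory counter cache
    # sizing and persistence stay governed by counter_cache_size_in_mb,
    # counter_cache_save_period and counter_cache_keys_to_save in cassandra.yaml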
 # The off-heap memory allocator.  Affects storage engine metadata as
 # well as caches.  Experiments show that JEMAlloc saves some memory
 # than the native GCC allocator (i.e., JEMalloc is more
@@ -182,7 +211,8 @@
 # memory_allocator: NativeAllocator
 
 # saved caches
-saved_caches_directory: /var/lib/cassandra/saved_caches
+# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches.
+# saved_caches_directory: /var/lib/cassandra/saved_caches
 
 # commitlog_sync may be either "periodic" or "batch." 
 # When in batch mode, Cassandra won't ack writes until the commit log
@@ -195,9 +225,9 @@
 #
 # the other option is "periodic" where writes may be acked immediately
 # and the CommitLog is simply synced every commitlog_sync_period_in_ms
-# milliseconds.  By default this allows 1024*(CPU cores) pending
-# entries on the commitlog queue.  If you are writing very large blobs,
-# you should reduce that; 16*cores works reasonably well for 1MB blobs.
+# milliseconds.  commitlog_periodic_queue_size allows 1024*(CPU cores) pending
+# entries on the commitlog queue by default.  If you are writing very large
+# blobs, you should reduce that; 16*cores works reasonably well for 1MB blobs.
 # It should be at least as large as the concurrent_writes setting.
 commitlog_sync: periodic
 commitlog_sync_period_in_ms: 10000
@@ -231,44 +261,78 @@
 # bottleneck will be reads that need to fetch data from
 # disk. "concurrent_reads" should be set to (16 * number_of_drives) in
 # order to allow the operations to enqueue low enough in the stack
-# that the OS and drives can reorder them.
+# that the OS and drives can reorder them. Same applies to
+# "concurrent_counter_writes", since counter writes read the current
+# values before incrementing and writing them back.
 #
 # On the other hand, since writes are almost never IO bound, the ideal
 # number of "concurrent_writes" is dependent on the number of cores in
 # your system; (8 * number_of_cores) is a good rule of thumb.
 concurrent_reads: 32
 concurrent_writes: 32
+concurrent_counter_writes: 32
 
 # Total memory to use for sstable-reading buffers.  Defaults to
 # the smaller of 1/4 of heap or 512MB.
 # file_cache_size_in_mb: 512
 
-# Total memory to use for memtables.  Cassandra will flush the largest
-# memtable when this much memory is used.
-# If omitted, Cassandra will set it to 1/4 of the heap.
-# memtable_total_space_in_mb: 2048
+# Total permitted memory to use for memtables. Cassandra will stop 
+# accepting writes when the limit is exceeded until a flush completes,
+# and will trigger a flush based on memtable_cleanup_threshold
+# If omitted, Cassandra will set both to 1/4 the size of the heap.
+# memtable_heap_space_in_mb: 2048
+# memtable_offheap_space_in_mb: 2048
+
+# Ratio of occupied non-flushing memtable size to total permitted size
+# that will trigger a flush of the largest memtable.  A larger
+# memtable_cleanup_threshold means larger flushes and hence less compaction,
+# but also less concurrent flush activity, which can make it difficult to
+# keep your disks fed under heavy write load.
+#
+# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
+# memtable_cleanup_threshold: 0.11
+
+# Specify the way Cassandra allocates and manages memtable memory.
+# Options are:
+#   heap_buffers:    on heap nio buffers
+#   offheap_buffers: off heap (direct) nio buffers
+#   offheap_objects: native memory, eliminating nio buffer heap overhead
+memtable_allocation_type: heap_buffers
 
 # Total space to use for commitlogs.  Since commitlog segments are
 # mmapped, and hence use up address space, the default size is 32
-# on 32-bit JVMs, and 1024 on 64-bit JVMs.
+# on 32-bit JVMs, and 8192 on 64-bit JVMs.
 #
 # If space gets above this value (it will round up to the next nearest
 # segment multiple), Cassandra will flush every dirty CF in the oldest
 # segment and remove it.  So a small total commitlog space will tend
 # to cause more flush activity on less-active columnfamilies.
-# commitlog_total_space_in_mb: 4096
+# commitlog_total_space_in_mb: 8192
 
 # This sets the amount of memtable flush writer threads.  These will
 # be blocked by disk io, and each one will hold a memtable in memory
-# while blocked. If you have a large heap and many data directories,
-# you can increase this value for better flush performance.
-# By default this will be set to the amount of data directories defined.
-#memtable_flush_writers: 1
+# while blocked. 
+#
+# memtable_flush_writers defaults to the smaller of (number of disks,
+# number of cores), with a minimum of 2 and a maximum of 8.
+# 
+# If your data directories are backed by SSD, you should increase this
+# to the number of cores.
+#memtable_flush_writers: 8
 
-# the number of full memtables to allow pending flush, that is,
-# waiting for a writer thread.  At a minimum, this should be set to
-# the maximum number of secondary indexes created on a single CF.
-memtable_flush_queue_size: 4
+# A fixed memory pool size in MB for SSTable index summaries. If left
+# empty, this will default to 5% of the heap size. If the memory usage of
+# all index summaries exceeds this limit, SSTables with low read rates will
+# shrink their index summaries in order to meet this limit.  However, this
+# is a best-effort process. In extreme conditions Cassandra may need to use
+# more than this amount of memory.
+index_summary_capacity_in_mb:
+
+# How frequently index summaries should be resampled.  This is done
+# periodically to redistribute memory from the fixed-size pool to sstables
+# proportional to their recent read rates.  Setting to -1 will disable this
+# process, leaving existing index summaries at their current sampling level.
+index_summary_resize_interval_in_minutes: 60
 
 # Whether to, when doing sequential writing, fsync() at intervals in
 # order to force the operating system to flush the dirty
@@ -285,17 +349,20 @@
 # encryption_options
 ssl_storage_port: 7001
 
-# Address to bind to and tell other Cassandra nodes to connect to. You
-# _must_ change this if you want multiple nodes to be able to
-# communicate!
-# 
+# Address or interface to bind to and tell other Cassandra nodes to connect to.
+# You _must_ change this if you want multiple nodes to be able to communicate!
+#
+# Set listen_address OR listen_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
+#
 # Leaving it blank leaves it up to InetAddress.getLocalHost(). This
 # will always do the Right Thing _if_ the node is properly configured
 # (hostname, name resolution, etc), and the Right Thing is to use the
 # address associated with the hostname (it might not be).
 #
-# Setting this to 0.0.0.0 is always wrong.
+# Setting listen_address to 0.0.0.0 is always wrong.
 listen_address: localhost
+# listen_interface: eth0
 
 # Address to broadcast to other Cassandra nodes
 # Leaving this blank will set it to the same value as listen_address
@@ -324,19 +391,29 @@
 # Whether to start the thrift rpc server.
 start_rpc: true
 
-# The address to bind the Thrift RPC service and native transport
-# server -- clients connect here.
+# The address or interface to bind the Thrift RPC service and native transport
+# server to.
 #
-# Leaving this blank has the same effect it does for ListenAddress,
+# Set rpc_address OR rpc_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
+#
+# Leaving rpc_address blank has the same effect as on listen_address
 # (i.e. it will be based on the configured hostname of the node).
 #
-# Note that unlike ListenAddress above, it is allowed to specify 0.0.0.0
-# here if you want to listen on all interfaces, but that will break clients 
-# that rely on node auto-discovery.
+# Note that unlike listen_address, you can specify 0.0.0.0, but you must also
+# set broadcast_rpc_address to a value other than 0.0.0.0.
 rpc_address: localhost
+# rpc_interface: eth1
+
 # port for Thrift to listen for clients on
 rpc_port: 9160
 
+# RPC address to broadcast to drivers and other Cassandra nodes. This cannot
+# be set to 0.0.0.0. If left blank, this will be set to the value of
+# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
+# be set.
+# broadcast_rpc_address: 1.2.3.4
+
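The new listen_interface and rpc_interface options (see the hunks above) must each map to exactly one address, and an rpc_address of 0.0.0.0 now requires broadcast_rpc_address. A quick sanity check before relying on interface-based binding (interface name assumed):

    ip -4 addr show dev eth0    # should report a single inet entry; IP aliasing is not supported
    # if rpc_address is 0.0.0.0, remember to set broadcast_rpc_address to a routable address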
 # enable or disable keepalive on rpc/native connections
 rpc_keepalive: true
 
@@ -438,11 +515,6 @@
 # Caution should be taken on increasing the size of this threshold as it can lead to node instability.
 batch_size_warn_threshold_in_kb: 5
 
-# Size limit for rows being compacted in memory.  Larger rows will spill
-# over to disk and use a slower two-pass compaction process.  A message
-# will be logged specifying the row key.
-in_memory_compaction_limit_in_mb: 64
-
 # Number of simultaneous compactions to allow, NOT including
 # validation "compactions" for anti-entropy repair.  Simultaneous
 # compactions can help preserve read performance in a mixed read/write
@@ -452,17 +524,13 @@
 # slowly or too fast, you should look at
 # compaction_throughput_mb_per_sec first.
 #
-# concurrent_compactors defaults to the number of cores.
-# Uncomment to make compaction mono-threaded, the pre-0.8 default.
+# concurrent_compactors defaults to the smaller of (number of disks,
+# number of cores), with a minimum of 2 and a maximum of 8.
+# 
+# If your data directories are backed by SSD, you should increase this
+# to the number of cores.
 #concurrent_compactors: 1
 
-# Multi-threaded compaction. When enabled, each compaction will use
-# up to one thread per core, plus one thread per sstable being merged.
-# This is usually only useful for SSD-based hardware: otherwise, 
-# your concern is usually to get compaction to do LESS i/o (see:
-# compaction_throughput_mb_per_sec), not more.
-multithreaded_compaction: false
-
 # Throttles compaction to the given total throughput across the entire
 # system. The faster you insert data, the faster you need to compact in
 # order to keep the sstable count down, but in general, setting this to
@@ -471,10 +539,11 @@
 # of compaction, including validation compaction.
 compaction_throughput_mb_per_sec: 16
 
-# Track cached row keys during compaction, and re-cache their new
-# positions in the compacted sstable.  Disable if you use really large
-# key caches.
-compaction_preheat_key_cache: true
+# When compacting, the replacement sstable(s) can be opened before they
+# are completely written, and used in place of the prior sstables for
+# any range that has been written. This helps to smoothly transfer reads 
+# between the sstables, reducing page cache churn and keeping hot rows hot
+sstable_preemptive_open_interval_in_mb: 50
 
 # Throttles all outbound streaming file transfers on this node to the
 # given total throughput in Mbps. This is necessary because Cassandra does
@@ -495,6 +564,8 @@
 range_request_timeout_in_ms: 10000
 # How long the coordinator should wait for writes to complete
 write_request_timeout_in_ms: 2000
+# How long the coordinator should wait for counter writes to complete
+counter_write_request_timeout_in_ms: 5000
 # How long a coordinator should continue to retry a CAS operation
 # that contends with other proposals for the same row
 cas_contention_timeout_in_ms: 1000
@@ -688,9 +759,3 @@
 # reducing overhead from the TCP protocol itself, at the cost of increasing
 # latency if you block for cross-datacenter responses.
 inter_dc_tcp_nodelay: false
-
-# Enable or disable kernel page cache preheating from contents of the key cache after compaction.
-# When enabled it would preheat only first "page" (4KB) of each row to optimize
-# for sequential access. Note: This could be harmful for fat rows, see CASSANDRA-4937
-# for further details on that topic.
-preheat_kernel_page_cache: false
diff --git a/conf/cqlshrc.sample b/conf/cqlshrc.sample
index 365defd..6558ad2 100644
--- a/conf/cqlshrc.sample
+++ b/conf/cqlshrc.sample
@@ -26,18 +26,23 @@
 completekey = tab
 
 [cql]
-version = 3.0
+version = 3.1.5
 
 [connection]
 hostname = 127.0.0.1
-port = 9160
-; enable below for ssl
-;factory = cqlshlib.ssl.ssl_transport_factory
+port = 9042
+
+[tracing]
+max_trace_wait = 10.0
 
 ;[ssl]
 ;certfile = ~/keys/cassandra.cert
 ;; optional - true by default.
 ;validate = true
+;; to be provided when require_client_auth=true
+;userkey = ~/key.pem
+;; to be provided when require_client_auth=true
+;usercert = ~/cert.pem
 
 ;; optional section, overrides default certfile in [ssl] section, if present
 ;[certfiles]
diff --git a/conf/log4j-server.properties b/conf/log4j-server.properties
deleted file mode 100644
index 086306e..0000000
--- a/conf/log4j-server.properties
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# for production, you should probably set pattern to %c instead of %l.  
-# (%l is slower.)
-
-# output messages into a rolling log file as well as stdout
-log4j.rootLogger=INFO,stdout,R
-
-# stdout
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %m%n
-
-# rolling log file
-log4j.appender.R=org.apache.log4j.RollingFileAppender
-log4j.appender.R.maxFileSize=20MB
-log4j.appender.R.maxBackupIndex=50
-log4j.appender.R.layout=org.apache.log4j.PatternLayout
-log4j.appender.R.layout.ConversionPattern=%5p [%t] %d{ISO8601} %F (line %L) %m%n
-# Edit the next line to point to your logs directory
-log4j.appender.R.File=/var/log/cassandra/system.log
-
-# Application logging options
-#log4j.logger.org.apache.cassandra=DEBUG
-#log4j.logger.org.apache.cassandra.db=DEBUG
-#log4j.logger.org.apache.cassandra.service.StorageProxy=DEBUG
-
-# Adding this to avoid thrift logging disconnect errors.
-log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR
-
diff --git a/conf/log4j-tools.properties b/conf/log4j-tools.properties
deleted file mode 100644
index a8f4d9f..0000000
--- a/conf/log4j-tools.properties
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# for production, you should probably set the root to INFO
-# and the pattern to %c instead of %l.  (%l is slower.)
-
-# output messages into a rolling log file as well as stdout
-log4j.rootLogger=WARN,stderr
-
-# stderr
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.target=System.err
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-log4j.appender.stderr.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %m%n
diff --git a/conf/logback-tools.xml b/conf/logback-tools.xml
new file mode 100644
index 0000000..ade6c12
--- /dev/null
+++ b/conf/logback-tools.xml
@@ -0,0 +1,33 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<configuration>
+  <appender name="STDERR" target="System.err" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder>
+      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
+    </encoder>
+    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+      <level>WARN</level>
+    </filter>
+  </appender>
+
+  <root level="WARN">
+    <appender-ref ref="STDERR" />
+  </root>
+</configuration>
diff --git a/conf/logback.xml b/conf/logback.xml
new file mode 100644
index 0000000..e170d41
--- /dev/null
+++ b/conf/logback.xml
@@ -0,0 +1,53 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<configuration scan="true">
+  <jmxConfigurator />
+  <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+    <file>${cassandra.logdir}/system.log</file>
+    <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
+      <fileNamePattern>${cassandra.logdir}/system.log.%i.zip</fileNamePattern>
+      <minIndex>1</minIndex>
+      <maxIndex>20</maxIndex>
+    </rollingPolicy>
+
+    <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
+      <maxFileSize>20MB</maxFileSize>
+    </triggeringPolicy>
+    <encoder>
+      <pattern>%-5level [%thread] %date{ISO8601} %F:%L - %msg%n</pattern>
+      <!-- old-style log format
+      <pattern>%5level [%thread] %date{ISO8601} %F (line %L) %msg%n</pattern>
+      -->
+    </encoder>
+  </appender>
+  
+  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder>
+      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
+    </encoder>
+  </appender>
+        
+  <root level="INFO">
+    <appender-ref ref="FILE" />
+    <appender-ref ref="STDOUT" />
+  </root>
+  
+  <logger name="com.thinkaurelius.thrift" level="ERROR"/>
+</configuration>
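With logback replacing log4j, the per-logger levels that used to live in log4j-server.properties are configured above, and scan="true" means edits to logback.xml are picked up without a restart. Levels can also be adjusted at runtime through the nodetool verbs listed in the completion script below, for example:

    nodetool setlogginglevel org.apache.cassandra DEBUG   # raise verbosity for the Cassandra packages
    nodetool getlogginglevels                             # show the currently configured loggers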
diff --git a/debian/README.Debian b/debian/README.Debian
index 9d7ea40..0a54d2b 100644
--- a/debian/README.Debian
+++ b/debian/README.Debian
@@ -2,7 +2,8 @@
 ====================
 
 This package is not a part of Debian, (and there are no immediate plans
-to have it added). Bugs should be sent to eevans@apache.org, *not* filed
-in the Debian BTS.
+to have it added). Bugs should be sent to the Apache Cassandra JIRA, *not*
+filed in the Debian BTS.
 
- -- Eric Evans <eevans@apache.org>  Sun, 26 Jul 2009 14:35:11 -0500
+  https://issues.apache.org/jira/browse/CASSANDRA
+
diff --git a/debian/cassandra-sysctl.conf b/debian/cassandra-sysctl.conf
index 2173765..443e83f 100644
--- a/debian/cassandra-sysctl.conf
+++ b/debian/cassandra-sysctl.conf
@@ -1 +1,2 @@
 vm.max_map_count = 1048575
+net.ipv4.tcp_keepalive_time=300
diff --git a/debian/cassandra-tools.install b/debian/cassandra-tools.install
new file mode 100644
index 0000000..6df21f3
--- /dev/null
+++ b/debian/cassandra-tools.install
@@ -0,0 +1,7 @@
+tools/bin/json2sstable usr/bin
+tools/bin/sstable2json usr/bin
+tools/bin/sstablelevelreset usr/bin
+tools/bin/sstablemetadata usr/bin
+tools/bin/sstablerepairedset usr/bin
+tools/bin/sstablesplit usr/bin
+tools/bin/token-generator usr/bin
diff --git a/debian/cassandra.bash-completion b/debian/cassandra.bash-completion
new file mode 100644
index 0000000..4847889
--- /dev/null
+++ b/debian/cassandra.bash-completion
@@ -0,0 +1 @@
+debian/nodetool-completion nodetool
diff --git a/debian/cassandra.in.sh b/debian/cassandra.in.sh
index 4da19d8..efb7abb 100644
--- a/debian/cassandra.in.sh
+++ b/debian/cassandra.in.sh
@@ -26,5 +26,5 @@
 if [ "$JVM_VENDOR" != "OpenJDK" -o "$JVM_VERSION" \> "1.6.0" ] \
       || [ "$JVM_VERSION" = "1.6.0" -a "$JVM_PATCH_VERSION" -ge 23 ]
 then
-    JAVA_AGENT="$JAVA_AGENT -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.5.jar"
+    JAVA_AGENT="$JAVA_AGENT -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.6.jar"
 fi
diff --git a/debian/cassandra.install b/debian/cassandra.install
index 4f3115b..a4654d1 100644
--- a/debian/cassandra.install
+++ b/debian/cassandra.install
@@ -1,11 +1,11 @@
 conf/cassandra-topology.yaml etc/cassandra
-conf/log4j-server.properties etc/cassandra
 conf/cassandra.yaml etc/cassandra
 conf/cassandra-env.sh etc/cassandra
 conf/cassandra-rackdc.properties etc/cassandra
 conf/commitlog_archiving.properties etc/cassandra
 conf/cassandra-topology.properties etc/cassandra
-conf/log4j-tools.properties etc/cassandra
+conf/logback.xml etc/cassandra
+conf/logback-tools.xml etc/cassandra
 conf/triggers/* etc/cassandra/triggers
 debian/cassandra.in.sh usr/share/cassandra
 debian/cassandra.conf etc/security/limits.d
@@ -13,18 +13,12 @@
 bin/cassandra usr/sbin
 bin/cassandra-cli usr/bin
 bin/nodetool usr/bin
-bin/json2sstable usr/bin
-bin/sstable2json usr/bin
 bin/sstablekeys usr/bin
 bin/sstableloader usr/bin
 bin/cqlsh usr/bin
 bin/sstablescrub usr/bin
 bin/sstableupgrade usr/bin
-bin/sstablesplit usr/bin
 tools/bin/cassandra-stress usr/bin
-tools/bin/token-generator usr/bin
-tools/bin/sstablelevelreset usr/bin
-tools/bin/sstablemetadata usr/bin
 lib/*.jar usr/share/cassandra/lib
 lib/*.zip usr/share/cassandra/lib
 lib/licenses usr/share/doc/cassandra
diff --git a/debian/changelog b/debian/changelog
index e0b1eae..f2ecceb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,26 +1,62 @@
-cassandra (2.0.10) unstable; urgency=medium
+cassandra (2.1.0) unstable; urgency=medium
 
   * New release
 
- -- Sylvain Lebresne <slebresne@apache.org>  Fri, 08 Aug 2014 10:50:44 +0200
+ -- Sylvain Lebresne <slebresne@apache.org>  Sun, 07 Sep 2014 15:21:41 +0200
 
-cassandra (2.0.9) unstable; urgency=medium
+cassandra (2.1.0~rc7) unstable; urgency=medium
 
-  * New release
+  * New RC release
 
- -- Sylvain Lebresne <slebresne@apache.org>  Thu, 26 Jun 2014 10:30:22 +0200
+ -- Sylvain Lebresne <slebresne@apache.org>  Thu, 28 Aug 2014 16:32:12 +0200
 
-cassandra (2.0.8) unstable; urgency=medium
+cassandra (2.1.0~rc6) unstable; urgency=medium
 
-  * New release
+  * New RC release
 
- -- Sylvain Lebresne <slebresne@apache.org>  Tue, 06 May 2014 10:28:46 +0200
+ -- Sylvain Lebresne <slebresne@apache.org>  Sat, 09 Aug 2014 13:46:39 +0200
 
-cassandra (2.0.7) unstable; urgency=low
+cassandra (2.1.0~rc5) unstable; urgency=medium
 
-  * New release
+  * New RC release
 
- -- Sylvain Lebresne <slebresne@apache.org>  Mon, 14 Apr 2014 16:42:09 +0200
+ -- Sylvain Lebresne <slebresne@apache.org>  Sat, 02 Aug 2014 13:45:54 +0200
+
+cassandra (2.1.0~rc4) unstable; urgency=medium
+
+  * New RC release
+
+ -- Eric Evans <eevans@apache.org>  Fri, 18 Jul 2014 13:40:48 -0500
+
+cassandra (2.1.0~rc3) unstable; urgency=medium
+
+  * New RC release
+
+ -- Sylvain Lebresne <slebresne@apache.org>  Tue, 08 Jul 2014 14:04:10 +0200
+
+cassandra (2.1.0~rc2) unstable; urgency=medium
+
+  * New RC release
+
+ -- Sylvain Lebresne <slebresne@apache.org>  Mon, 23 Jun 2014 18:14:29 +0200
+
+cassandra (2.1.0~rc1) unstable; urgency=medium
+
+  * New RC release
+
+ -- Sylvain Lebresne <slebresne@apache.org>  Fri, 30 May 2014 17:25:14 +0200
+
+cassandra (2.1.0~beta2) unstable; urgency=medium
+
+  * New beta release
+
+ -- Sylvain Lebresne <slebresne@apache.org>  Thu, 01 May 2014 16:39:21 +0200
+
+cassandra (2.1.0~beta1) unstable; urgency=low
+
+  * New beta release
+
+ -- Sylvain Lebresne <slebresne@apache.org>  Mon, 17 Feb 2014 16:50:33 +0100
 
 cassandra (2.0.6) unstable; urgency=low
 
diff --git a/debian/control b/debian/control
index 2695e8d..a48441b 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@
 Priority: extra
 Maintainer: Eric Evans <eevans@apache.org>
 Uploaders: Sylvain Lebresne <slebresne@apache.org>
-Build-Depends: debhelper (>= 5), openjdk-7-jdk | java7-jdk, ant (>= 1.7), ant-optional (>= 1.7), python-support
+Build-Depends: debhelper (>= 5), openjdk-7-jdk | java7-jdk, ant (>= 1.7), ant-optional (>= 1.7), python-support, dpatch, bash-completion
 Homepage: http://cassandra.apache.org
 Vcs-Git: http://git-wip-us.apache.org/repos/asf/cassandra.git
 Vcs-Browser: https://git-wip-us.apache.org/repos/asf?p=cassandra.git
@@ -11,10 +11,20 @@
 
 Package: cassandra
 Architecture: all
-Depends: openjdk-7-jre-headless | java7-runtime, adduser, libjna-java, python (>= 2.5), python-support (>= 0.90.0), ${misc:Depends}
+Depends: openjdk-7-jre-headless | java7-runtime, adduser, python (>= 2.5), python-support (>= 0.90.0), ${misc:Depends}
 Recommends: ntp | time-daemon
+Suggests: cassandra-tools
 Conflicts: apache-cassandra1
 Replaces: apache-cassandra1
 Description: distributed storage system for structured data
  Cassandra is a distributed (peer-to-peer) system for the management
  and storage of structured data.
+
+Package: cassandra-tools
+Architecture: all
+Depends: cassandra (= ${binary:Version}), ${misc:Depends}
+Description: distributed storage system for structured data
+ Cassandra is a distributed (peer-to-peer) system for the management
+ and storage of structured data.
+ .
+ This package contains extra tools for working with Cassandra clusters.
diff --git a/debian/init b/debian/init
index 145fb15..56f2523 100644
--- a/debian/init
+++ b/debian/init
@@ -38,9 +38,6 @@
     exit 3
 fi
 
-# Add JNA to EXTRA_CLASSPATH
-export EXTRA_CLASSPATH="/usr/share/java/jna.jar:$EXTRA_CLASSPATH"
-
 export JVM_OPTS
 
 # Export JAVA_HOME, if set.
diff --git a/debian/nodetool-completion b/debian/nodetool-completion
new file mode 100644
index 0000000..7dc35de
--- /dev/null
+++ b/debian/nodetool-completion
@@ -0,0 +1,224 @@
+have nodetool && have cqlsh &&
+{
+
+    show_keyspaces()
+    {
+        local ks=$(get_keyspaces)
+        COMPREPLY=( $(compgen -W "$ks" -- "$1") )
+    }
+
+    get_keyspaces()
+    {
+        [ -z "$keyspaces" ] && keyspaces=$(echo "DESCRIBE KEYSPACES" | cqlsh | egrep -v '^$')
+        echo $keyspaces
+    }
+
+    show_datacenters()
+    {
+        cur=$1
+        set|grep -q ^dcs || dcs=$(echo "select data_center from system.peers;"|cqlsh |tail -n +4|sort|uniq|awk '{if(length($1)>1) print $1}'|xargs)
+        COMPREPLY=( $(compgen -W "$dcs" -- "$cur") )
+    }
+
+    show_cfs()
+    {
+        local cur prev cfs
+        prev=$1
+        cur=$2
+        cfs=$(get_cfs $1 $2)
+        COMPREPLY=( $(compgen -W "$cfs" -- "$cur") )
+    }
+
+    get_cfs()
+    {
+        local prev
+        prev=$1
+        [ -z "${cf[$prev]}" ] && cf[$prev]=$(echo "DESCRIBE COLUMNFAMILIES" | cqlsh -k ${prev} | egrep -v '^$')
+        echo ${cf[$prev]}
+    }
+
+    show_last_cfs()
+    {
+        local cur cfs re
+        cur=$1
+        re=$(echo ${COMP_WORDS[@]:3:$(($COMP_CWORD - 3))} | sed -e 's/ /\\|/g')
+        cfs=$(get_cfs ${COMP_WORDS[2]} | sed -e "s/$re//g")
+        COMPREPLY=( $(compgen -W "$cfs" -- "${cur}") )
+    }
+
+    _nodetool()
+    {
+        local cur prev ks
+        COMPREPLY=()
+        _get_comp_words_by_ref cur prev
+
+        local shopt='
+            cfstats
+            compactionstats
+            compactionhistory
+            decommission
+            describecluster
+            disablebackup
+            disablebinary
+            disablegossip
+            disablehandoff
+            disablethrift
+            drain
+            enablebackup
+            enablebinary
+            enablegossip
+            enablehandoff
+            enablethrift
+            getcompactionthroughput
+            getlogginglevels
+            getstreamthroughput
+            gossipinfo
+            help
+            invalidatecountercache
+            invalidatekeycache
+            invalidaterowcache
+            join
+            listsnapshots
+            pausehandoff
+            proxyhistograms
+            rangekeysample
+            reloadtriggers
+            resetlocalschema
+            resumehandoff
+            ring
+            setlogginglevel
+            status
+            statusbinary
+            statusthrift
+            stopdaemon
+            tpstats
+            version
+            '
+
+        local lngopt='
+            cfhistograms
+            cleanup
+            clearsnapshot
+            compact
+            describering
+            disableautocompaction
+            enableautocompaction
+            flush
+            getcompactionthreshold
+            getendpoints
+            getsstables
+            info
+            move
+            netstats
+            rebuild
+            rebuild_index
+            refresh
+            removenode
+            repair
+            scrub
+            setcachecapacity
+            setcachekeystosave
+            setcompactionthreshold
+            setcompactionthroughput
+            setstreamthroughput
+            settraceprobability
+            snapshot
+            stop
+            taketoken
+            truncatehints
+            upgradesstables
+            '
+
+        local optwks='
+            cfhistograms
+            cleanup
+            clearsnapshot
+            compact
+            describering
+            flush
+            getcompactionthreshold
+            getendpoints
+            getsstables
+            rebuild_index
+            refresh
+            repair
+            scrub
+            setcompactionthreshold
+            snapshot
+            '
+
+        local optwcfs='
+            cleanup
+            compact
+            disableautocompaction
+            enableautocompaction
+            flush
+            repair
+            scrub
+            upgradesstables
+            '
+
+        if [[ $COMP_CWORD -eq 1 ]] ; then
+            COMPREPLY=( $(compgen -W "${lngopt} ${shopt}" -- "${cur}") )
+        elif [[ $(echo "${lngopt}"|egrep -c "\b${prev}\b") -gt 0 ]] ; then
+            if echo $optwks|grep -q "\b$prev\b" ; then
+                show_keyspaces "${cur}"
+            else
+                case "${prev}" in
+                    removenode)
+                        # we don't want to lose time using nodetool status a 2nd time
+                        # in case of force or status
+                        if [[ "${cur}" =~ ^(f|s) ]] ; then
+                            COMPREPLY=( $(compgen -W "status force" -- "${cur}") )
+                        else
+                            [ -z "$IDS" ] && IDS=$(nodetool status|grep %|awk '{print $7}'|xargs)
+                            COMPREPLY=( $(compgen -W "status force $IDS" -- "${cur}") )
+                        fi
+                        return 0
+                        ;;
+                    stop)
+                        COMPREPLY=( $(compgen -W "COMPACTION VALIDATION CLEANUP SCRUB INDEX_BUILD" -- "${cur}") )
+                        return 0
+                        ;;
+                    info)
+                        COMPREPLY=( $(compgen -W "-T --tokens" -- "${cur}") )
+                        return 0
+                        ;;
+                    rebuild)
+                        show_datacenters "${cur}"
+                        return 0
+                        ;;
+                    upgradesstables)
+                        ks=$(get_keyspaces)
+                        COMPREPLY=( $(compgen -W "-a --include-all-sstables $ks" -- "${cur}") )
+                        return 0
+                        ;;
+                esac
+            fi
+        elif [[ $COMP_CWORD -eq 3 ]] ; then
+            case "${COMP_WORDS[1]}" in
+                cfhistograms|cleanup|compact|flush|getcompactionthreshold|getendpoints|getsstables|rebuild_index|refresh|repair|scrub|setcompactionthreshold)
+                    show_cfs ${prev} ${cur}
+                    return 0
+                    ;;
+                upgradesstables)
+                    if [[ ! ${prev} == -* ]]; then
+                        show_cfs ${prev} ${cur}
+                    fi
+                    return 0
+                    ;;
+                snapshot)
+                    COMPREPLY=( $(compgen -W "-cf" -- "${cur}") )
+                    return 0
+                    ;;
+            esac
+        elif [[ "${optwcfs}" == *${COMP_WORDS[1]}* ]] ; then
+            show_last_cfs ${cur}
+        elif [[ $COMP_CWORD -eq 4 && ${COMP_WORDS[1]} == "snapshot" ]] ; then
+            show_cfs ${COMP_WORDS[2]} ${cur}
+        elif [[ $COMP_CWORD -eq 5 && ${COMP_WORDS[1]} == "snapshot" ]] ; then
+            COMPREPLY=( $(compgen -W "-t" -- "${cur}") )
+        fi
+    }
+    complete -F _nodetool nodetool
+}
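A rough usage sketch once the package has installed this snippet (install path assumed; bash-completion normally sources it automatically):

    . /etc/bash_completion.d/nodetool   # assumed install path
    # interactively:
    #   nodetool <TAB>              -> subcommands (repair, status, cfstats, ...)
    #   nodetool repair <TAB>       -> keyspace names, fetched via cqlsh
    #   nodetool repair ks1 <TAB>   -> table names in that keyspace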
diff --git a/debian/patches/001cassandra_yaml_dirs.dpatch b/debian/patches/001cassandra_yaml_dirs.dpatch
new file mode 100644
index 0000000..3d545e5
--- /dev/null
+++ b/debian/patches/001cassandra_yaml_dirs.dpatch
@@ -0,0 +1,36 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 001cassandra_yaml_dirs.dpatch by Tyler Hobbs <tyler@datastax.com>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: No description.
+
+@DPATCH@
+diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' cassandra~/conf/cassandra.yaml cassandra/conf/cassandra.yaml
+--- cassandra~/conf/cassandra.yaml	2014-06-05 13:36:22.000000000 -0500
++++ cassandra/conf/cassandra.yaml	2014-06-05 13:39:20.569034040 -0500
+@@ -94,13 +94,13 @@
+ # will spread data evenly across them, subject to the granularity of
+ # the configured compaction strategy.
+ # If not set, the default directory is $CASSANDRA_HOME/data/data.
+-# data_file_directories:
+-#     - /var/lib/cassandra/data
++data_file_directories:
++    - /var/lib/cassandra/data
+ 
+ # commit log.  when running on magnetic HDD, this should be a
+ # separate spindle than the data directories.
+ # If not set, the default directory is $CASSANDRA_HOME/data/commitlog.
+-# commitlog_directory: /var/lib/cassandra/commitlog
++commitlog_directory: /var/lib/cassandra/commitlog
+ 
+ # policy for data disk failures:
+ # stop_paranoid: shut down gossip and Thrift even for single-sstable errors.
+@@ -203,7 +203,7 @@
+ 
+ # saved caches
+ # If not set, the default directory is $CASSANDRA_HOME/data/saved_caches.
+-# saved_caches_directory: /var/lib/cassandra/saved_caches
++saved_caches_directory: /var/lib/cassandra/saved_caches
+ 
+ # commitlog_sync may be either "periodic" or "batch." 
+ # When in batch mode, Cassandra won't ack writes until the commit log
diff --git a/debian/patches/002cassandra_logdir_fix.dpatch b/debian/patches/002cassandra_logdir_fix.dpatch
new file mode 100644
index 0000000..8836eb4
--- /dev/null
+++ b/debian/patches/002cassandra_logdir_fix.dpatch
@@ -0,0 +1,19 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## cassandra_logdir_fix.dpatch by Michael Shuler <michael@pbandjelly.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: No description.
+
+@DPATCH@
+diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' cassandra~/bin/cassandra cassandra/bin/cassandra
+--- cassandra~/bin/cassandra	2014-09-15 19:42:28.000000000 -0500
++++ cassandra/bin/cassandra	2014-09-15 21:15:15.627505503 -0500
+@@ -134,7 +134,7 @@
+     props="$3"
+     class="$4"
+     cassandra_parms="-Dlogback.configurationFile=logback.xml"
+-    cassandra_parms="$cassandra_parms -Dcassandra.logdir=$CASSANDRA_HOME/logs"
++    cassandra_parms="$cassandra_parms -Dcassandra.logdir=/var/log/cassandra"
+     cassandra_parms="$cassandra_parms -Dcassandra.storagedir=$cassandra_storagedir"
+ 
+     if [ "x$pidpath" != "x" ]; then
diff --git a/debian/patches/00list b/debian/patches/00list
new file mode 100644
index 0000000..59b0d8b
--- /dev/null
+++ b/debian/patches/00list
@@ -0,0 +1,2 @@
+001cassandra_yaml_dirs.dpatch
+002cassandra_logdir_fix.dpatch
diff --git a/debian/rules b/debian/rules
index de5583f..405175d 100755
--- a/debian/rules
+++ b/debian/rules
@@ -3,6 +3,8 @@
 # Uncomment to enable verbose mode.
 #export DH_VERBOSE=1
 
+include /usr/share/dpatch/dpatch.make
+
 ANT = /usr/bin/ant
 VERSION = $(shell dpkg-parsechangelog | sed -ne 's/^Version: \([^-|~|+]*\).*/\1/p')
 
@@ -10,7 +12,7 @@
 	dh_testdir
 	$(ANT) test
 
-clean:
+clean: unpatch
 	dh_testdir
 	dh_testroot
 	$(ANT) realclean
@@ -22,7 +24,7 @@
 	dh_clean
 
 build: build-stamp
-build-stamp:
+build-stamp: patch-stamp
 	dh_testdir
 	printf "version=%s" $(VERSION) > build.properties
 
@@ -45,9 +47,11 @@
 	dh_install build/apache-cassandra-thrift-$(VERSION).jar \
 		usr/share/cassandra
 
-	# Copy stress jar
+	# Copy stress jars
 	dh_install build/tools/lib/stress.jar \
 		usr/share/cassandra
+	dh_install tools/lib/*.jar \
+		usr/share/cassandra
 
 	dh_link usr/share/cassandra/apache-cassandra-$(VERSION).jar \
 		usr/share/cassandra/apache-cassandra.jar
@@ -61,6 +65,8 @@
 	dh_installchangelogs
 	dh_installinit -u'start 50 2 3 4 5 . stop 50 0 1 6 .'
 	dh_installdocs README.asc CHANGES.txt NEWS.txt
+	dh_installexamples tools/*.yaml
+	dh_bash-completion
 	dh_compress
 	dh_fixperms
 	dh_installdeb
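With dpatch wired into debian/rules, build-stamp now depends on patch-stamp and clean unpatches, so 001cassandra_yaml_dirs and 002cassandra_logdir_fix are applied automatically before ant runs. A typical build from the unpacked source would look something like:

    dpkg-buildpackage -us -uc -b    # applies the dpatch series, runs ant, then assembles the debs
    fakeroot debian/rules clean     # also reverts the patches, restoring the pristine tree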
diff --git a/doc/cql/CQL.textile b/doc/cql/CQL.textile
index a874a29..519f8d2 100644
--- a/doc/cql/CQL.textile
+++ b/doc/cql/CQL.textile
@@ -550,7 +550,6 @@
 |default_validation|text|Determines the default storage type of column values (which itself determines the validation for column values). This option does not affect the types of columns which were defined in a @CREATE COLUMNFAMILY@ statement-- only new columns. Valid values are listed in the "Data Storage Types":#storageTypes table above.|
 |min_compaction_threshold|4|Minimum number of SSTables needed to start a minor compaction.|
 |max_compaction_threshold|32|Maximum number of SSTables allowed before a minor compaction is forced.|
-|replicate_on_write|false| |
 |compaction_strategy_options|none|CompactionStrategy specific options such as "sstable_size_in_mb" for LeveledCompactionStrategy and "min_sstable_size" for SizeTieredCompactionStrategy|
 |compression_parameters|none|Compression parameters such as "sstable_compressor" and "chunk_length_kb"|
 
diff --git a/doc/cql3/CQL.textile b/doc/cql3/CQL.textile
index f53448e..3b322d5 100644
--- a/doc/cql3/CQL.textile
+++ b/doc/cql3/CQL.textile
@@ -1,6 +1,6 @@
 <link rel="StyleSheet" href="CQL.css" type="text/css" media="screen">
 
-h1. Cassandra Query Language (CQL) v3.1.7
+h1. Cassandra Query Language (CQL) v3.2.0
 
 
  <span id="tableOfContents">
@@ -216,7 +216,7 @@
 
 bc(syntax).. 
 <create-table-stmt> ::= CREATE ( TABLE | COLUMNFAMILY ) ( IF NOT EXISTS )? <tablename>
-                          '(' <definition> ( ',' <definition> )* ')'
+                          '(' <column-definition> ( ',' <column-definition> )* ')'
                           ( WITH <option> ( AND <option>)* )?
 
 <column-definition> ::= <identifier> <type> ( STATIC )? ( PRIMARY KEY )?
@@ -255,7 +255,7 @@
 
 h4(#createTableName). @<tablename>@
 
-Valid table names are the same than valid "keyspace names":#createKeyspaceStmt (up to 32 characters long alphanumerical identifiers). If the table name is provided alone, the table is created within the current keyspace (see <a href="#useStmt"><tt>USE</tt></a>), but if it is prefixed by an existing keyspace name (see "@<tablename>@":#statements grammar), it is created in the specified keyspace (but does *not* change the current keyspace).
+Valid table names are the same as valid "keyspace names":#createKeyspaceStmt (up to 32 characters long alphanumerical identifiers). If the table name is provided alone, the table is created within the current keyspace (see <a href="#useStmt"><tt>USE</tt></a>), but if it is prefixed by an existing keyspace name (see "@<tablename>@":#statements grammar), it is created in the specified keyspace (but does *not* change the current keyspace).
 
 
 h4(#createTableColumn). @<column-definition>@
@@ -327,7 +327,6 @@
 |@bloom_filter_fp_chance@     | _simple_ | 0.00075     | The target probability of false positive of the sstable bloom filters. Said bloom filters will be sized to provide the provided probability (thus lowering this value impact the size of bloom filters in-memory and on-disk)|
 |@compaction@                 | _map_    | _see below_ | The compaction options to use, see below.|
 |@compression@                | _map_    | _see below_ | Compression options, see below. |
-|@replicate_on_write@         | _simple_ | true        | Whether to replicate data on write. This can only be set to false for tables with counters values. Disabling this is dangerous and can result in random lose of counters, don't disable unless you are sure to know what you are doing|
 |@caching@                    | _simple_ | keys_only   | Whether to cache keys ("key cache") and/or rows ("row cache") for this table. Valid values are: @all@, @keys_only@, @rows_only@ and @none@. |
 
 
@@ -386,7 +385,7 @@
 WITH comment = 'A most excellent and useful column family'
  AND read_repair_chance = 0.2;
 p. 
-The @ALTER@ statement is used to manipulate table definitions. It allows to add new columns, drop existing ones, change the type of existing columns, or update the table options. As for table creation, @ALTER COLUMNFAMILY@ is allowed as an alias for @ALTER TABLE@.
+The @ALTER@ statement is used to manipulate table definitions. It allows for adding new columns, dropping existing ones, changing the type of existing columns, or updating the table options. As with table creation, @ALTER COLUMNFAMILY@ is allowed as an alias for @ALTER TABLE@.
 
 The @<tablename>@ is the table name optionally preceded by the keyspace name.  The @<instruction>@ defines the alteration to perform:
 * @ALTER@: Update the type of a given defined column. Note that the type of the "clustering columns":#createTablepartitionClustering cannot be modified as it induces the on-disk ordering of rows. Columns on which a "secondary index":#createIndexStmt is defined have the same restriction. Other columns are free from those restrictions (no validation of existing data is performed), but it is usually a bad idea to change the type to a non-compatible one, unless no data have been inserted for that column yet, as this could confuse CQL drivers/tools.
@@ -406,7 +405,7 @@
 
 The @DROP TABLE@ statement results in the immediate, irreversible removal of a table, including all data contained in it. As for table creation, @DROP COLUMNFAMILY@ is allowed as an alias for @DROP TABLE@.
 
-If the table does not exists, the statement will return an error, unless @IF EXISTS@ is used in which case the operation is a no-op.
+If the table does not exist, the statement will return an error, unless @IF EXISTS@ is used in which case the operation is a no-op.
 
 h3(#truncateStmt). TRUNCATE
 
@@ -425,14 +424,20 @@
 
 __Syntax:__
 
-bc(syntax). <create-index-stmt> ::= CREATE ( CUSTOM )? INDEX ( IF NOT EXISTS )? <identifier>? ON <tablename> '(' <identifier> ')'
-                                        ( USING <string> ( WITH OPTIONS = <map-literal> )? )?
+bc(syntax).. 
+<create-index-stmt> ::= CREATE ( CUSTOM )? INDEX ( IF NOT EXISTS )? ( <indexname> )?
+                            ON <tablename> '(' <index-identifier> ')'
+                            ( USING <string> ( WITH OPTIONS = <map-literal> )? )?
 
+<index-identifier> ::= <identifier>
+                     | keys( <identifier> )
+p. 
 __Sample:__
 
 bc(sample). 
 CREATE INDEX userIndex ON NerdMovies (user);
 CREATE INDEX ON Mutants (abilityId);
+CREATE INDEX ON users (keys(favs));
 CREATE CUSTOM INDEX ON users (email) USING 'path.to.the.IndexClass';
 CREATE CUSTOM INDEX ON users (email) USING 'path.to.the.IndexClass' WITH OPTIONS = {'storage': '/mnt/ssd/indexes/'};
 
@@ -440,20 +445,134 @@
 
 Attempting to create an already existing index will return an error unless the @IF NOT EXISTS@ option is used. If it is used, the statement will be a no-op if the index already exists.
 
+h4(#keysIndex). Indexes on Map Keys
+
+When creating an index on a "map column":#map, you may index either the keys or the values.  If the column identifier is placed within the @keys()@ function, the index will be on the map keys, allowing you to use @CONTAINS KEY@ in @WHERE@ clauses.  Otherwise, the index will be on the map values.
+
 h3(#dropIndexStmt). DROP INDEX
 
 __Syntax:__
 
-bc(syntax).  <drop-index-stmt> ::= DROP INDEX ( IF EXISTS )? <identifier>
+bc(syntax).  <drop-index-stmt> ::= DROP INDEX ( IF EXISTS )? ( <keyspace> '.' )? <identifier>
 
 __Sample:__
 
-bc(sample). DROP INDEX userIndex;
+bc(sample).. 
+DROP INDEX userIndex;
 
-The @DROP INDEX@ statement is used to drop an existing secondary index. The argument of the statement is the index name.
+DROP INDEX userkeyspace.address_index;
+p. 
+The @DROP INDEX@ statement is used to drop an existing secondary index. The argument of the statement is the index name, which may optionally specify the keyspace of the index.
 
 If the index does not exists, the statement will return an error, unless @IF EXISTS@ is used in which case the operation is a no-op.
 
+h3(#createTypeStmt). CREATE TYPE
+
+__Syntax:__
+
+bc(syntax).. 
+<create-type-stmt> ::= CREATE TYPE ( IF NOT EXISTS )? <typename>
+                         '(' <field-definition> ( ',' <field-definition> )* ')'
+
+<typename> ::= ( <keyspace-name> '.' )? <identifier>
+
+<field-definition> ::= <identifier> <type>
+
+p. 
+__Sample:__
+
+bc(sample).. 
+CREATE TYPE address (
+    street_name text,
+    street_number int,
+    city text,
+    state text,
+    zip int
+)
+
+CREATE TYPE work_and_home_addresses (
+    home_address address,
+    work_address address
+)
+p. 
+The @CREATE TYPE@ statement creates a new user-defined type.  Each type is a set of named, typed fields.  Field types may be any valid type, including collections and other existing user-defined types.
+
+Attempting to create an already existing type will result in an error unless the @IF NOT EXISTS@ option is used.  If it is used, the statement will be a no-op if the type already exists.
+
+h4(#createTypeName). @<typename>@
+
+Valid type names are identifiers.  The names of existing CQL types and "reserved type names":#appendixB may not be used.
+
+If the type name is provided alone, the type is created within the current keyspace (see <a href="#useStmt"><tt>USE</tt></a>). If it is prefixed by an existing keyspace name, the type is created within the specified keyspace instead of the current keyspace.
+
+h3(#alterTypeStmt). ALTER TYPE
+
+__Syntax:__
+
+bc(syntax).. 
+<alter-type-stmt> ::= ALTER TYPE <typename> <instruction>
+
+<instruction> ::= ALTER <field-name> TYPE <type>
+                | ADD <field-name> <type>
+                | RENAME <field-name> TO <field-name> ( AND <field-name> TO <field-name> )*
+p. 
+__Sample:__
+
+bc(sample).. 
+ALTER TYPE address ALTER zip TYPE varint
+
+ALTER TYPE address ADD country text
+
+ALTER TYPE address RENAME zip TO zipcode AND street_name TO street
+p. 
+The @ALTER TYPE@ statement is used to manipulate type definitions. It allows for adding new fields, renaming existing fields, or changing the type of existing fields.
+
+When altering the type of a column, the new type must be compatible with the previous type.
+
+h3(#dropTypeStmt). DROP TYPE
+
+__Syntax:__
+
+bc(syntax).. 
+<drop-type-stmt> ::= DROP TYPE ( IF EXISTS )? <typename>
+p. 
+The @DROP TYPE@ statement results in the immediate, irreversible removal of a type.  Attempting to drop a type that is still in use by another type or a table will result in an error.
+
+If the type does not exist, an error will be returned unless @IF EXISTS@ is used, in which case the operation is a no-op.
+
+h3(#createTriggerStmt). CREATE TRIGGER
+
+__Syntax:__
+
+bc(syntax).. 
+<create-trigger-stmt> ::= CREATE TRIGGER ( IF NOT EXISTS )? ( <triggername> )?
+                            ON <tablename> 
+                            USING <string>
+
+p. 
+__Sample:__
+
+bc(sample). 
+CREATE TRIGGER myTrigger ON myTable USING 'org.apache.cassandra.triggers.InvertedIndex';
+
+The actual logic that makes up the trigger can be written in any Java (JVM) language and exists outside the database. Place the trigger code in a @lib/triggers@ subdirectory of the Cassandra installation directory; it is loaded at cluster startup and must exist on every node that participates in the cluster. A trigger defined on a table fires before a requested DML statement occurs, which ensures the atomicity of the transaction.
+
+h3(#dropTriggerStmt). DROP TRIGGER
+
+__Syntax:__
+
+bc(syntax).. 
+<drop-trigger-stmt> ::= DROP TRIGGER ( IF EXISTS )? ( <triggername> )?
+                            ON <tablename>
+
+p. 
+__Sample:__
+
+bc(sample). 
+DROP TRIGGER myTrigger ON myTable;
+
+The @DROP TRIGGER@ statement removes the registration of a trigger created using @CREATE TRIGGER@.
+
 h2(#dataManipulation). Data Manipulation
 
 h3(#insertStmt). INSERT
@@ -671,7 +790,7 @@
              | '(' <identifier> (',' <identifier>)* ')' IN '(' ( <term-tuple> ( ',' <term-tuple>)* )? ')'
             | TOKEN '(' <identifier> ( ',' <identifier>)* ')' <op> <term>
 
-<op> ::= '=' | '<' | '>' | '<=' | '>='
+<op> ::= '=' | '<' | '>' | '<=' | '>=' | CONTAINS | CONTAINS KEY
 <order-by> ::= <ordering> ( ',' <ordering> )*
 <ordering> ::= <identifier> ( ASC | DESC )?
 <term-tuple> ::= '(' <term> (',' <term>)* ')'
@@ -756,6 +875,8 @@
 bc(sample). 
 SELECT * FROM posts WHERE userid='john doe' AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
 
+The @CONTAINS@ operator may only be used on collection columns (lists, sets, and maps).  In the case of maps, @CONTAINS@ applies to the map values. The @CONTAINS KEY@ operator may only be used on map columns and applies to the map keys.
+
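+For instance, assuming an indexed @tags@ set column and an indexed @categories@ map column on the @posts@ table (both columns are purely illustrative):
+
+bc(sample).. 
+SELECT * FROM posts WHERE tags CONTAINS 'cassandra'
+
+SELECT * FROM posts WHERE categories CONTAINS KEY 'cql'
+p. 
+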
 h4(#selectOrderBy). @<order-by>@
 
 The @ORDER BY@ option allows selecting the order of the returned results. It takes as argument a list of column names along with the order for each column (@ASC@ for ascending and @DESC@ for descending, omitting the order being equivalent to @ASC@). Currently the possible orderings are limited (and depend on the table "@CLUSTERING ORDER@":#createTableOptions):
@@ -812,6 +933,7 @@
 bc(syntax).. 
 <type> ::= <native-type>
          | <collection-type>
+         | <tuple-type>
          | <string>       // Used for custom types. The fully-qualified name of a JAVA class
 
 <native-type> ::= ascii
@@ -834,6 +956,7 @@
 <collection-type> ::= list '<' <native-type> '>'
                     | set  '<' <native-type> '>'
                     | map  '<' <native-type> ',' <native-type> '>'
+<tuple-type> ::= tuple '<' <type> (',' <type>)* '>'
 p. Note that the native types are keywords and as such are case-insensitive. They are however not reserved ones.
 
 p. The following table gives additional information on the native data types, and on which kind of "constants":#constants each type supports:
@@ -1165,10 +1288,31 @@
 | @WRITETIME@    | no  |
 | @DISTINCT@     | no  |
 
+h2(#appendixB). Appendix B: CQL Reserved Types
+
+The following type names are not currently used by CQL, but are reserved for potential future use.  User-defined types may not use reserved type names as their name.
+
+|_. type      |
+| @byte@      |
+| @smallint@  |
+| @complex@   |
+| @enum@      |
+| @date@      |
+| @interval@  |
+| @macaddr@   |
+| @bitstring@ |
 
 h2(#changes). Changes
 
-The following describes the addition/changes brought for each version of CQL.
+The following describes the changes in each version of CQL.
+
+h3. 3.2.0
+
+* User-defined types are now supported through "@CREATE TYPE@":#createTypeStmt, "@ALTER TYPE@":#alterTypeStmt, and "@DROP TYPE@":#dropTypeStmt
+* "@CREATE INDEX@":#createIndexStmt now supports indexing collection columns, including indexing the keys of map collections through the @keys()@ function
+* Indexes on collections may be queried using the new @CONTAINS@ and @CONTAINS KEY@ operators
+* Tuple types were added to hold fixed-length sets of typed positional fields (see the section on "types":#types)
+* "@DROP INDEX@":#dropIndexStmt now supports optionally specifying a keyspace
 
 h3. 3.1.7
 
diff --git a/doc/native_protocol_v1.spec b/doc/native_protocol_v1.spec
index 08cb91e..83154fc 100644
--- a/doc/native_protocol_v1.spec
+++ b/doc/native_protocol_v1.spec
@@ -472,8 +472,10 @@
       the empty string "") if the change was affecting a keyspace and not a
       table.
 
-  Note that queries to create and drop an index are considered as change
-  updating the table the index is on.
+  Note that queries to create and drop an index are considered changes
+  updating the table the index is on.  Queries that create, alter, or drop
+  user-defined types (available in Cassandra 2.1+) are considered changes
+  updating the keyspace the type is defined in.
 
 
 4.2.6. EVENT
@@ -499,6 +501,9 @@
       followed by the name of the affected keyspace and the name of the
       affected table within that keyspace. For changes that affect a keyspace
       directly, the table name will be empty (i.e. the empty string "").
+      Changes to user-defined types (available in Cassandra 2.1+) will result
+      in an "UPDATED" change for the keyspace containing the type, and the
+      table name will be empty.
 
   All EVENT messages have a streamId of -1 (Section 2.3).
 
diff --git a/doc/native_protocol_v2.spec b/doc/native_protocol_v2.spec
index 11d380f..0f0cbcb 100644
--- a/doc/native_protocol_v2.spec
+++ b/doc/native_protocol_v2.spec
@@ -590,8 +590,10 @@
       the empty string "") if the change was affecting a keyspace and not a
       table.
 
-  Note that queries to create and drop an index are considered as change
-  updating the table the index is on.
+  Note that queries to create and drop an index are considered changes
+  updating the table the index is on.  Queries that create, alter, or drop
+  user-defined types (available in Cassandra 2.1+) are considered changes
+  updating the keyspace the type is defined in.
 
 
 4.2.6. EVENT
@@ -617,6 +619,9 @@
       followed by the name of the affected keyspace and the name of the
       affected table within that keyspace. For changes that affect a keyspace
       directly, the table name will be empty (i.e. the empty string "").
+      Changes to user-defined types (available in Cassandra 2.1+) will result
+      in an "UPDATED" change for the keyspace containing the type, and the
+      table name will be empty.
 
   All EVENT messages have a streamId of -1 (Section 2.3).
 
diff --git a/doc/native_protocol_v3.spec b/doc/native_protocol_v3.spec
new file mode 100644
index 0000000..13b6ac6
--- /dev/null
+++ b/doc/native_protocol_v3.spec
@@ -0,0 +1,914 @@
+
+                             CQL BINARY PROTOCOL v3
+
+
+Table of Contents
+
+  1. Overview
+  2. Frame header
+    2.1. version
+    2.2. flags
+    2.3. stream
+    2.4. opcode
+    2.5. length
+  3. Notations
+  4. Messages
+    4.1. Requests
+      4.1.1. STARTUP
+      4.1.2. AUTH_RESPONSE
+      4.1.3. OPTIONS
+      4.1.4. QUERY
+      4.1.5. PREPARE
+      4.1.6. EXECUTE
+      4.1.7. BATCH
+      4.1.8. REGISTER
+    4.2. Responses
+      4.2.1. ERROR
+      4.2.2. READY
+      4.2.3. AUTHENTICATE
+      4.2.4. SUPPORTED
+      4.2.5. RESULT
+        4.2.5.1. Void
+        4.2.5.2. Rows
+        4.2.5.3. Set_keyspace
+        4.2.5.4. Prepared
+        4.2.5.5. Schema_change
+      4.2.6. EVENT
+      4.2.7. AUTH_CHALLENGE
+      4.2.8. AUTH_SUCCESS
+  5. Compression
+  6. Collection types
+  7. User Defined and tuple types
+  8. Result paging
+  9. Error codes
+  10. Changes from v2
+
+
+1. Overview
+
+  The CQL binary protocol is a frame based protocol. Frames are defined as:
+
+      0         8        16        24        32         40
+      +---------+---------+---------+---------+---------+
+      | version |  flags  |      stream       | opcode  |
+      +---------+---------+---------+---------+---------+
+      |                length                 |
+      +---------+---------+---------+---------+
+      |                                       |
+      .            ...  body ...              .
+      .                                       .
+      .                                       .
+      +----------------------------------------
+
+  The protocol is big-endian (network byte order).
+
+  Each frame contains a fixed size header (9 bytes) followed by a variable size
+  body. The header is described in Section 2. The content of the body depends
+  on the header opcode value (the body can in particular be empty for some
+  opcode values). The list of allowed opcodes is defined in Section 2.4 and the
+  details of each corresponding message are described in Section 4.
+
+  The protocol distinguishes 2 types of frames: requests and responses. Requests
+  are those frames sent by the client to the server; responses are the ones sent
+  by the server. Note however that the protocol supports server pushes (events),
+  so a response does not necessarily come right after a client request.
+
+  Note to client implementors: client libraries should always assume that the
+  body of a given frame may contain more data than what is described in this
+  document. It will however always be safe to ignore the remainder of the frame
+  body in such cases. The reason is that this may allow the protocol to be
+  extended with optional features at some point without needing to change the
+  protocol version.
+
+
+
+2. Frame header
+
+2.1. version
+
+  The version is a single byte that indicates both the direction of the message
+  (request or response) and the version of the protocol in use. The up-most bit
+  of version is used to define the direction of the message: 0 indicates a
+  request, 1 indicates a response. This can be useful for protocol analyzers to
+  distinguish the nature of the packet from the direction in which it is moving.
+  The rest of that byte is the protocol version (3 for the protocol defined in
+  this document). In other words, for this version of the protocol, version will
+  have one of:
+    0x03    Request frame for this protocol version
+    0x83    Response frame for this protocol version
+
+  Please note that while every message ships with the version, only one version
+  of messages is accepted on a given connection. In other words, the first message
+  exchanged (STARTUP) sets the version for the lifetime of the connection.
+
+  This document describes version 3 of the protocol. For the changes made since
+  version 2, see Section 10.
+
+
+2.2. flags
+
+  Flags applying to this frame. The flags have the following meaning (described
+  by the mask that allows selecting them):
+    0x01: Compression flag. If set, the frame body is compressed. The actual
+          compression to use should have been set up beforehand through the
+          Startup message (which thus cannot be compressed; Section 4.1.1).
+    0x02: Tracing flag. For a request frame, this indicates the client requires
+          tracing of the request. Note that not all requests support tracing.
+          Currently, only QUERY, PREPARE and EXECUTE queries support tracing.
+          Other requests will simply ignore the tracing flag if set. If a
+          request supports tracing and the tracing flag was set, the response to
+          this request will have the tracing flag set and contain tracing
+          information.
+          If a response frame has the tracing flag set, its body contains
+          a tracing ID. The tracing ID is a [uuid] and is the first thing in
+          the frame body. The rest of the body will then be the usual body
+          corresponding to the response opcode.
+
+  The rest of the flags are currently unused and ignored.
+
+2.3. stream
+
+  A frame has a stream id (a [short] value). When sending request messages, this
+  stream id must be set by the client to a non-negative value (negative stream ids
+  are reserved for streams initiated by the server; currently all EVENT messages
+  (section 4.2.6) have a streamId of -1). If a client sends a request message
+  with the stream id X, it is guaranteed that the stream id of the response to
+  that message will be X.
+
+  This allows dealing with the asynchronous nature of the protocol. If a client
+  sends multiple messages simultaneously (without waiting for responses), there
+  is no guarantee on the order of the responses. For instance, if the client
+  writes REQ_1, REQ_2, REQ_3 on the wire (in that order), the server might
+  respond to REQ_3 (or REQ_2) first. Assigning different stream ids to these 3
+  requests allows the client to distinguish which request a received answer
+  responds to. As there can only be 32768 different simultaneous streams, it is up
+  to the client to reuse stream ids.
+
+  Note that clients are free to use the protocol synchronously (i.e. wait for
+  the response to REQ_N before sending REQ_N+1). In that case, the stream id
+  can be safely set to 0. Clients should also feel free to use only a subset of
+  the 32768 maximum possible stream ids if it is simpler for their
+  implementation.
+
+2.4. opcode
+
+  An integer byte that distinguishes the actual message:
+    0x00    ERROR
+    0x01    STARTUP
+    0x02    READY
+    0x03    AUTHENTICATE
+    0x05    OPTIONS
+    0x06    SUPPORTED
+    0x07    QUERY
+    0x08    RESULT
+    0x09    PREPARE
+    0x0A    EXECUTE
+    0x0B    REGISTER
+    0x0C    EVENT
+    0x0D    BATCH
+    0x0E    AUTH_CHALLENGE
+    0x0F    AUTH_RESPONSE
+    0x10    AUTH_SUCCESS
+
+  Messages are described in Section 4.
+
+  (Note that there is no 0x04 message in this version of the protocol)
+
+
+2.5. length
+
+  A 4 byte integer representing the length of the body of the frame (note:
+  currently a frame is limited to 256MB in length).
+
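+  As an illustration only (this is not part of the protocol definition), a
+  Java client could decode the fixed 9-byte header along the following lines,
+  using nothing but a big-endian ByteBuffer; the class and field names are
+  purely illustrative:
+
+    import java.nio.ByteBuffer;
+
+    final class FrameHeader
+    {
+        final boolean isResponse;   // direction, from the up-most bit of version (Section 2.1)
+        final int version;          // protocol version (3 for this document)
+        final int flags;            // Section 2.2
+        final short streamId;       // Section 2.3
+        final int opcode;           // Section 2.4
+        final int bodyLength;       // Section 2.5
+
+        private FrameHeader(boolean isResponse, int version, int flags,
+                            short streamId, int opcode, int bodyLength)
+        {
+            this.isResponse = isResponse;
+            this.version = version;
+            this.flags = flags;
+            this.streamId = streamId;
+            this.opcode = opcode;
+            this.bodyLength = bodyLength;
+        }
+
+        // buf must be positioned at the start of a frame; ByteBuffer is big-endian by
+        // default, which matches the network byte order used by the protocol.
+        static FrameHeader decode(ByteBuffer buf)
+        {
+            byte versionByte = buf.get();
+            boolean isResponse = (versionByte & 0x80) != 0;
+            int version = versionByte & 0x7F;
+            int flags = buf.get() & 0xFF;
+            short streamId = buf.getShort();
+            int opcode = buf.get() & 0xFF;
+            int bodyLength = buf.getInt();
+            return new FrameHeader(isResponse, version, flags, streamId, opcode, bodyLength);
+        }
+    }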
+
+3. Notations
+
+  To describe the layout of the frame body for the messages in Section 4, we
+  define the following:
+
+    [int]          A 4-byte integer
+    [long]         An 8-byte integer
+    [short]        A 2-byte unsigned integer
+    [string]       A [short] n, followed by n bytes representing a UTF-8
+                   string.
+    [long string]  An [int] n, followed by n bytes representing a UTF-8 string.
+    [uuid]         A 16-byte uuid.
+    [string list]  A [short] n, followed by n [string].
+    [bytes]        An [int] n, followed by n bytes if n >= 0. If n < 0,
+                   no byte should follow and the value represented is `null`.
+    [short bytes]  A [short] n, followed by n bytes if n >= 0.
+
+    [option]       A pair of <id><value> where <id> is a [short] representing
+                   the option id and <value> depends on that option (and can be
+                   of size 0). The supported id (and the corresponding <value>)
+                   will be described when this is used.
+    [option list]  A [short] n, followed by n [option].
+    [inet]         An address (ip and port) of a node. It consists of one
+                   [byte] n, that represents the address size, followed by n
+                   [byte] representing the IP address (in practice n can only be
+                   either 4 (IPv4) or 16 (IPv6)), followed by one [int]
+                   representing the port.
+    [consistency]  A consistency level specification. This is a [short]
+                   representing a consistency level with the following
+                   correspondence:
+                     0x0000    ANY
+                     0x0001    ONE
+                     0x0002    TWO
+                     0x0003    THREE
+                     0x0004    QUORUM
+                     0x0005    ALL
+                     0x0006    LOCAL_QUORUM
+                     0x0007    EACH_QUORUM
+                     0x0008    SERIAL
+                     0x0009    LOCAL_SERIAL
+                     0x000A    LOCAL_ONE
+
+    [string map]      A [short] n, followed by n pairs <k><v> where <k> and <v>
+                      are [string].
+    [string multimap] A [short] n, followed by n pairs <k><v> where <k> is a
+                      [string] and <v> is a [string list].
+
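+  As a further illustration, the most common of these notations can be encoded
+  with the JDK's DataOutputStream (which writes big-endian, matching the network
+  byte order above); the Notations class name is purely illustrative:
+
+    import java.io.DataOutputStream;
+    import java.io.IOException;
+    import java.nio.charset.StandardCharsets;
+    import java.util.Map;
+
+    final class Notations
+    {
+        // [string]: a [short] n followed by n bytes of UTF-8.
+        static void writeString(DataOutputStream out, String s) throws IOException
+        {
+            byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+            out.writeShort(utf8.length);
+            out.write(utf8);
+        }
+
+        // [long string]: an [int] n followed by n bytes of UTF-8.
+        static void writeLongString(DataOutputStream out, String s) throws IOException
+        {
+            byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+            out.writeInt(utf8.length);
+            out.write(utf8);
+        }
+
+        // [string map]: a [short] n followed by n <k><v> pairs of [string].
+        static void writeStringMap(DataOutputStream out, Map<String, String> map) throws IOException
+        {
+            out.writeShort(map.size());
+            for (Map.Entry<String, String> e : map.entrySet())
+            {
+                writeString(out, e.getKey());
+                writeString(out, e.getValue());
+            }
+        }
+    }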
+
+4. Messages
+
+4.1. Requests
+
+  Note that outside of their normal responses (described below), all requests
+  can get an ERROR message (Section 4.2.1) as response.
+
+4.1.1. STARTUP
+
+  Initialize the connection. The server will respond by either a READY message
+  (in which case the connection is ready for queries) or an AUTHENTICATE message
+  (in which case credentials will need to be provided using AUTH_RESPONSE).
+
+  This must be the first message of the connection, except for OPTIONS, which can
+  be sent before it to find out the options supported by the server. Once the
+  connection has been initialized, a client should not send any more STARTUP
+  messages.
+
+  The body is a [string map] of options. Possible options are:
+    - "CQL_VERSION": the version of CQL to use. This option is mandatory and
+      currently, the only version supported is "3.0.0". Note that this is
+      different from the protocol version.
+    - "COMPRESSION": the compression algorithm to use for frames (See section 5).
+      This is optional, if not specified no compression will be used.
+
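+  As an illustration only, a minimal STARTUP body (a [string map] carrying just
+  the mandatory CQL_VERSION option) could be built as sketched below; the class
+  and method names are purely illustrative:
+
+    import java.io.ByteArrayOutputStream;
+    import java.io.DataOutputStream;
+    import java.io.IOException;
+    import java.nio.charset.StandardCharsets;
+
+    final class StartupBody
+    {
+        static byte[] build() throws IOException
+        {
+            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+            DataOutputStream out = new DataOutputStream(bytes);
+            out.writeShort(1);                       // one key/value pair in the [string map]
+            for (String s : new String[]{ "CQL_VERSION", "3.0.0" })
+            {
+                byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+                out.writeShort(utf8.length);         // [string]: [short] length ...
+                out.write(utf8);                     // ... followed by the UTF-8 bytes
+            }
+            return bytes.toByteArray();              // sent as the body of a frame with opcode 0x01 (STARTUP)
+        }
+    }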
+
+4.1.2. AUTH_RESPONSE
+
+  Answers a server authentication challenge.
+
+  Authentication in the protocol is SASL based. The server sends authentication
+  challenges (a bytes token) to which the client answers with this message. Those
+  exchanges continue until the server accepts the authentication by sending an
+  AUTH_SUCCESS message after a client AUTH_RESPONSE. It is however the client that
+  initiates the exchange by sending an initial AUTH_RESPONSE in response to a
+  server AUTHENTICATE request.
+
+  The body of this message is a single [bytes] token. The details of what this
+  token contains (and when it can be null/empty, if ever) depends on the actual
+  authenticator used.
+
+  The response to an AUTH_RESPONSE is either a follow-up AUTH_CHALLENGE message,
+  an AUTH_SUCCESS message or an ERROR message.
+
+
+4.1.3. OPTIONS
+
+  Asks the server to return what STARTUP options are supported. The body of an
+  OPTIONS message should be empty and the server will respond with a SUPPORTED
+  message.
+
+
+4.1.4. QUERY
+
+  Performs a CQL query. The body of the message must be:
+    <query><query_parameters>
+  where <query> is a [long string] representing the query and
+  <query_parameters> must be
+    <consistency><flags>[<n>[name_1]<value_1>...[name_n]<value_n>][<result_page_size>][<paging_state>][<serial_consistency>][<timestamp>]
+  where:
+    - <consistency> is the [consistency] level for the operation.
+    - <flags> is a [byte] whose bits define the options for this query and
+      in particular influence what the remainder of the message contains.
+      A flag is set if the bit corresponding to its `mask` is set. Supported
+      flags are, given their mask:
+        0x01: Values. In that case, a [short] <n> followed by <n> [bytes]
+              values are provided. Those values are used for bound variables in
+              the query. Optionally, if the 0x40 flag is present, each value
+              will be preceded by a [string] name, representing the name of
+              the marker the value must be bound to. This is optional, and
+              if not present, values will be bound by position.
+        0x02: Skip_metadata. If present, the Result Set returned as a response
+              to that query (if any) will have the NO_METADATA flag (see
+              Section 4.2.5.2).
+        0x04: Page_size. In that case, <result_page_size> is an [int]
+              controlling the desired page size of the result (in CQL3 rows).
+              See the section on paging (Section 8) for more details.
+        0x08: With_paging_state. If present, <paging_state> should be present.
+              <paging_state> is a [bytes] value that should have been returned
+              in a result set (Section 4.2.5.2). If provided, the query will be
+              executed but starting from a given paging state. This also allows
+              paging to be continued on a different node from the one on which
+              it was started (See Section 8 for more details).
+        0x10: With serial consistency. If present, <serial_consistency> should be
+              present. <serial_consistency> is the [consistency] level for the
+              serial phase of conditional updates. That consistency can only be
+              either SERIAL or LOCAL_SERIAL and if not present, it defaults to
+              SERIAL. This option will be ignored for anything else than a
+              conditional update/insert.
+        0x20: With default timestamp. If present, <timestamp> should be present.
+              <timestamp> is a [long] representing the default timestamp for the query
+              in microseconds (negative values are forbidden). If provided, this will
+              replace the server side assigned timestamp as default timestamp.
+              Note that a timestamp in the query itself will still override
+              this timestamp. This is entirely optional.
+        0x40: With names for values. This only makes sense if the 0x01 flag is set and
+              is ignored otherwise. If present, the values from the 0x01 flag will
+              be preceded by a name (see above). Note that this is only useful for
+              QUERY requests where named bind markers are used; for EXECUTE statements,
+              since the names for the expected values were returned during preparation,
+              a client can always provide values in the right order without any names
+              and using this flag, while supported, is almost surely inefficient.
+
+  Note that the consistency is ignored by some queries (USE, CREATE, ALTER,
+  TRUNCATE, ...).
+
+  The server will respond to a QUERY message with a RESULT message, the content
+  of which depends on the query.
+
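+  As an illustration only, the simplest possible QUERY body (no values, no
+  paging, default timestamp and serial consistency) could be built as sketched
+  below; the class name and the CONSISTENCY_ONE constant are purely illustrative:
+
+    import java.io.ByteArrayOutputStream;
+    import java.io.DataOutputStream;
+    import java.io.IOException;
+    import java.nio.charset.StandardCharsets;
+
+    final class QueryBody
+    {
+        static final int CONSISTENCY_ONE = 0x0001;   // see the [consistency] notation in Section 3
+
+        static byte[] build(String cql) throws IOException
+        {
+            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+            DataOutputStream out = new DataOutputStream(bytes);
+            byte[] utf8 = cql.getBytes(StandardCharsets.UTF_8);
+            out.writeInt(utf8.length);               // <query> is a [long string] ...
+            out.write(utf8);                         // ... holding the query text
+            out.writeShort(CONSISTENCY_ONE);         // <consistency>
+            out.writeByte(0x00);                     // <flags>: no optional element present
+            return bytes.toByteArray();              // sent with opcode 0x07 (QUERY)
+        }
+    }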
+
+4.1.5. PREPARE
+
+  Prepare a query for later execution (through EXECUTE). The body consists of
+  the CQL query to prepare as a [long string].
+
+  The server will respond with a RESULT message with a `prepared` kind (0x0004,
+  see Section 4.2.5).
+
+
+4.1.6. EXECUTE
+
+  Executes a prepared query. The body of the message must be:
+    <id><query_parameters>
+  where <id> is the prepared query ID. It's the [short bytes] returned as a
+  response to a PREPARE message. As for <query_parameters>, it has the exact
+  same definition as in QUERY (see Section 4.1.4).
+
+  The response from the server will be a RESULT message.
+
+
+4.1.7. BATCH
+
+  Allows executing a list of queries (prepared or not) as a batch (note that
+  only DML statements are accepted in a batch). The body of the message must
+  be:
+    <type><n><query_1>...<query_n><consistency><flags>[<serial_consistency>][<timestamp>]
+  where:
+    - <type> is a [byte] indicating the type of batch to use:
+        - If <type> == 0, the batch will be "logged". This is equivalent to a
+          normal CQL3 batch statement.
+        - If <type> == 1, the batch will be "unlogged".
+        - If <type> == 2, the batch will be a "counter" batch (and non-counter
+          statements will be rejected).
+    - <flags> is a [byte] whose bits define the options for this query and
+      in particular influence what the remainder of the message contains. It is
+      similar to the <flags> from QUERY and EXECUTE messages, except that the 4
+      rightmost bits must always be 0 as their corresponding options do not make
+      sense for Batch. A flag is set if the bit corresponding to its `mask` is set.
+      Supported flags are, given their mask:
+        0x10: With serial consistency. If present, <serial_consistency> should be
+              present. <serial_consistency> is the [consistency] level for the
+              serial phase of conditional updates. That consistency can only be
+              either SERIAL or LOCAL_SERIAL and if not present, it defaults to
+              SERIAL. This option will be ignored for anything else than a
+              conditional update/insert.
+        0x20: With default timestamp. If present, <timestamp> should be present.
+              <timestamp> is a [long] representing the default timestamp for the query
+              in microseconds. If provided, this will replace the server side assigned
+              timestamp as default timestamp. Note that a timestamp in the query itself
+              will still override this timestamp. This is entirely optional.
+        0x40: With names for values. If set, then all values for all <query_i> must be
+              preceded by a [string] <name_i> that has the same meaning as in QUERY
+              requests.
+    - <n> is a [short] indicating the number of following queries.
+    - <query_1>...<query_n> are the queries to execute. A <query_i> must be of the
+      form:
+        <kind><string_or_id><n>[<name_1>]<value_1>...[<name_n>]<value_n>
+      where:
+       - <kind> is a [byte] indicating whether the following query is a prepared
+         one or not. <kind> value must be either 0 or 1.
+       - <string_or_id> depends on the value of <kind>. If <kind> == 0, it should be
+         a [long string] query string (as in QUERY, the query string might contain
+         bind markers). Otherwise (that is, if <kind> == 1), it should be a
+         [short bytes] representing a prepared query ID.
+       - <n> is a [short] indicating the number (possibly 0) of following values.
+       - <name_i> is the optional name of the following <value_i>. It must be present
+         if and only if the 0x40 flag is provided for the batch.
+       - <value_i> is the [bytes] to use for bound variable i (of bound variable <name_i>
+         if the 0x40 flag is used).
+    - <consistency> is the [consistency] level for the operation.
+    - <serial_consistency> is only present if the 0x10 flag is set. In that case,
+      <serial_consistency> is the [consistency] level for the serial phase of
+      conditional updates. That consistency can only be either SERIAL or
+      LOCAL_SERIAL and if not present defaults to SERIAL. This option will
+      be ignored for anything else than a conditional update/insert.
+
+  The server will respond with a RESULT message.
+
+
+4.1.8. REGISTER
+
+  Register this connection to receive some types of events. The body of the
+  message is a [string list] representing the event types to register to. See
+  section 4.2.6 for the list of valid event types.
+
+  The response to a REGISTER message will be a READY message.
+
+  Please note that if a client driver maintains multiple connections to a
+  Cassandra node and/or connections to multiple nodes, it is advised to
+  dedicate a handful of connections to receive events, but to *not* register
+  for events on all connections, as this would only result in receiving the
+  same event messages multiple times, wasting bandwidth.
+
+
+4.2. Responses
+
+  This section describes the content of the frame body for the different
+  responses. Please note that to make room for future evolution, clients should
+  support extra information (that they should simply discard) at the end of the
+  frame body, beyond what is described in this document.
+
+4.2.1. ERROR
+
+  Indicates an error processing a request. The body of the message will be an
+  error code ([int]) followed by a [string] error message. Then, depending on
+  the exception, more content may follow. The error codes are defined in
+  Section 9, along with their additional content if any.
+
+
+4.2.2. READY
+
+  Indicates that the server is ready to process queries. This message will be
+  sent by the server after a STARTUP message if no authentication is required
+  (if authentication is required, an AUTHENTICATE message is sent instead; see
+  Section 4.2.3).
+
+  The body of a READY message is empty.
+
+
+4.2.3. AUTHENTICATE
+
+  Indicates that the server requires authentication, and which authentication
+  mechanism to use.
+
+  The authentication is SASL based and thus consists of a number of server
+  challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses
+  (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however bootstrapped
+  by an initial client response. The details of that exchange (including how
+  many challenge-response pairs are required) are specific to the authenticator
+  in use. The exchange ends when the server sends an AUTH_SUCCESS message or
+  an ERROR message.
+
+  This message will be sent following a STARTUP message if authentication is
+  required and must be answered by an AUTH_RESPONSE message from the client.
+
+  The body consists of a single [string] indicating the full class name of the
+  IAuthenticator in use.
+
+
+4.2.4. SUPPORTED
+
+  Indicates which startup options are supported by the server. This message
+  comes as a response to an OPTIONS message.
+
+  The body of a SUPPORTED message is a [string multimap]. This multimap gives,
+  for each of the supported STARTUP options, the list of supported values.
+
+
+4.2.5. RESULT
+
+  The result to a query (QUERY, PREPARE, EXECUTE or BATCH messages).
+
+  The first element of the body of a RESULT message is an [int] representing the
+  `kind` of result. The rest of the body depends on the kind. The kind can be
+  one of:
+    0x0001    Void: for results carrying no information.
+    0x0002    Rows: for results to select queries, returning a set of rows.
+    0x0003    Set_keyspace: the result to a `use` query.
+    0x0004    Prepared: result to a PREPARE message.
+    0x0005    Schema_change: the result to a schema altering query.
+
+  The body for each kind (after the [int] kind) is defined below.
+
+
+4.2.5.1. Void
+
+  The rest of the body for a Void result is empty. It indicates that a query was
+  successful without providing more information.
+
+
+4.2.5.2. Rows
+
+  Indicates a set of rows. The rest of the body of a Rows result is:
+    <metadata><rows_count><rows_content>
+  where:
+    - <metadata> is composed of:
+        <flags><columns_count>[<paging_state>][<global_table_spec>?<col_spec_1>...<col_spec_n>]
+      where:
+        - <flags> is an [int]. The bits of <flags> provide information on the
+          formatting of the remaining information. A flag is set if the bit
+          corresponding to its `mask` is set. Supported flags are, given their
+          mask:
+            0x0001    Global_tables_spec: if set, only one table spec (keyspace
+                      and table name) is provided as <global_table_spec>. If not
+                      set, <global_table_spec> is not present.
+            0x0002    Has_more_pages: indicates whether this is not the last
+                      page of results and more should be retrieved. If set, the
+                      <paging_state> will be present. The <paging_state> is a
+                      [bytes] value that should be used in QUERY/EXECUTE to
+                      continue paging and retrieve the remainder of the result for
+                      this query (See Section 8 for more details).
+            0x0004    No_metadata: if set, the <metadata> is only composed of
+                      these <flags>, the <column_count> and optionally the
+                      <paging_state> (depending on the Has_more_pages flag) but
+                      no other information (so no <global_table_spec> nor <col_spec_i>).
+                      This will only ever be the case if this was requested
+                      during the query (see QUERY and RESULT messages).
+        - <columns_count> is an [int] representing the number of columns selected
+          by the query this result is of. It defines the number of <col_spec_i>
+          elements and the number of elements for each row in <rows_content>.
+        - <global_table_spec> is present if the Global_tables_spec flag is set in
+          <flags>. If present, it is composed of two [string] representing the
+          (unique) keyspace name and table name the returned columns are of.
+        - <col_spec_i> specifies the columns returned in the query. There are
+          <column_count> such column specifications, each composed of:
+            (<ksname><tablename>)?<name><type>
+          The initial <ksname> and <tablename> are two [string] and are only present
+          if the Global_tables_spec flag is not set. The <column_name> is a
+          [string] and <type> is an [option] that corresponds to the description
+          (what this description is depends a bit on the context: in results to
+          selects, this will be either the user chosen alias or the selection used
+          (often a column name, but it can be a function call too). In results to
+          a PREPARE, this will be either the name of the corresponding bind variable
+          or the column name for the variable if it is "anonymous") and type of
+          the corresponding result. The option for <type> is either a native
+          type (see below), in which case the option has no value, or a
+          'custom' type, in which case the value is a [string] representing
+          the fully qualified class name of the type represented. Valid option
+          ids are:
+            0x0000    Custom: the value is a [string], see above.
+            0x0001    Ascii
+            0x0002    Bigint
+            0x0003    Blob
+            0x0004    Boolean
+            0x0005    Counter
+            0x0006    Decimal
+            0x0007    Double
+            0x0008    Float
+            0x0009    Int
+            0x000B    Timestamp
+            0x000C    Uuid
+            0x000D    Varchar
+            0x000E    Varint
+            0x000F    Timeuuid
+            0x0010    Inet
+            0x0020    List: the value is an [option], representing the type
+                            of the elements of the list.
+            0x0021    Map: the value is two [option], representing the types of the
+                           keys and values of the map
+            0x0022    Set: the value is an [option], representing the type
+                            of the elements of the set
+            0x0030    UDT: the value is <ks><udt_name><n><name_1><type_1>...<name_n><type_n>
+                           where:
+                              - <ks> is a [string] representing the keyspace name this
+                                UDT is part of.
+                              - <udt_name> is a [string] representing the UDT name.
+                              - <n> is a [short] representing the number of fields of
+                                the UDT, and thus the number of <name_i><type_i> pairs
+                                following
+                              - <name_i> is a [string] representing the name of the
+                                i_th field of the UDT.
+                              - <type_i> is an [option] representing the type of the
+                                i_th field of the UDT.
+            0x0031    Tuple: the value is <n><type_1>...<type_n> where <n> is a [short]
+                             representing the number of values in the type, and <type_i>
+                             are [option] representing the type of the i_th component
+                             of the tuple
+
+    - <rows_count> is an [int] representing the number of rows present in this
+      result. Those rows are serialized in the <rows_content> part.
+    - <rows_content> is composed of <row_1>...<row_m> where m is <rows_count>.
+      Each <row_i> is composed of <value_1>...<value_n> where n is
+      <columns_count> and where <value_j> is a [bytes] representing the value
+      returned for the jth column of the ith row. In other words, <rows_content>
+      is composed of (<rows_count> * <columns_count>) [bytes].
+
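+  As an illustration only, the leading part of a Rows result <metadata> could be
+  decoded along the following lines (the column specifications, when present,
+  would be read next and are omitted from this sketch; the class name is
+  illustrative):
+
+    import java.nio.ByteBuffer;
+
+    final class RowsMetadata
+    {
+        static void decode(ByteBuffer body)   // body positioned just after the kind [int]
+        {
+            int flags = body.getInt();                     // <flags>
+            int columnsCount = body.getInt();              // <columns_count>
+            byte[] pagingState = null;
+            if ((flags & 0x0002) != 0)                     // Has_more_pages
+            {
+                int n = body.getInt();                     // <paging_state> is a [bytes]
+                if (n >= 0)
+                {
+                    pagingState = new byte[n];
+                    body.get(pagingState);
+                }
+            }
+            boolean noMetadata = (flags & 0x0004) != 0;    // No_metadata: no specs follow
+            boolean globalSpec = (flags & 0x0001) != 0;    // Global_tables_spec
+            // <global_table_spec> and the <col_spec_i> entries would be parsed from here.
+        }
+    }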
+
+4.2.5.3. Set_keyspace
+
+  The result to a `use` query. The body (after the kind [int]) is a single
+  [string] indicating the name of the keyspace that has been set.
+
+
+4.2.5.4. Prepared
+
+  The result to a PREPARE message. The rest of the body of a Prepared result is:
+    <id><metadata><result_metadata>
+  where:
+    - <id> is [short bytes] representing the prepared query ID.
+    - <metadata> is defined exactly as for a Rows RESULT (See section 4.2.5.2; you
+      can however assume that the Has_more_pages flag is always off) and
+      is the specification for the variable bound in this prepare statement.
+    - <result_metadata> is defined exactly as <metadata> but corresponds to the
+      metadata for the result set that executing this query will yield. Note that
+      <result_metadata> may be empty (have the No_metadata flag and 0 columns, See
+      section 4.2.5.2) and will be for any query that is not a Select. There is
+      in fact never a guarantee that this will be non-empty, so clients should
+      protect themselves accordingly. The presence of this information is an
+      optimization that allows the prepared statement to be executed later without
+      requesting the metadata (Skip_metadata flag in EXECUTE).
+      Clients can safely discard this metadata if they do not want to take
+      advantage of that optimization.
+
+  Note that the returned prepared query ID is global to the node on which the query
+  has been prepared. It can be used on any connection to that node
+  until the node is restarted (after which the query must be prepared again).
+
+4.2.5.5. Schema_change
+
+  The result to a schema altering query (creation/update/drop of a
+  keyspace/table/index). The body (after the kind [int]) is the same
+  as the body for a "SCHEMA_CHANGE" event, so 3 strings:
+    <change_type><target><options>
+  Please refer to the section 4.2.6 below for the meaning of those fields.
+
+  Note that queries to create and drop an index are considered changes
+  updating the table the index is on.
+
+
+4.2.6. EVENT
+
+  An event pushed by the server. A client will only receive events for the
+  types it has REGISTERed to. The body of an EVENT message will start with a
+  [string] representing the event type. The rest of the message depends on the
+  event type. The valid event types are:
+    - "TOPOLOGY_CHANGE": events related to change in the cluster topology.
+      Currently, events are sent when new nodes are added to the cluster, and
+      when nodes are removed. The body of the message (after the event type)
+      consists of a [string] and an [inet], corresponding respectively to the
+      type of change ("NEW_NODE" or "REMOVED_NODE") followed by the address of
+      the new/removed node.
+    - "STATUS_CHANGE": events related to change of node status. Currently,
+      up/down events are sent. The body of the message (after the event type)
+      consists of a [string] and an [inet], corresponding respectively to the
+      type of status change ("UP" or "DOWN") followed by the address of the
+      concerned node.
+    - "SCHEMA_CHANGE": events related to schema change. After the event type,
+      the rest of the message will be <change_type><target><options> where:
+        - <change_type> is a [string] representing the type of change involved.
+          It will be one of "CREATED", "UPDATED" or "DROPPED".
+        - <target> is a [string] that can be one of "KEYSPACE", "TABLE" or "TYPE"
+          and describes what has been modified ("TYPE" stands for modifications
+          related to user types).
+        - <options> depends on the preceding <target>. If <target> is
+          "KEYSPACE", then <options> will be a single [string] representing the
+          keyspace changed. Otherwise, if <target> is "TABLE" or "TYPE", then
+          <options> will be 2 [string]: the first one will be the keyspace
+          containing the affected object, and the second one will be the name
+          of said affected object (so either the table name or the user type
+          name).
+
+  All EVENT messages have a streamId of -1 (Section 2.3).
+
+  Please note that "NEW_NODE" and "UP" events are sent based on internal Gossip
+  communication and as such may be sent shortly before the binary
+  protocol server on the newly up node is fully started. Clients are thus
+  advised to wait a short time before trying to connect to the node (1 second
+  should be enough), otherwise they may experience a connection refusal at
+  first.
+
+4.2.7. AUTH_CHALLENGE
+
+  A server authentication challenge (see AUTH_RESPONSE (Section 4.1.2) for more
+  details).
+
+  The body of this message is a single [bytes] token. The details of what this
+  token contains (and when it can be null/empty, if ever) depends on the actual
+  authenticator used.
+
+  Clients are expected to answer the server challenge by an AUTH_RESPONSE
+  message.
+
+4.2.8. AUTH_SUCCESS
+
+  Indicates the success of the authentication phase. See Section 4.2.3 for more
+  details.
+
+  The body of this message is a single [bytes] token holding final information
+  from the server that the client may require to finish the authentication
+  process. What that token contains and whether it can be null depends on the
+  actual authenticator used.
+
+
+5. Compression
+
+  Frame compression is supported by the protocol, but then only the frame body
+  is compressed (the frame header should never be compressed).
+
+  Before being used, client and server must agree on a compression algorithm to
+  use, which is done in the STARTUP message. As a consequence, a STARTUP message
+  must never be compressed.  However, once the STARTUP frame has been received
+  by the server, subsequent frames can be compressed (including the response to
+  the STARTUP request). Frames do not have to be compressed however, even if
+  compression has been agreed upon (a server may only compress frames above a
+  certain size at its discretion). A frame body should be compressed if and only
+  if the compressed flag (see Section 2.2) is set.
+
+  As of version 3 of the protocol, the following compressions are available:
+    - lz4 (https://code.google.com/p/lz4/). In that case, note that the first 4
+      bytes of the body will be the uncompressed length (followed by the
+      compressed bytes).
+    - snappy (https://code.google.com/p/snappy/). This compression might not be
+      available as it depends on a native lib (server-side) that might not be
+      available on some installations.
+
+
+6. Collection types
+
+  This section describes the serialization format for the collection types:
+  list, map and set. This serialization format is useful both to decode values
+  returned in RESULT messages and to encode values for EXECUTE ones.
+
+  The serialization formats are:
+     List: an [int] n indicating the size of the list, followed by n elements.
+           Each element is [bytes] representing the serialized element
+           value.
+     Map: an [int] n indicating the size of the map, followed by n entries.
+          Each entry is composed of two [bytes] representing the key and
+          the value of the map entry.
+     Set: an [int] n indicating the size of the set, followed by n elements.
+          Each element is [bytes] representing the serialized element
+          value.
+
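+  As an illustration only, a list<text> value could be serialized along the
+  following lines (the class and method names are purely illustrative):
+
+    import java.io.ByteArrayOutputStream;
+    import java.io.DataOutputStream;
+    import java.io.IOException;
+    import java.nio.charset.StandardCharsets;
+    import java.util.List;
+
+    final class ListOfTextSerializer
+    {
+        // A list value: an [int] element count, then each element as [bytes]
+        // (an [int] length followed by the serialized element, UTF-8 for text).
+        static byte[] serialize(List<String> values) throws IOException
+        {
+            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+            DataOutputStream out = new DataOutputStream(bytes);
+            out.writeInt(values.size());
+            for (String v : values)
+            {
+                byte[] utf8 = v.getBytes(StandardCharsets.UTF_8);
+                out.writeInt(utf8.length);
+                out.write(utf8);
+            }
+            return bytes.toByteArray();
+        }
+    }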
+
+7. User defined and tuple types
+
+  This section describes the serialization format for User defined types (UDT) and
+  tuple values. UDT (resp. tuple) values are the values of the User Defined Types
+  (resp. tuple type) as defined in section 4.2.5.2.
+
+  A UDT value is composed of successive [bytes] values, one for each field of the UDT
+  value (in the order defined by the type). A UDT value will generally have one value
+  for each field of the type it represents, but it is allowed to have fewer values than
+  the type has fields.
+
+  A tuple value has the exact same serialization format, i.e. a succession of
+  [bytes] values representing the components of the tuple.
+
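+  As an illustration only, given the already-serialized field values, a UDT or
+  tuple value could be assembled as sketched below (names are purely
+  illustrative):
+
+    import java.io.ByteArrayOutputStream;
+    import java.io.DataOutputStream;
+    import java.io.IOException;
+
+    final class UdtOrTupleSerializer
+    {
+        // fields holds the already-serialized value of each field/component, in type
+        // order; a null entry stands for a null field and is written as a [bytes]
+        // with length -1.
+        static byte[] serialize(byte[][] fields) throws IOException
+        {
+            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+            DataOutputStream out = new DataOutputStream(bytes);
+            for (byte[] field : fields)
+            {
+                if (field == null)
+                    out.writeInt(-1);
+                else
+                {
+                    out.writeInt(field.length);
+                    out.write(field);
+                }
+            }
+            return bytes.toByteArray();
+        }
+    }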
+
+8. Result paging
+
+  The protocol allows for paging the result of queries. For that, the QUERY and
+  EXECUTE messages have a <result_page_size> value that indicates the desired
+  page size in CQL3 rows.
+
+  If a positive value is provided for <result_page_size>, the result set of the
+  RESULT message returned for the query will contain at most the
+  <result_page_size> first rows of the query result. If that first page of result
+  contains the full result set for the query, the RESULT message (of kind `Rows`)
+  will have the Has_more_pages flag *not* set. However, if some results are not
+  part of the first response, the Has_more_pages flag will be set and the result
+  will contain a <paging_state> value. In that case, the <paging_state> value
+  should be used in a QUERY or EXECUTE message (that has the *same* query as
+  the original one or the behavior is undefined) to retrieve the next page of
+  results.
+
+  Only CQL3 queries that return a result set (RESULT message with a Rows `kind`)
+  support paging. For other types of queries, the <result_page_size> value is
+  ignored.
+
+  Note to client implementors:
+  - While <result_page_size> can be as low as 1, it will likely be detrimental
+    to performance to pick a value too low. A value below 100 is probably too
+    low for most use cases.
+  - Clients should not rely on the actual size of the result set returned to
+    decide if there are more results to fetch or not. Instead, they should always
+    check the Has_more_pages flag (unless they did not enable paging for the query,
+    obviously). Clients should also not assume that no result will have more than
+    <result_page_size> results. While the current implementation always respects
+    the exact value of <result_page_size>, we reserve the right to return
+    slightly smaller or bigger pages in the future for performance reasons.
+
+
+9. Error codes
+
+  The supported error codes are described below:
+    0x0000    Server error: something unexpected happened. This indicates a
+              server-side bug.
+    0x000A    Protocol error: some client message triggered a protocol
+              violation (for instance a QUERY message is sent before a STARTUP
+              one has been sent)
+    0x0100    Bad credentials: the authentication request (AUTH_RESPONSE) failed
+              because Cassandra did not accept the provided credentials.
+
+    0x1000    Unavailable exception. The rest of the ERROR message body will be
+                <cl><required><alive>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <required> is an [int] representing the number of nodes that
+                           should be alive to respect <cl>
+                <alive> is an [int] representing the number of replicas that
+                        were known to be alive when the request has been
+                        processed (since an unavailable exception has been
+                        triggered, there will be <alive> < <required>)
+    0x1001    Overloaded: the request cannot be processed because the
+              coordinator node is overloaded
+    0x1002    Is_bootstrapping: the request was a read request but the
+              coordinator node is bootstrapping
+    0x1003    Truncate_error: error during a truncation operation.
+    0x1100    Write_timeout: Timeout exception during a write request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><writeType>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           acknowledged the request.
+                <blockfor> is the number of replicas whose acknowledgement is
+                           required to achieve <cl>.
+                <writeType> is a [string] that describes the type of the write
+                            that timed out. The value of that string can be one
+                            of:
+                             - "SIMPLE": the write was a non-batched
+                               non-counter write.
+                             - "BATCH": the write was a (logged) batch write.
+                               If this type is received, it means the batch log
+                               has been successfully written (otherwise a
+                               "BATCH_LOG" type would have been sent instead).
+                             - "UNLOGGED_BATCH": the write was an unlogged
+                               batch. No batch log write has been attempted.
+                             - "COUNTER": the write was a counter write
+                               (batched or not).
+                             - "BATCH_LOG": the timeout occurred during the
+                               write to the batch log when a (logged) batch
+                               write was requested.
+    0x1200    Read_timeout: Timeout exception during a read request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><data_present>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           answered the request.
+                <blockfor> is the number of replicas whose response is
+                           required to achieve <cl>. Please note that it is
+                           possible to have <received> >= <blockfor> if
+                           <data_present> is false, and also in the (unlikely)
+                           case where <cl> is achieved but the coordinator node
+                           timed out while waiting for read-repair
+                           acknowledgements.
+                <data_present> is a single byte. If its value is 0, it means
+                               the replica that was asked for data has not
+                               responded. Otherwise, the value is != 0.
+
+    0x2000    Syntax_error: The submitted query has a syntax error.
+    0x2100    Unauthorized: The logged-in user doesn't have the right to perform
+              the query.
+    0x2200    Invalid: The query is syntactically correct but invalid.
+    0x2300    Config_error: The query is invalid because of some configuration issue
+    0x2400    Already_exists: The query attempted to create a keyspace or a
+              table that already exists. The rest of the ERROR message
+              body will be <ks><table> where:
+                <ks> is a [string] representing either the keyspace that
+                     already exists, or the keyspace in which the table that
+                     already exists is.
+                <table> is a [string] representing the name of the table that
+                        already exists. If the query was attempting to create a
+                        keyspace, <table> will be present but will be the empty
+                        string.
+    0x2500    Unprepared: Can be thrown when a prepared statement is
+              executed if the provided prepared statement ID is not known by
+              this host. The rest of the ERROR message body will be [short
+              bytes] representing the unknown ID.
+
+10. Changes from v2
+  * stream id is now 2 bytes long (a [short] value), so the header is now 1 byte longer (9 bytes total).
+  * BATCH messages now have <flags> (like QUERY and EXECUTE) and a corresponding optional
+    <serial_consistency> parameters (see Section 4.1.7).
+  * User Defined Types and tuple types have been added to ResultSet metadata (see 4.2.5.2) and a
+    new section on the serialization format of UDT and tuple values has been added to the documentation
+    (Section 7).
+  * The serialization format for collections has changed (both the collection size and
+    the length of each element are now 4 bytes long). See Section 6.
+  * QUERY, EXECUTE and BATCH messages can now optionally provide the default timestamp for the query.
+    As this feature is optionally enabled by clients, implementing it is at the discretion of the
+    client.
+  * QUERY, EXECUTE and BATCH messages can now optionally provide the names for the values of the
+    query. As this feature is optionally enabled by clients, implementing it is at the discretion of the
+    client.
+  * The format of "Schema_change" results (Section 4.2.5.5) and "SCHEMA_CHANGE" events (Section 4.2.6)
+    has been modified, and now includes changes related to user types.
+
diff --git a/examples/client_only/conf/cassandra.yaml b/examples/client_only/conf/cassandra.yaml
index f372dbf..a6b3b43 100644
--- a/examples/client_only/conf/cassandra.yaml
+++ b/examples/client_only/conf/cassandra.yaml
@@ -49,11 +49,6 @@
 # cross-dc handoff tends to be slower
 max_hints_delivery_threads: 2
 
-# The following setting populates the page cache on memtable flush and compaction
-# WARNING: Enable this setting only when the whole node's data fits in memory.
-# Defaults to: false
-# populate_io_cache_on_flush: false
-
 # authentication backend, implementing IAuthenticator; used to identify users
 authenticator: org.apache.cassandra.auth.AllowAllAuthenticator
 
diff --git a/examples/hadoop_cql3_word_count/src/WordCountSetup.java b/examples/hadoop_cql3_word_count/src/WordCountSetup.java
index cffe272..e514d63 100644
--- a/examples/hadoop_cql3_word_count/src/WordCountSetup.java
+++ b/examples/hadoop_cql3_word_count/src/WordCountSetup.java
@@ -70,7 +70,7 @@
 
             client.execute_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE, ConsistencyLevel.ONE);
 
-	    String verifyQuery = "select count(*) from system.peers";
+            String verifyQuery = "select count(*) from system.peers";
             CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(verifyQuery), Compression.NONE, ConsistencyLevel.ONE);
 
             long magnitude = ByteBufferUtil.toLong(result.rows.get(0).columns.get(0).value);
diff --git a/examples/hadoop_word_count/src/WordCount.java b/examples/hadoop_word_count/src/WordCount.java
index 398a7cb..f6bca77 100644
--- a/examples/hadoop_word_count/src/WordCount.java
+++ b/examples/hadoop_word_count/src/WordCount.java
@@ -20,12 +20,13 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.Column;
 import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
 import org.apache.cassandra.hadoop.ConfigHelper;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -70,7 +71,7 @@
         System.exit(0);
     }
 
-    public static class TokenizerMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, Column>, Text, IntWritable>
+    public static class TokenizerMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, Cell>, Text, IntWritable>
     {
         private final static IntWritable one = new IntWritable(1);
         private Text word = new Text();
@@ -81,17 +82,17 @@
         {
         }
 
-        public void map(ByteBuffer key, SortedMap<ByteBuffer, Column> columns, Context context) throws IOException, InterruptedException
+        public void map(ByteBuffer key, SortedMap<ByteBuffer, Cell> columns, Context context) throws IOException, InterruptedException
         {
-            for (Column column : columns.values())
+            for (Cell cell : columns.values())
             {
-                String name  = ByteBufferUtil.string(column.name());
+                String name  = ByteBufferUtil.string(cell.name().toByteBuffer());
                 String value = null;
                 
                 if (name.contains("int"))
-                    value = String.valueOf(ByteBufferUtil.toInt(column.value()));
+                    value = String.valueOf(ByteBufferUtil.toInt(cell.value()));
                 else
-                    value = ByteBufferUtil.string(column.value());
+                    value = ByteBufferUtil.string(cell.value());
                                
                 logger.debug("read {}:{}={} from {}",
                              new Object[] {ByteBufferUtil.string(key), name, value, context.getInputSplit()});
diff --git a/examples/hadoop_word_count/src/WordCountCounters.java b/examples/hadoop_word_count/src/WordCountCounters.java
index 55d0889..39fb778 100644
--- a/examples/hadoop_word_count/src/WordCountCounters.java
+++ b/examples/hadoop_word_count/src/WordCountCounters.java
@@ -18,8 +18,10 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.SortedMap;
 
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.thrift.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -34,10 +36,8 @@
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
-import org.apache.cassandra.db.Column;
 import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
 import org.apache.cassandra.hadoop.ConfigHelper;
-import org.apache.cassandra.thrift.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /**
@@ -60,15 +60,15 @@
         System.exit(0);
     }
 
-    public static class SumMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, Column>, Text, LongWritable>
+    public static class SumMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, Cell>, Text, LongWritable>
     {
-        public void map(ByteBuffer key, SortedMap<ByteBuffer, Column> columns, Context context) throws IOException, InterruptedException
+        public void map(ByteBuffer key, SortedMap<ByteBuffer, Cell> columns, Context context) throws IOException, InterruptedException
         {
             long sum = 0;
-            for (Column column : columns.values())
+            for (Cell cell : columns.values())
             {
-                logger.debug("read " + key + ":" + column.name() + " from " + context.getInputSplit());
-                sum += ByteBufferUtil.toLong(column.value());
+                logger.debug("read " + key + ":" + cell.name() + " from " + context.getInputSplit());
+                sum += ByteBufferUtil.toLong(cell.value());
             }
             context.write(new Text(ByteBufferUtil.string(key)), new LongWritable(sum));
         }
diff --git a/examples/hadoop_word_count/src/WordCountSetup.java b/examples/hadoop_word_count/src/WordCountSetup.java
index 6dd2ba7..0ef5341 100644
--- a/examples/hadoop_word_count/src/WordCountSetup.java
+++ b/examples/hadoop_word_count/src/WordCountSetup.java
@@ -177,7 +177,7 @@
         ksDef.putToStrategy_options("replication_factor", "1");
         client.system_add_keyspace(ksDef);
 
-	int magnitude = getNumberOfHosts(client);
+        int magnitude = getNumberOfHosts(client);
         Uninterruptibles.sleepUninterruptibly(magnitude, TimeUnit.SECONDS);
     }
 
diff --git a/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java b/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java
index ae58b33..11e98b5 100644
--- a/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java
+++ b/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java
@@ -27,9 +27,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.io.util.FileUtils;
 
 public class InvertedIndex implements ITrigger
@@ -37,16 +37,16 @@
     private static final Logger logger = LoggerFactory.getLogger(InvertedIndex.class);
     private Properties properties = loadProperties();
 
-    public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+    public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
     {
-        List<RowMutation> mutations = new ArrayList<>();
+        List<Mutation> mutations = new ArrayList<>(update.getColumnCount());
 
-        for (Column cell : update)
+        for (Cell cell : update)
         {
             // Skip the row marker and other empty values, since they lead to an empty key.
             if (cell.value().remaining() > 0)
             {
-                RowMutation mutation = new RowMutation(properties.getProperty("keyspace"), cell.value());
+                Mutation mutation = new Mutation(properties.getProperty("keyspace"), cell.value());
                 mutation.add(properties.getProperty("columnfamily"), cell.name(), key, System.currentTimeMillis());
                 mutations.add(mutation);
             }
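
Note: the trigger example above switches from RowMutation to the renamed Mutation class. A minimal sketch of an equivalent ITrigger after this change (the real example reads the keyspace and table from InvertedIndex.properties; "keyspace1" and "inverted" below are placeholders):

    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.List;

    import org.apache.cassandra.db.Cell;
    import org.apache.cassandra.db.ColumnFamily;
    import org.apache.cassandra.db.Mutation;
    import org.apache.cassandra.triggers.ITrigger;

    public class SimpleInvertedIndex implements ITrigger
    {
        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
        {
            List<Mutation> mutations = new ArrayList<>(update.getColumnCount());
            for (Cell cell : update)
            {
                // Skip empty values (e.g. the CQL row marker), which would produce an empty key.
                if (cell.value().remaining() > 0)
                {
                    // Invert: the cell value becomes the partition key of the index row.
                    Mutation mutation = new Mutation("keyspace1", cell.value());
                    mutation.add("inverted", cell.name(), key, System.currentTimeMillis());
                    mutations.add(mutation);
                }
            }
            return mutations;
        }
    }
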
diff --git a/interface/cassandra.thrift b/interface/cassandra.thrift
index e469416..2484a5d 100644
--- a/interface/cassandra.thrift
+++ b/interface/cassandra.thrift
@@ -55,7 +55,7 @@
 # An effort should be made not to break forward-client-compatibility either
 # (e.g. one should avoid removing obsolete fields from the IDL), but no
 # guarantees in this respect are made by the Cassandra project.
-const string VERSION = "19.38.0"
+const string VERSION = "19.39.0"
 
 
 #
@@ -117,7 +117,6 @@
     4: optional CounterSuperColumn counter_super_column
 }
 
-
 #
 # Exceptions
 # (note that internal server errors will raise a TApplicationException, courtesy of Thrift)
@@ -459,7 +458,6 @@
     16: optional i32 id,
     17: optional i32 min_compaction_threshold,
     18: optional i32 max_compaction_threshold,
-    24: optional bool replicate_on_write,
     26: optional string key_validation_class,
     28: optional binary key_alias,
     29: optional string compaction_strategy,
@@ -468,12 +466,13 @@
     33: optional double bloom_filter_fp_chance,
     34: optional string caching="keys_only",
     37: optional double dclocal_read_repair_chance = 0.0,
-    38: optional bool populate_io_cache_on_flush,
     39: optional i32 memtable_flush_period_in_ms,
     40: optional i32 default_time_to_live,
-    41: optional i32 index_interval,
     42: optional string speculative_retry="NONE",
     43: optional list<TriggerDef> triggers,
+    44: optional string cells_per_row_to_cache = "100",
+    45: optional i32 min_index_interval,
+    46: optional i32 max_index_interval,
 
     /* All of the following are now ignored and unsupplied. */
 
@@ -492,11 +491,17 @@
     /** @deprecated */
     23: optional double memtable_operations_in_millions,
     /** @deprecated */
+    24: optional bool replicate_on_write,
+    /** @deprecated */
     25: optional double merge_shards_chance,
     /** @deprecated */
     27: optional string row_cache_provider,
     /** @deprecated */
     31: optional i32 row_cache_keys_to_save,
+    /** @deprecated */
+    38: optional bool populate_io_cache_on_flush,
+    /** @deprecated */
+    41: optional i32 index_interval,
 }
 
 /* describes a keyspace. */
@@ -565,6 +570,35 @@
     3: required i64 row_count
 }
 
+/** The ColumnSlice is used to select a set of columns from inside a row.
+ * If start or finish are unspecified, they will default to the start-of
+ * or end-of value, respectively.
+ * @param start. The start of the ColumnSlice, inclusive
+ * @param finish. The end of the ColumnSlice, inclusive
+ */
+struct ColumnSlice {
+    1: optional binary start,
+    2: optional binary finish
+}
+
+/**
+ * Used to perform multiple slices on a single row key in one RPC operation
+ * @param key. The row key to be multi-sliced
+ * @param column_parent. The column family (super columns are unsupported)
+ * @param column_slices. 0 to many ColumnSlice objects, each of which will be used to select columns
+ * @param reversed. Direction of the slice
+ * @param count. Maximum number of columns to return
+ * @param consistency_level. Consistency level to perform the operation at
+ */
+struct MultiSliceRequest {
+    1: optional binary key,
+    2: optional ColumnParent column_parent,
+    3: optional list<ColumnSlice> column_slices,
+    4: optional bool reversed=false,
+    5: optional i32 count=1000,
+    6: optional ConsistencyLevel consistency_level=ConsistencyLevel.ONE
+}
+
 service Cassandra {
   # auth methods
   void login(1: required AuthenticationRequest auth_request) throws (1:AuthenticationException authnx, 2:AuthorizationException authzx),
@@ -743,6 +777,11 @@
   void truncate(1:required string cfname)
        throws (1: InvalidRequestException ire, 2: UnavailableException ue, 3: TimedOutException te),
 
+  /**
+  * Select multiple slices of a key in a single RPC operation
+  */
+  list<ColumnOrSuperColumn> get_multi_slice(1:required MultiSliceRequest request)
+       throws (1:InvalidRequestException ire, 2:UnavailableException ue, 3:TimedOutException te),
 
     
   // Meta-APIs -- APIs to get information about the node or cluster,
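
Note: the new get_multi_slice call selects several column ranges from one row in a single RPC. A hedged sketch of client-side usage against the generated Thrift client follows; the keyspace, column family, row key, and slice bounds are placeholders, and the fluent setters are the standard Thrift-generated ones:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.cassandra.thrift.*;
    import org.apache.cassandra.utils.ByteBufferUtil;

    public class MultiSliceExample
    {
        // Assumes an already-connected, authenticated Cassandra.Client.
        public static List<ColumnOrSuperColumn> fetch(Cassandra.Client client) throws Exception
        {
            client.set_keyspace("ks");  // placeholder keyspace

            // Two disjoint column ranges from the same row, fetched in one call.
            ColumnSlice first = new ColumnSlice()
                .setStart(ByteBufferUtil.bytes("a"))
                .setFinish(ByteBufferUtil.bytes("c"));
            ColumnSlice second = new ColumnSlice()
                .setStart(ByteBufferUtil.bytes("x"))
                .setFinish(ByteBufferUtil.bytes("z"));

            MultiSliceRequest request = new MultiSliceRequest();
            request.setKey(ByteBufferUtil.bytes("rowkey"));    // placeholder row key
            request.setColumn_parent(new ColumnParent("cf"));  // placeholder column family
            request.setColumn_slices(Arrays.asList(first, second));
            request.setReversed(false);
            request.setCount(1000);
            request.setConsistency_level(ConsistencyLevel.ONE);

            return client.get_multi_slice(request);
        }
    }
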
diff --git a/interface/thrift/gen-java/org/apache/cassandra/thrift/Cassandra.java b/interface/thrift/gen-java/org/apache/cassandra/thrift/Cassandra.java
index 15b99fa..55f4734 100644
--- a/interface/thrift/gen-java/org/apache/cassandra/thrift/Cassandra.java
+++ b/interface/thrift/gen-java/org/apache/cassandra/thrift/Cassandra.java
@@ -248,6 +248,13 @@
     public void truncate(String cfname) throws InvalidRequestException, UnavailableException, TimedOutException, org.apache.thrift.TException;
 
     /**
+     * Select multiple slices of a key in a single RPC operation
+     * 
+     * @param request
+     */
+    public List<ColumnOrSuperColumn> get_multi_slice(MultiSliceRequest request) throws InvalidRequestException, UnavailableException, TimedOutException, org.apache.thrift.TException;
+
+    /**
      * for each schema version present in the cluster, returns a list of nodes at that version.
      * hosts that do not respond will be under the key DatabaseDescriptor.INITIAL_VERSION.
      * the cluster is all on the same version if the size of the map is 1.
@@ -480,6 +487,8 @@
 
     public void truncate(String cfname, org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException;
 
+    public void get_multi_slice(MultiSliceRequest request, org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException;
+
     public void describe_schema_versions(org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException;
 
     public void describe_keyspaces(org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException;
@@ -1138,6 +1147,38 @@
       return;
     }
 
+    public List<ColumnOrSuperColumn> get_multi_slice(MultiSliceRequest request) throws InvalidRequestException, UnavailableException, TimedOutException, org.apache.thrift.TException
+    {
+      send_get_multi_slice(request);
+      return recv_get_multi_slice();
+    }
+
+    public void send_get_multi_slice(MultiSliceRequest request) throws org.apache.thrift.TException
+    {
+      get_multi_slice_args args = new get_multi_slice_args();
+      args.setRequest(request);
+      sendBase("get_multi_slice", args);
+    }
+
+    public List<ColumnOrSuperColumn> recv_get_multi_slice() throws InvalidRequestException, UnavailableException, TimedOutException, org.apache.thrift.TException
+    {
+      get_multi_slice_result result = new get_multi_slice_result();
+      receiveBase(result, "get_multi_slice");
+      if (result.isSetSuccess()) {
+        return result.success;
+      }
+      if (result.ire != null) {
+        throw result.ire;
+      }
+      if (result.ue != null) {
+        throw result.ue;
+      }
+      if (result.te != null) {
+        throw result.te;
+      }
+      throw new org.apache.thrift.TApplicationException(org.apache.thrift.TApplicationException.MISSING_RESULT, "get_multi_slice failed: unknown result");
+    }
+
     public Map<String,List<String>> describe_schema_versions() throws InvalidRequestException, org.apache.thrift.TException
     {
       send_describe_schema_versions();
@@ -2576,6 +2617,38 @@
       }
     }
 
+    public void get_multi_slice(MultiSliceRequest request, org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException {
+      checkReady();
+      get_multi_slice_call method_call = new get_multi_slice_call(request, resultHandler, this, ___protocolFactory, ___transport);
+      this.___currentMethod = method_call;
+      ___manager.call(method_call);
+    }
+
+    public static class get_multi_slice_call extends org.apache.thrift.async.TAsyncMethodCall {
+      private MultiSliceRequest request;
+      public get_multi_slice_call(MultiSliceRequest request, org.apache.thrift.async.AsyncMethodCallback resultHandler, org.apache.thrift.async.TAsyncClient client, org.apache.thrift.protocol.TProtocolFactory protocolFactory, org.apache.thrift.transport.TNonblockingTransport transport) throws org.apache.thrift.TException {
+        super(client, protocolFactory, transport, resultHandler, false);
+        this.request = request;
+      }
+
+      public void write_args(org.apache.thrift.protocol.TProtocol prot) throws org.apache.thrift.TException {
+        prot.writeMessageBegin(new org.apache.thrift.protocol.TMessage("get_multi_slice", org.apache.thrift.protocol.TMessageType.CALL, 0));
+        get_multi_slice_args args = new get_multi_slice_args();
+        args.setRequest(request);
+        args.write(prot);
+        prot.writeMessageEnd();
+      }
+
+      public List<ColumnOrSuperColumn> getResult() throws InvalidRequestException, UnavailableException, TimedOutException, org.apache.thrift.TException {
+        if (getState() != org.apache.thrift.async.TAsyncMethodCall.State.RESPONSE_READ) {
+          throw new IllegalStateException("Method call not finished!");
+        }
+        org.apache.thrift.transport.TMemoryInputTransport memoryTransport = new org.apache.thrift.transport.TMemoryInputTransport(getFrameBuffer().array());
+        org.apache.thrift.protocol.TProtocol prot = client.getProtocolFactory().getProtocol(memoryTransport);
+        return (new Client(prot)).recv_get_multi_slice();
+      }
+    }
+
     public void describe_schema_versions(org.apache.thrift.async.AsyncMethodCallback resultHandler) throws org.apache.thrift.TException {
       checkReady();
       describe_schema_versions_call method_call = new describe_schema_versions_call(resultHandler, this, ___protocolFactory, ___transport);
@@ -3457,6 +3530,7 @@
       processMap.put("batch_mutate", new batch_mutate());
       processMap.put("atomic_batch_mutate", new atomic_batch_mutate());
       processMap.put("truncate", new truncate());
+      processMap.put("get_multi_slice", new get_multi_slice());
       processMap.put("describe_schema_versions", new describe_schema_versions());
       processMap.put("describe_keyspaces", new describe_keyspaces());
       processMap.put("describe_cluster_name", new describe_cluster_name());
@@ -3987,6 +4061,34 @@
       }
     }
 
+    public static class get_multi_slice<I extends Iface> extends org.apache.thrift.ProcessFunction<I, get_multi_slice_args> {
+      public get_multi_slice() {
+        super("get_multi_slice");
+      }
+
+      public get_multi_slice_args getEmptyArgsInstance() {
+        return new get_multi_slice_args();
+      }
+
+      protected boolean isOneway() {
+        return false;
+      }
+
+      public get_multi_slice_result getResult(I iface, get_multi_slice_args args) throws org.apache.thrift.TException {
+        get_multi_slice_result result = new get_multi_slice_result();
+        try {
+          result.success = iface.get_multi_slice(args.request);
+        } catch (InvalidRequestException ire) {
+          result.ire = ire;
+        } catch (UnavailableException ue) {
+          result.ue = ue;
+        } catch (TimedOutException te) {
+          result.te = te;
+        }
+        return result;
+      }
+    }
+
     public static class describe_schema_versions<I extends Iface> extends org.apache.thrift.ProcessFunction<I, describe_schema_versions_args> {
       public describe_schema_versions() {
         super("describe_schema_versions");
@@ -4660,6 +4762,7 @@
       processMap.put("batch_mutate", new batch_mutate());
       processMap.put("atomic_batch_mutate", new atomic_batch_mutate());
       processMap.put("truncate", new truncate());
+      processMap.put("get_multi_slice", new get_multi_slice());
       processMap.put("describe_schema_versions", new describe_schema_versions());
       processMap.put("describe_keyspaces", new describe_keyspaces());
       processMap.put("describe_cluster_name", new describe_cluster_name());
@@ -5877,6 +5980,73 @@
       }
     }
 
+    public static class get_multi_slice<I extends AsyncIface> extends org.apache.thrift.AsyncProcessFunction<I, get_multi_slice_args, List<ColumnOrSuperColumn>> {
+      public get_multi_slice() {
+        super("get_multi_slice");
+      }
+
+      public get_multi_slice_args getEmptyArgsInstance() {
+        return new get_multi_slice_args();
+      }
+
+      public AsyncMethodCallback<List<ColumnOrSuperColumn>> getResultHandler(final AsyncFrameBuffer fb, final int seqid) {
+        final org.apache.thrift.AsyncProcessFunction fcall = this;
+        return new AsyncMethodCallback<List<ColumnOrSuperColumn>>() { 
+          public void onComplete(List<ColumnOrSuperColumn> o) {
+            get_multi_slice_result result = new get_multi_slice_result();
+            result.success = o;
+            try {
+              fcall.sendResponse(fb,result, org.apache.thrift.protocol.TMessageType.REPLY,seqid);
+              return;
+            } catch (Exception e) {
+              LOGGER.error("Exception writing to internal frame buffer", e);
+            }
+            fb.close();
+          }
+          public void onError(Exception e) {
+            byte msgType = org.apache.thrift.protocol.TMessageType.REPLY;
+            org.apache.thrift.TBase msg;
+            get_multi_slice_result result = new get_multi_slice_result();
+            if (e instanceof InvalidRequestException) {
+                        result.ire = (InvalidRequestException) e;
+                        result.setIreIsSet(true);
+                        msg = result;
+            }
+            else             if (e instanceof UnavailableException) {
+                        result.ue = (UnavailableException) e;
+                        result.setUeIsSet(true);
+                        msg = result;
+            }
+            else             if (e instanceof TimedOutException) {
+                        result.te = (TimedOutException) e;
+                        result.setTeIsSet(true);
+                        msg = result;
+            }
+             else 
+            {
+              msgType = org.apache.thrift.protocol.TMessageType.EXCEPTION;
+              msg = (org.apache.thrift.TBase)new org.apache.thrift.TApplicationException(org.apache.thrift.TApplicationException.INTERNAL_ERROR, e.getMessage());
+            }
+            try {
+              fcall.sendResponse(fb,msg,msgType,seqid);
+              return;
+            } catch (Exception ex) {
+              LOGGER.error("Exception writing to internal frame buffer", ex);
+            }
+            fb.close();
+          }
+        };
+      }
+
+      protected boolean isOneway() {
+        return false;
+      }
+
+      public void start(I iface, get_multi_slice_args args, org.apache.thrift.async.AsyncMethodCallback<List<ColumnOrSuperColumn>> resultHandler) throws TException {
+        iface.get_multi_slice(args.request,resultHandler);
+      }
+    }
+
     public static class describe_schema_versions<I extends AsyncIface> extends org.apache.thrift.AsyncProcessFunction<I, describe_schema_versions_args, Map<String,List<String>>> {
       public describe_schema_versions() {
         super("describe_schema_versions");
@@ -11615,14 +11785,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list216 = iprot.readListBegin();
-                  struct.success = new ArrayList<ColumnOrSuperColumn>(_list216.size);
-                  for (int _i217 = 0; _i217 < _list216.size; ++_i217)
+                  org.apache.thrift.protocol.TList _list224 = iprot.readListBegin();
+                  struct.success = new ArrayList<ColumnOrSuperColumn>(_list224.size);
+                  for (int _i225 = 0; _i225 < _list224.size; ++_i225)
                   {
-                    ColumnOrSuperColumn _elem218;
-                    _elem218 = new ColumnOrSuperColumn();
-                    _elem218.read(iprot);
-                    struct.success.add(_elem218);
+                    ColumnOrSuperColumn _elem226;
+                    _elem226 = new ColumnOrSuperColumn();
+                    _elem226.read(iprot);
+                    struct.success.add(_elem226);
                   }
                   iprot.readListEnd();
                 }
@@ -11677,9 +11847,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (ColumnOrSuperColumn _iter219 : struct.success)
+            for (ColumnOrSuperColumn _iter227 : struct.success)
             {
-              _iter219.write(oprot);
+              _iter227.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -11734,9 +11904,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (ColumnOrSuperColumn _iter220 : struct.success)
+            for (ColumnOrSuperColumn _iter228 : struct.success)
             {
-              _iter220.write(oprot);
+              _iter228.write(oprot);
             }
           }
         }
@@ -11757,14 +11927,14 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list221 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<ColumnOrSuperColumn>(_list221.size);
-            for (int _i222 = 0; _i222 < _list221.size; ++_i222)
+            org.apache.thrift.protocol.TList _list229 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<ColumnOrSuperColumn>(_list229.size);
+            for (int _i230 = 0; _i230 < _list229.size; ++_i230)
             {
-              ColumnOrSuperColumn _elem223;
-              _elem223 = new ColumnOrSuperColumn();
-              _elem223.read(iprot);
-              struct.success.add(_elem223);
+              ColumnOrSuperColumn _elem231;
+              _elem231 = new ColumnOrSuperColumn();
+              _elem231.read(iprot);
+              struct.success.add(_elem231);
             }
           }
           struct.setSuccessIsSet(true);
@@ -13764,13 +13934,13 @@
             case 1: // KEYS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list224 = iprot.readListBegin();
-                  struct.keys = new ArrayList<ByteBuffer>(_list224.size);
-                  for (int _i225 = 0; _i225 < _list224.size; ++_i225)
+                  org.apache.thrift.protocol.TList _list232 = iprot.readListBegin();
+                  struct.keys = new ArrayList<ByteBuffer>(_list232.size);
+                  for (int _i233 = 0; _i233 < _list232.size; ++_i233)
                   {
-                    ByteBuffer _elem226;
-                    _elem226 = iprot.readBinary();
-                    struct.keys.add(_elem226);
+                    ByteBuffer _elem234;
+                    _elem234 = iprot.readBinary();
+                    struct.keys.add(_elem234);
                   }
                   iprot.readListEnd();
                 }
@@ -13824,9 +13994,9 @@
           oprot.writeFieldBegin(KEYS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.keys.size()));
-            for (ByteBuffer _iter227 : struct.keys)
+            for (ByteBuffer _iter235 : struct.keys)
             {
-              oprot.writeBinary(_iter227);
+              oprot.writeBinary(_iter235);
             }
             oprot.writeListEnd();
           }
@@ -13866,9 +14036,9 @@
         TTupleProtocol oprot = (TTupleProtocol) prot;
         {
           oprot.writeI32(struct.keys.size());
-          for (ByteBuffer _iter228 : struct.keys)
+          for (ByteBuffer _iter236 : struct.keys)
           {
-            oprot.writeBinary(_iter228);
+            oprot.writeBinary(_iter236);
           }
         }
         struct.column_parent.write(oprot);
@@ -13880,13 +14050,13 @@
       public void read(org.apache.thrift.protocol.TProtocol prot, multiget_slice_args struct) throws org.apache.thrift.TException {
         TTupleProtocol iprot = (TTupleProtocol) prot;
         {
-          org.apache.thrift.protocol.TList _list229 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-          struct.keys = new ArrayList<ByteBuffer>(_list229.size);
-          for (int _i230 = 0; _i230 < _list229.size; ++_i230)
+          org.apache.thrift.protocol.TList _list237 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+          struct.keys = new ArrayList<ByteBuffer>(_list237.size);
+          for (int _i238 = 0; _i238 < _list237.size; ++_i238)
           {
-            ByteBuffer _elem231;
-            _elem231 = iprot.readBinary();
-            struct.keys.add(_elem231);
+            ByteBuffer _elem239;
+            _elem239 = iprot.readBinary();
+            struct.keys.add(_elem239);
           }
         }
         struct.setKeysIsSet(true);
@@ -14475,26 +14645,26 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map232 = iprot.readMapBegin();
-                  struct.success = new HashMap<ByteBuffer,List<ColumnOrSuperColumn>>(2*_map232.size);
-                  for (int _i233 = 0; _i233 < _map232.size; ++_i233)
+                  org.apache.thrift.protocol.TMap _map240 = iprot.readMapBegin();
+                  struct.success = new HashMap<ByteBuffer,List<ColumnOrSuperColumn>>(2*_map240.size);
+                  for (int _i241 = 0; _i241 < _map240.size; ++_i241)
                   {
-                    ByteBuffer _key234;
-                    List<ColumnOrSuperColumn> _val235;
-                    _key234 = iprot.readBinary();
+                    ByteBuffer _key242;
+                    List<ColumnOrSuperColumn> _val243;
+                    _key242 = iprot.readBinary();
                     {
-                      org.apache.thrift.protocol.TList _list236 = iprot.readListBegin();
-                      _val235 = new ArrayList<ColumnOrSuperColumn>(_list236.size);
-                      for (int _i237 = 0; _i237 < _list236.size; ++_i237)
+                      org.apache.thrift.protocol.TList _list244 = iprot.readListBegin();
+                      _val243 = new ArrayList<ColumnOrSuperColumn>(_list244.size);
+                      for (int _i245 = 0; _i245 < _list244.size; ++_i245)
                       {
-                        ColumnOrSuperColumn _elem238;
-                        _elem238 = new ColumnOrSuperColumn();
-                        _elem238.read(iprot);
-                        _val235.add(_elem238);
+                        ColumnOrSuperColumn _elem246;
+                        _elem246 = new ColumnOrSuperColumn();
+                        _elem246.read(iprot);
+                        _val243.add(_elem246);
                       }
                       iprot.readListEnd();
                     }
-                    struct.success.put(_key234, _val235);
+                    struct.success.put(_key242, _val243);
                   }
                   iprot.readMapEnd();
                 }
@@ -14549,14 +14719,14 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, struct.success.size()));
-            for (Map.Entry<ByteBuffer, List<ColumnOrSuperColumn>> _iter239 : struct.success.entrySet())
+            for (Map.Entry<ByteBuffer, List<ColumnOrSuperColumn>> _iter247 : struct.success.entrySet())
             {
-              oprot.writeBinary(_iter239.getKey());
+              oprot.writeBinary(_iter247.getKey());
               {
-                oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter239.getValue().size()));
-                for (ColumnOrSuperColumn _iter240 : _iter239.getValue())
+                oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter247.getValue().size()));
+                for (ColumnOrSuperColumn _iter248 : _iter247.getValue())
                 {
-                  _iter240.write(oprot);
+                  _iter248.write(oprot);
                 }
                 oprot.writeListEnd();
               }
@@ -14614,14 +14784,14 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (Map.Entry<ByteBuffer, List<ColumnOrSuperColumn>> _iter241 : struct.success.entrySet())
+            for (Map.Entry<ByteBuffer, List<ColumnOrSuperColumn>> _iter249 : struct.success.entrySet())
             {
-              oprot.writeBinary(_iter241.getKey());
+              oprot.writeBinary(_iter249.getKey());
               {
-                oprot.writeI32(_iter241.getValue().size());
-                for (ColumnOrSuperColumn _iter242 : _iter241.getValue())
+                oprot.writeI32(_iter249.getValue().size());
+                for (ColumnOrSuperColumn _iter250 : _iter249.getValue())
                 {
-                  _iter242.write(oprot);
+                  _iter250.write(oprot);
                 }
               }
             }
@@ -14644,25 +14814,25 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TMap _map243 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
-            struct.success = new HashMap<ByteBuffer,List<ColumnOrSuperColumn>>(2*_map243.size);
-            for (int _i244 = 0; _i244 < _map243.size; ++_i244)
+            org.apache.thrift.protocol.TMap _map251 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
+            struct.success = new HashMap<ByteBuffer,List<ColumnOrSuperColumn>>(2*_map251.size);
+            for (int _i252 = 0; _i252 < _map251.size; ++_i252)
             {
-              ByteBuffer _key245;
-              List<ColumnOrSuperColumn> _val246;
-              _key245 = iprot.readBinary();
+              ByteBuffer _key253;
+              List<ColumnOrSuperColumn> _val254;
+              _key253 = iprot.readBinary();
               {
-                org.apache.thrift.protocol.TList _list247 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-                _val246 = new ArrayList<ColumnOrSuperColumn>(_list247.size);
-                for (int _i248 = 0; _i248 < _list247.size; ++_i248)
+                org.apache.thrift.protocol.TList _list255 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+                _val254 = new ArrayList<ColumnOrSuperColumn>(_list255.size);
+                for (int _i256 = 0; _i256 < _list255.size; ++_i256)
                 {
-                  ColumnOrSuperColumn _elem249;
-                  _elem249 = new ColumnOrSuperColumn();
-                  _elem249.read(iprot);
-                  _val246.add(_elem249);
+                  ColumnOrSuperColumn _elem257;
+                  _elem257 = new ColumnOrSuperColumn();
+                  _elem257.read(iprot);
+                  _val254.add(_elem257);
                 }
               }
-              struct.success.put(_key245, _val246);
+              struct.success.put(_key253, _val254);
             }
           }
           struct.setSuccessIsSet(true);
@@ -15283,13 +15453,13 @@
             case 1: // KEYS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list250 = iprot.readListBegin();
-                  struct.keys = new ArrayList<ByteBuffer>(_list250.size);
-                  for (int _i251 = 0; _i251 < _list250.size; ++_i251)
+                  org.apache.thrift.protocol.TList _list258 = iprot.readListBegin();
+                  struct.keys = new ArrayList<ByteBuffer>(_list258.size);
+                  for (int _i259 = 0; _i259 < _list258.size; ++_i259)
                   {
-                    ByteBuffer _elem252;
-                    _elem252 = iprot.readBinary();
-                    struct.keys.add(_elem252);
+                    ByteBuffer _elem260;
+                    _elem260 = iprot.readBinary();
+                    struct.keys.add(_elem260);
                   }
                   iprot.readListEnd();
                 }
@@ -15343,9 +15513,9 @@
           oprot.writeFieldBegin(KEYS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.keys.size()));
-            for (ByteBuffer _iter253 : struct.keys)
+            for (ByteBuffer _iter261 : struct.keys)
             {
-              oprot.writeBinary(_iter253);
+              oprot.writeBinary(_iter261);
             }
             oprot.writeListEnd();
           }
@@ -15385,9 +15555,9 @@
         TTupleProtocol oprot = (TTupleProtocol) prot;
         {
           oprot.writeI32(struct.keys.size());
-          for (ByteBuffer _iter254 : struct.keys)
+          for (ByteBuffer _iter262 : struct.keys)
           {
-            oprot.writeBinary(_iter254);
+            oprot.writeBinary(_iter262);
           }
         }
         struct.column_parent.write(oprot);
@@ -15399,13 +15569,13 @@
       public void read(org.apache.thrift.protocol.TProtocol prot, multiget_count_args struct) throws org.apache.thrift.TException {
         TTupleProtocol iprot = (TTupleProtocol) prot;
         {
-          org.apache.thrift.protocol.TList _list255 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-          struct.keys = new ArrayList<ByteBuffer>(_list255.size);
-          for (int _i256 = 0; _i256 < _list255.size; ++_i256)
+          org.apache.thrift.protocol.TList _list263 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+          struct.keys = new ArrayList<ByteBuffer>(_list263.size);
+          for (int _i264 = 0; _i264 < _list263.size; ++_i264)
           {
-            ByteBuffer _elem257;
-            _elem257 = iprot.readBinary();
-            struct.keys.add(_elem257);
+            ByteBuffer _elem265;
+            _elem265 = iprot.readBinary();
+            struct.keys.add(_elem265);
           }
         }
         struct.setKeysIsSet(true);
@@ -15978,15 +16148,15 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map258 = iprot.readMapBegin();
-                  struct.success = new HashMap<ByteBuffer,Integer>(2*_map258.size);
-                  for (int _i259 = 0; _i259 < _map258.size; ++_i259)
+                  org.apache.thrift.protocol.TMap _map266 = iprot.readMapBegin();
+                  struct.success = new HashMap<ByteBuffer,Integer>(2*_map266.size);
+                  for (int _i267 = 0; _i267 < _map266.size; ++_i267)
                   {
-                    ByteBuffer _key260;
-                    int _val261;
-                    _key260 = iprot.readBinary();
-                    _val261 = iprot.readI32();
-                    struct.success.put(_key260, _val261);
+                    ByteBuffer _key268;
+                    int _val269;
+                    _key268 = iprot.readBinary();
+                    _val269 = iprot.readI32();
+                    struct.success.put(_key268, _val269);
                   }
                   iprot.readMapEnd();
                 }
@@ -16041,10 +16211,10 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.I32, struct.success.size()));
-            for (Map.Entry<ByteBuffer, Integer> _iter262 : struct.success.entrySet())
+            for (Map.Entry<ByteBuffer, Integer> _iter270 : struct.success.entrySet())
             {
-              oprot.writeBinary(_iter262.getKey());
-              oprot.writeI32(_iter262.getValue());
+              oprot.writeBinary(_iter270.getKey());
+              oprot.writeI32(_iter270.getValue());
             }
             oprot.writeMapEnd();
           }
@@ -16099,10 +16269,10 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (Map.Entry<ByteBuffer, Integer> _iter263 : struct.success.entrySet())
+            for (Map.Entry<ByteBuffer, Integer> _iter271 : struct.success.entrySet())
             {
-              oprot.writeBinary(_iter263.getKey());
-              oprot.writeI32(_iter263.getValue());
+              oprot.writeBinary(_iter271.getKey());
+              oprot.writeI32(_iter271.getValue());
             }
           }
         }
@@ -16123,15 +16293,15 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TMap _map264 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.I32, iprot.readI32());
-            struct.success = new HashMap<ByteBuffer,Integer>(2*_map264.size);
-            for (int _i265 = 0; _i265 < _map264.size; ++_i265)
+            org.apache.thrift.protocol.TMap _map272 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.I32, iprot.readI32());
+            struct.success = new HashMap<ByteBuffer,Integer>(2*_map272.size);
+            for (int _i273 = 0; _i273 < _map272.size; ++_i273)
             {
-              ByteBuffer _key266;
-              int _val267;
-              _key266 = iprot.readBinary();
-              _val267 = iprot.readI32();
-              struct.success.put(_key266, _val267);
+              ByteBuffer _key274;
+              int _val275;
+              _key274 = iprot.readBinary();
+              _val275 = iprot.readI32();
+              struct.success.put(_key274, _val275);
             }
           }
           struct.setSuccessIsSet(true);
@@ -17409,14 +17579,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list268 = iprot.readListBegin();
-                  struct.success = new ArrayList<KeySlice>(_list268.size);
-                  for (int _i269 = 0; _i269 < _list268.size; ++_i269)
+                  org.apache.thrift.protocol.TList _list276 = iprot.readListBegin();
+                  struct.success = new ArrayList<KeySlice>(_list276.size);
+                  for (int _i277 = 0; _i277 < _list276.size; ++_i277)
                   {
-                    KeySlice _elem270;
-                    _elem270 = new KeySlice();
-                    _elem270.read(iprot);
-                    struct.success.add(_elem270);
+                    KeySlice _elem278;
+                    _elem278 = new KeySlice();
+                    _elem278.read(iprot);
+                    struct.success.add(_elem278);
                   }
                   iprot.readListEnd();
                 }
@@ -17471,9 +17641,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (KeySlice _iter271 : struct.success)
+            for (KeySlice _iter279 : struct.success)
             {
-              _iter271.write(oprot);
+              _iter279.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -17528,9 +17698,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (KeySlice _iter272 : struct.success)
+            for (KeySlice _iter280 : struct.success)
             {
-              _iter272.write(oprot);
+              _iter280.write(oprot);
             }
           }
         }
@@ -17551,14 +17721,14 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list273 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<KeySlice>(_list273.size);
-            for (int _i274 = 0; _i274 < _list273.size; ++_i274)
+            org.apache.thrift.protocol.TList _list281 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<KeySlice>(_list281.size);
+            for (int _i282 = 0; _i282 < _list281.size; ++_i282)
             {
-              KeySlice _elem275;
-              _elem275 = new KeySlice();
-              _elem275.read(iprot);
-              struct.success.add(_elem275);
+              KeySlice _elem283;
+              _elem283 = new KeySlice();
+              _elem283.read(iprot);
+              struct.success.add(_elem283);
             }
           }
           struct.setSuccessIsSet(true);
@@ -18837,14 +19007,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list276 = iprot.readListBegin();
-                  struct.success = new ArrayList<KeySlice>(_list276.size);
-                  for (int _i277 = 0; _i277 < _list276.size; ++_i277)
+                  org.apache.thrift.protocol.TList _list284 = iprot.readListBegin();
+                  struct.success = new ArrayList<KeySlice>(_list284.size);
+                  for (int _i285 = 0; _i285 < _list284.size; ++_i285)
                   {
-                    KeySlice _elem278;
-                    _elem278 = new KeySlice();
-                    _elem278.read(iprot);
-                    struct.success.add(_elem278);
+                    KeySlice _elem286;
+                    _elem286 = new KeySlice();
+                    _elem286.read(iprot);
+                    struct.success.add(_elem286);
                   }
                   iprot.readListEnd();
                 }
@@ -18899,9 +19069,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (KeySlice _iter279 : struct.success)
+            for (KeySlice _iter287 : struct.success)
             {
-              _iter279.write(oprot);
+              _iter287.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -18956,9 +19126,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (KeySlice _iter280 : struct.success)
+            for (KeySlice _iter288 : struct.success)
             {
-              _iter280.write(oprot);
+              _iter288.write(oprot);
             }
           }
         }
@@ -18979,14 +19149,14 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list281 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<KeySlice>(_list281.size);
-            for (int _i282 = 0; _i282 < _list281.size; ++_i282)
+            org.apache.thrift.protocol.TList _list289 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<KeySlice>(_list289.size);
+            for (int _i290 = 0; _i290 < _list289.size; ++_i290)
             {
-              KeySlice _elem283;
-              _elem283 = new KeySlice();
-              _elem283.read(iprot);
-              struct.success.add(_elem283);
+              KeySlice _elem291;
+              _elem291 = new KeySlice();
+              _elem291.read(iprot);
+              struct.success.add(_elem291);
             }
           }
           struct.setSuccessIsSet(true);
@@ -20264,14 +20434,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list284 = iprot.readListBegin();
-                  struct.success = new ArrayList<KeySlice>(_list284.size);
-                  for (int _i285 = 0; _i285 < _list284.size; ++_i285)
+                  org.apache.thrift.protocol.TList _list292 = iprot.readListBegin();
+                  struct.success = new ArrayList<KeySlice>(_list292.size);
+                  for (int _i293 = 0; _i293 < _list292.size; ++_i293)
                   {
-                    KeySlice _elem286;
-                    _elem286 = new KeySlice();
-                    _elem286.read(iprot);
-                    struct.success.add(_elem286);
+                    KeySlice _elem294;
+                    _elem294 = new KeySlice();
+                    _elem294.read(iprot);
+                    struct.success.add(_elem294);
                   }
                   iprot.readListEnd();
                 }
@@ -20326,9 +20496,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (KeySlice _iter287 : struct.success)
+            for (KeySlice _iter295 : struct.success)
             {
-              _iter287.write(oprot);
+              _iter295.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -20383,9 +20553,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (KeySlice _iter288 : struct.success)
+            for (KeySlice _iter296 : struct.success)
             {
-              _iter288.write(oprot);
+              _iter296.write(oprot);
             }
           }
         }
@@ -20406,14 +20576,14 @@
         BitSet incoming = iprot.readBitSet(4);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list289 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<KeySlice>(_list289.size);
-            for (int _i290 = 0; _i290 < _list289.size; ++_i290)
+            org.apache.thrift.protocol.TList _list297 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<KeySlice>(_list297.size);
+            for (int _i298 = 0; _i298 < _list297.size; ++_i298)
             {
-              KeySlice _elem291;
-              _elem291 = new KeySlice();
-              _elem291.read(iprot);
-              struct.success.add(_elem291);
+              KeySlice _elem299;
+              _elem299 = new KeySlice();
+              _elem299.read(iprot);
+              struct.success.add(_elem299);
             }
           }
           struct.setSuccessIsSet(true);
@@ -23809,14 +23979,14 @@
             case 3: // EXPECTED
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list292 = iprot.readListBegin();
-                  struct.expected = new ArrayList<Column>(_list292.size);
-                  for (int _i293 = 0; _i293 < _list292.size; ++_i293)
+                  org.apache.thrift.protocol.TList _list300 = iprot.readListBegin();
+                  struct.expected = new ArrayList<Column>(_list300.size);
+                  for (int _i301 = 0; _i301 < _list300.size; ++_i301)
                   {
-                    Column _elem294;
-                    _elem294 = new Column();
-                    _elem294.read(iprot);
-                    struct.expected.add(_elem294);
+                    Column _elem302;
+                    _elem302 = new Column();
+                    _elem302.read(iprot);
+                    struct.expected.add(_elem302);
                   }
                   iprot.readListEnd();
                 }
@@ -23828,14 +23998,14 @@
             case 4: // UPDATES
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list295 = iprot.readListBegin();
-                  struct.updates = new ArrayList<Column>(_list295.size);
-                  for (int _i296 = 0; _i296 < _list295.size; ++_i296)
+                  org.apache.thrift.protocol.TList _list303 = iprot.readListBegin();
+                  struct.updates = new ArrayList<Column>(_list303.size);
+                  for (int _i304 = 0; _i304 < _list303.size; ++_i304)
                   {
-                    Column _elem297;
-                    _elem297 = new Column();
-                    _elem297.read(iprot);
-                    struct.updates.add(_elem297);
+                    Column _elem305;
+                    _elem305 = new Column();
+                    _elem305.read(iprot);
+                    struct.updates.add(_elem305);
                   }
                   iprot.readListEnd();
                 }
@@ -23889,9 +24059,9 @@
           oprot.writeFieldBegin(EXPECTED_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.expected.size()));
-            for (Column _iter298 : struct.expected)
+            for (Column _iter306 : struct.expected)
             {
-              _iter298.write(oprot);
+              _iter306.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -23901,9 +24071,9 @@
           oprot.writeFieldBegin(UPDATES_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.updates.size()));
-            for (Column _iter299 : struct.updates)
+            for (Column _iter307 : struct.updates)
             {
-              _iter299.write(oprot);
+              _iter307.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -23951,18 +24121,18 @@
         if (struct.isSetExpected()) {
           {
             oprot.writeI32(struct.expected.size());
-            for (Column _iter300 : struct.expected)
+            for (Column _iter308 : struct.expected)
             {
-              _iter300.write(oprot);
+              _iter308.write(oprot);
             }
           }
         }
         if (struct.isSetUpdates()) {
           {
             oprot.writeI32(struct.updates.size());
-            for (Column _iter301 : struct.updates)
+            for (Column _iter309 : struct.updates)
             {
-              _iter301.write(oprot);
+              _iter309.write(oprot);
             }
           }
         }
@@ -23982,28 +24152,28 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list302 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.expected = new ArrayList<Column>(_list302.size);
-            for (int _i303 = 0; _i303 < _list302.size; ++_i303)
+            org.apache.thrift.protocol.TList _list310 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.expected = new ArrayList<Column>(_list310.size);
+            for (int _i311 = 0; _i311 < _list310.size; ++_i311)
             {
-              Column _elem304;
-              _elem304 = new Column();
-              _elem304.read(iprot);
-              struct.expected.add(_elem304);
+              Column _elem312;
+              _elem312 = new Column();
+              _elem312.read(iprot);
+              struct.expected.add(_elem312);
             }
           }
           struct.setExpectedIsSet(true);
         }
         if (incoming.get(1)) {
           {
-            org.apache.thrift.protocol.TList _list305 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.updates = new ArrayList<Column>(_list305.size);
-            for (int _i306 = 0; _i306 < _list305.size; ++_i306)
+            org.apache.thrift.protocol.TList _list313 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.updates = new ArrayList<Column>(_list313.size);
+            for (int _i314 = 0; _i314 < _list313.size; ++_i314)
             {
-              Column _elem307;
-              _elem307 = new Column();
-              _elem307.read(iprot);
-              struct.updates.add(_elem307);
+              Column _elem315;
+              _elem315 = new Column();
+              _elem315.read(iprot);
+              struct.updates.add(_elem315);
             }
           }
           struct.setUpdatesIsSet(true);
@@ -27589,38 +27759,38 @@
             case 1: // MUTATION_MAP
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map308 = iprot.readMapBegin();
-                  struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map308.size);
-                  for (int _i309 = 0; _i309 < _map308.size; ++_i309)
+                  org.apache.thrift.protocol.TMap _map316 = iprot.readMapBegin();
+                  struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map316.size);
+                  for (int _i317 = 0; _i317 < _map316.size; ++_i317)
                   {
-                    ByteBuffer _key310;
-                    Map<String,List<Mutation>> _val311;
-                    _key310 = iprot.readBinary();
+                    ByteBuffer _key318;
+                    Map<String,List<Mutation>> _val319;
+                    _key318 = iprot.readBinary();
                     {
-                      org.apache.thrift.protocol.TMap _map312 = iprot.readMapBegin();
-                      _val311 = new HashMap<String,List<Mutation>>(2*_map312.size);
-                      for (int _i313 = 0; _i313 < _map312.size; ++_i313)
+                      org.apache.thrift.protocol.TMap _map320 = iprot.readMapBegin();
+                      _val319 = new HashMap<String,List<Mutation>>(2*_map320.size);
+                      for (int _i321 = 0; _i321 < _map320.size; ++_i321)
                       {
-                        String _key314;
-                        List<Mutation> _val315;
-                        _key314 = iprot.readString();
+                        String _key322;
+                        List<Mutation> _val323;
+                        _key322 = iprot.readString();
                         {
-                          org.apache.thrift.protocol.TList _list316 = iprot.readListBegin();
-                          _val315 = new ArrayList<Mutation>(_list316.size);
-                          for (int _i317 = 0; _i317 < _list316.size; ++_i317)
+                          org.apache.thrift.protocol.TList _list324 = iprot.readListBegin();
+                          _val323 = new ArrayList<Mutation>(_list324.size);
+                          for (int _i325 = 0; _i325 < _list324.size; ++_i325)
                           {
-                            Mutation _elem318;
-                            _elem318 = new Mutation();
-                            _elem318.read(iprot);
-                            _val315.add(_elem318);
+                            Mutation _elem326;
+                            _elem326 = new Mutation();
+                            _elem326.read(iprot);
+                            _val323.add(_elem326);
                           }
                           iprot.readListEnd();
                         }
-                        _val311.put(_key314, _val315);
+                        _val319.put(_key322, _val323);
                       }
                       iprot.readMapEnd();
                     }
-                    struct.mutation_map.put(_key310, _val311);
+                    struct.mutation_map.put(_key318, _val319);
                   }
                   iprot.readMapEnd();
                 }
@@ -27656,19 +27826,19 @@
           oprot.writeFieldBegin(MUTATION_MAP_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, struct.mutation_map.size()));
-            for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter319 : struct.mutation_map.entrySet())
+            for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter327 : struct.mutation_map.entrySet())
             {
-              oprot.writeBinary(_iter319.getKey());
+              oprot.writeBinary(_iter327.getKey());
               {
-                oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, _iter319.getValue().size()));
-                for (Map.Entry<String, List<Mutation>> _iter320 : _iter319.getValue().entrySet())
+                oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, _iter327.getValue().size()));
+                for (Map.Entry<String, List<Mutation>> _iter328 : _iter327.getValue().entrySet())
                 {
-                  oprot.writeString(_iter320.getKey());
+                  oprot.writeString(_iter328.getKey());
                   {
-                    oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter320.getValue().size()));
-                    for (Mutation _iter321 : _iter320.getValue())
+                    oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter328.getValue().size()));
+                    for (Mutation _iter329 : _iter328.getValue())
                     {
-                      _iter321.write(oprot);
+                      _iter329.write(oprot);
                     }
                     oprot.writeListEnd();
                   }
@@ -27704,19 +27874,19 @@
         TTupleProtocol oprot = (TTupleProtocol) prot;
         {
           oprot.writeI32(struct.mutation_map.size());
-          for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter322 : struct.mutation_map.entrySet())
+          for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter330 : struct.mutation_map.entrySet())
           {
-            oprot.writeBinary(_iter322.getKey());
+            oprot.writeBinary(_iter330.getKey());
             {
-              oprot.writeI32(_iter322.getValue().size());
-              for (Map.Entry<String, List<Mutation>> _iter323 : _iter322.getValue().entrySet())
+              oprot.writeI32(_iter330.getValue().size());
+              for (Map.Entry<String, List<Mutation>> _iter331 : _iter330.getValue().entrySet())
               {
-                oprot.writeString(_iter323.getKey());
+                oprot.writeString(_iter331.getKey());
                 {
-                  oprot.writeI32(_iter323.getValue().size());
-                  for (Mutation _iter324 : _iter323.getValue())
+                  oprot.writeI32(_iter331.getValue().size());
+                  for (Mutation _iter332 : _iter331.getValue())
                   {
-                    _iter324.write(oprot);
+                    _iter332.write(oprot);
                   }
                 }
               }
@@ -27730,36 +27900,36 @@
       public void read(org.apache.thrift.protocol.TProtocol prot, batch_mutate_args struct) throws org.apache.thrift.TException {
         TTupleProtocol iprot = (TTupleProtocol) prot;
         {
-          org.apache.thrift.protocol.TMap _map325 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, iprot.readI32());
-          struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map325.size);
-          for (int _i326 = 0; _i326 < _map325.size; ++_i326)
+          org.apache.thrift.protocol.TMap _map333 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, iprot.readI32());
+          struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map333.size);
+          for (int _i334 = 0; _i334 < _map333.size; ++_i334)
           {
-            ByteBuffer _key327;
-            Map<String,List<Mutation>> _val328;
-            _key327 = iprot.readBinary();
+            ByteBuffer _key335;
+            Map<String,List<Mutation>> _val336;
+            _key335 = iprot.readBinary();
             {
-              org.apache.thrift.protocol.TMap _map329 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
-              _val328 = new HashMap<String,List<Mutation>>(2*_map329.size);
-              for (int _i330 = 0; _i330 < _map329.size; ++_i330)
+              org.apache.thrift.protocol.TMap _map337 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
+              _val336 = new HashMap<String,List<Mutation>>(2*_map337.size);
+              for (int _i338 = 0; _i338 < _map337.size; ++_i338)
               {
-                String _key331;
-                List<Mutation> _val332;
-                _key331 = iprot.readString();
+                String _key339;
+                List<Mutation> _val340;
+                _key339 = iprot.readString();
                 {
-                  org.apache.thrift.protocol.TList _list333 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-                  _val332 = new ArrayList<Mutation>(_list333.size);
-                  for (int _i334 = 0; _i334 < _list333.size; ++_i334)
+                  org.apache.thrift.protocol.TList _list341 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+                  _val340 = new ArrayList<Mutation>(_list341.size);
+                  for (int _i342 = 0; _i342 < _list341.size; ++_i342)
                   {
-                    Mutation _elem335;
-                    _elem335 = new Mutation();
-                    _elem335.read(iprot);
-                    _val332.add(_elem335);
+                    Mutation _elem343;
+                    _elem343 = new Mutation();
+                    _elem343.read(iprot);
+                    _val340.add(_elem343);
                   }
                 }
-                _val328.put(_key331, _val332);
+                _val336.put(_key339, _val340);
               }
             }
-            struct.mutation_map.put(_key327, _val328);
+            struct.mutation_map.put(_key335, _val336);
           }
         }
         struct.setMutation_mapIsSet(true);
@@ -28793,38 +28963,38 @@
             case 1: // MUTATION_MAP
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map336 = iprot.readMapBegin();
-                  struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map336.size);
-                  for (int _i337 = 0; _i337 < _map336.size; ++_i337)
+                  org.apache.thrift.protocol.TMap _map344 = iprot.readMapBegin();
+                  struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map344.size);
+                  for (int _i345 = 0; _i345 < _map344.size; ++_i345)
                   {
-                    ByteBuffer _key338;
-                    Map<String,List<Mutation>> _val339;
-                    _key338 = iprot.readBinary();
+                    ByteBuffer _key346;
+                    Map<String,List<Mutation>> _val347;
+                    _key346 = iprot.readBinary();
                     {
-                      org.apache.thrift.protocol.TMap _map340 = iprot.readMapBegin();
-                      _val339 = new HashMap<String,List<Mutation>>(2*_map340.size);
-                      for (int _i341 = 0; _i341 < _map340.size; ++_i341)
+                      org.apache.thrift.protocol.TMap _map348 = iprot.readMapBegin();
+                      _val347 = new HashMap<String,List<Mutation>>(2*_map348.size);
+                      for (int _i349 = 0; _i349 < _map348.size; ++_i349)
                       {
-                        String _key342;
-                        List<Mutation> _val343;
-                        _key342 = iprot.readString();
+                        String _key350;
+                        List<Mutation> _val351;
+                        _key350 = iprot.readString();
                         {
-                          org.apache.thrift.protocol.TList _list344 = iprot.readListBegin();
-                          _val343 = new ArrayList<Mutation>(_list344.size);
-                          for (int _i345 = 0; _i345 < _list344.size; ++_i345)
+                          org.apache.thrift.protocol.TList _list352 = iprot.readListBegin();
+                          _val351 = new ArrayList<Mutation>(_list352.size);
+                          for (int _i353 = 0; _i353 < _list352.size; ++_i353)
                           {
-                            Mutation _elem346;
-                            _elem346 = new Mutation();
-                            _elem346.read(iprot);
-                            _val343.add(_elem346);
+                            Mutation _elem354;
+                            _elem354 = new Mutation();
+                            _elem354.read(iprot);
+                            _val351.add(_elem354);
                           }
                           iprot.readListEnd();
                         }
-                        _val339.put(_key342, _val343);
+                        _val347.put(_key350, _val351);
                       }
                       iprot.readMapEnd();
                     }
-                    struct.mutation_map.put(_key338, _val339);
+                    struct.mutation_map.put(_key346, _val347);
                   }
                   iprot.readMapEnd();
                 }
@@ -28860,19 +29030,19 @@
           oprot.writeFieldBegin(MUTATION_MAP_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, struct.mutation_map.size()));
-            for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter347 : struct.mutation_map.entrySet())
+            for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter355 : struct.mutation_map.entrySet())
             {
-              oprot.writeBinary(_iter347.getKey());
+              oprot.writeBinary(_iter355.getKey());
               {
-                oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, _iter347.getValue().size()));
-                for (Map.Entry<String, List<Mutation>> _iter348 : _iter347.getValue().entrySet())
+                oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, _iter355.getValue().size()));
+                for (Map.Entry<String, List<Mutation>> _iter356 : _iter355.getValue().entrySet())
                 {
-                  oprot.writeString(_iter348.getKey());
+                  oprot.writeString(_iter356.getKey());
                   {
-                    oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter348.getValue().size()));
-                    for (Mutation _iter349 : _iter348.getValue())
+                    oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter356.getValue().size()));
+                    for (Mutation _iter357 : _iter356.getValue())
                     {
-                      _iter349.write(oprot);
+                      _iter357.write(oprot);
                     }
                     oprot.writeListEnd();
                   }
@@ -28908,19 +29078,19 @@
         TTupleProtocol oprot = (TTupleProtocol) prot;
         {
           oprot.writeI32(struct.mutation_map.size());
-          for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter350 : struct.mutation_map.entrySet())
+          for (Map.Entry<ByteBuffer, Map<String,List<Mutation>>> _iter358 : struct.mutation_map.entrySet())
           {
-            oprot.writeBinary(_iter350.getKey());
+            oprot.writeBinary(_iter358.getKey());
             {
-              oprot.writeI32(_iter350.getValue().size());
-              for (Map.Entry<String, List<Mutation>> _iter351 : _iter350.getValue().entrySet())
+              oprot.writeI32(_iter358.getValue().size());
+              for (Map.Entry<String, List<Mutation>> _iter359 : _iter358.getValue().entrySet())
               {
-                oprot.writeString(_iter351.getKey());
+                oprot.writeString(_iter359.getKey());
                 {
-                  oprot.writeI32(_iter351.getValue().size());
-                  for (Mutation _iter352 : _iter351.getValue())
+                  oprot.writeI32(_iter359.getValue().size());
+                  for (Mutation _iter360 : _iter359.getValue())
                   {
-                    _iter352.write(oprot);
+                    _iter360.write(oprot);
                   }
                 }
               }
@@ -28934,36 +29104,36 @@
       public void read(org.apache.thrift.protocol.TProtocol prot, atomic_batch_mutate_args struct) throws org.apache.thrift.TException {
         TTupleProtocol iprot = (TTupleProtocol) prot;
         {
-          org.apache.thrift.protocol.TMap _map353 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, iprot.readI32());
-          struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map353.size);
-          for (int _i354 = 0; _i354 < _map353.size; ++_i354)
+          org.apache.thrift.protocol.TMap _map361 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.MAP, iprot.readI32());
+          struct.mutation_map = new HashMap<ByteBuffer,Map<String,List<Mutation>>>(2*_map361.size);
+          for (int _i362 = 0; _i362 < _map361.size; ++_i362)
           {
-            ByteBuffer _key355;
-            Map<String,List<Mutation>> _val356;
-            _key355 = iprot.readBinary();
+            ByteBuffer _key363;
+            Map<String,List<Mutation>> _val364;
+            _key363 = iprot.readBinary();
             {
-              org.apache.thrift.protocol.TMap _map357 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
-              _val356 = new HashMap<String,List<Mutation>>(2*_map357.size);
-              for (int _i358 = 0; _i358 < _map357.size; ++_i358)
+              org.apache.thrift.protocol.TMap _map365 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
+              _val364 = new HashMap<String,List<Mutation>>(2*_map365.size);
+              for (int _i366 = 0; _i366 < _map365.size; ++_i366)
               {
-                String _key359;
-                List<Mutation> _val360;
-                _key359 = iprot.readString();
+                String _key367;
+                List<Mutation> _val368;
+                _key367 = iprot.readString();
                 {
-                  org.apache.thrift.protocol.TList _list361 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-                  _val360 = new ArrayList<Mutation>(_list361.size);
-                  for (int _i362 = 0; _i362 < _list361.size; ++_i362)
+                  org.apache.thrift.protocol.TList _list369 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+                  _val368 = new ArrayList<Mutation>(_list369.size);
+                  for (int _i370 = 0; _i370 < _list369.size; ++_i370)
                   {
-                    Mutation _elem363;
-                    _elem363 = new Mutation();
-                    _elem363.read(iprot);
-                    _val360.add(_elem363);
+                    Mutation _elem371;
+                    _elem371 = new Mutation();
+                    _elem371.read(iprot);
+                    _val368.add(_elem371);
                   }
                 }
-                _val356.put(_key359, _val360);
+                _val364.put(_key367, _val368);
               }
             }
-            struct.mutation_map.put(_key355, _val356);
+            struct.mutation_map.put(_key363, _val364);
           }
         }
         struct.setMutation_mapIsSet(true);
@@ -30482,6 +30652,1101 @@
 
   }
 
+  public static class get_multi_slice_args implements org.apache.thrift.TBase<get_multi_slice_args, get_multi_slice_args._Fields>, java.io.Serializable, Cloneable, Comparable<get_multi_slice_args>   {
+    private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("get_multi_slice_args");
+
+    private static final org.apache.thrift.protocol.TField REQUEST_FIELD_DESC = new org.apache.thrift.protocol.TField("request", org.apache.thrift.protocol.TType.STRUCT, (short)1);
+
+    private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+    static {
+      schemes.put(StandardScheme.class, new get_multi_slice_argsStandardSchemeFactory());
+      schemes.put(TupleScheme.class, new get_multi_slice_argsTupleSchemeFactory());
+    }
+
+    public MultiSliceRequest request; // required
+
+    /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+    public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+      REQUEST((short)1, "request");
+
+      private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+      static {
+        for (_Fields field : EnumSet.allOf(_Fields.class)) {
+          byName.put(field.getFieldName(), field);
+        }
+      }
+
+      /**
+       * Find the _Fields constant that matches fieldId, or null if it's not found.
+       */
+      public static _Fields findByThriftId(int fieldId) {
+        switch(fieldId) {
+          case 1: // REQUEST
+            return REQUEST;
+          default:
+            return null;
+        }
+      }
+
+      /**
+       * Find the _Fields constant that matches fieldId, throwing an exception
+       * if it is not found.
+       */
+      public static _Fields findByThriftIdOrThrow(int fieldId) {
+        _Fields fields = findByThriftId(fieldId);
+        if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+        return fields;
+      }
+
+      /**
+       * Find the _Fields constant that matches name, or null if it's not found.
+       */
+      public static _Fields findByName(String name) {
+        return byName.get(name);
+      }
+
+      private final short _thriftId;
+      private final String _fieldName;
+
+      _Fields(short thriftId, String fieldName) {
+        _thriftId = thriftId;
+        _fieldName = fieldName;
+      }
+
+      public short getThriftFieldId() {
+        return _thriftId;
+      }
+
+      public String getFieldName() {
+        return _fieldName;
+      }
+    }
+
+    // isset id assignments
+    public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+    static {
+      Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+      tmpMap.put(_Fields.REQUEST, new org.apache.thrift.meta_data.FieldMetaData("request", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+          new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, MultiSliceRequest.class)));
+      metaDataMap = Collections.unmodifiableMap(tmpMap);
+      org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(get_multi_slice_args.class, metaDataMap);
+    }
+
+    public get_multi_slice_args() {
+    }
+
+    public get_multi_slice_args(
+      MultiSliceRequest request)
+    {
+      this();
+      this.request = request;
+    }
+
+    /**
+     * Performs a deep copy on <i>other</i>.
+     */
+    public get_multi_slice_args(get_multi_slice_args other) {
+      if (other.isSetRequest()) {
+        this.request = new MultiSliceRequest(other.request);
+      }
+    }
+
+    public get_multi_slice_args deepCopy() {
+      return new get_multi_slice_args(this);
+    }
+
+    @Override
+    public void clear() {
+      this.request = null;
+    }
+
+    public MultiSliceRequest getRequest() {
+      return this.request;
+    }
+
+    public get_multi_slice_args setRequest(MultiSliceRequest request) {
+      this.request = request;
+      return this;
+    }
+
+    public void unsetRequest() {
+      this.request = null;
+    }
+
+    /** Returns true if field request is set (has been assigned a value) and false otherwise */
+    public boolean isSetRequest() {
+      return this.request != null;
+    }
+
+    public void setRequestIsSet(boolean value) {
+      if (!value) {
+        this.request = null;
+      }
+    }
+
+    public void setFieldValue(_Fields field, Object value) {
+      switch (field) {
+      case REQUEST:
+        if (value == null) {
+          unsetRequest();
+        } else {
+          setRequest((MultiSliceRequest)value);
+        }
+        break;
+
+      }
+    }
+
+    public Object getFieldValue(_Fields field) {
+      switch (field) {
+      case REQUEST:
+        return getRequest();
+
+      }
+      throw new IllegalStateException();
+    }
+
+    /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+    public boolean isSet(_Fields field) {
+      if (field == null) {
+        throw new IllegalArgumentException();
+      }
+
+      switch (field) {
+      case REQUEST:
+        return isSetRequest();
+      }
+      throw new IllegalStateException();
+    }
+
+    @Override
+    public boolean equals(Object that) {
+      if (that == null)
+        return false;
+      if (that instanceof get_multi_slice_args)
+        return this.equals((get_multi_slice_args)that);
+      return false;
+    }
+
+    public boolean equals(get_multi_slice_args that) {
+      if (that == null)
+        return false;
+
+      boolean this_present_request = true && this.isSetRequest();
+      boolean that_present_request = true && that.isSetRequest();
+      if (this_present_request || that_present_request) {
+        if (!(this_present_request && that_present_request))
+          return false;
+        if (!this.request.equals(that.request))
+          return false;
+      }
+
+      return true;
+    }
+
+    @Override
+    public int hashCode() {
+      HashCodeBuilder builder = new HashCodeBuilder();
+
+      boolean present_request = true && (isSetRequest());
+      builder.append(present_request);
+      if (present_request)
+        builder.append(request);
+
+      return builder.toHashCode();
+    }
+
+    @Override
+    public int compareTo(get_multi_slice_args other) {
+      if (!getClass().equals(other.getClass())) {
+        return getClass().getName().compareTo(other.getClass().getName());
+      }
+
+      int lastComparison = 0;
+
+      lastComparison = Boolean.valueOf(isSetRequest()).compareTo(other.isSetRequest());
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+      if (isSetRequest()) {
+        lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.request, other.request);
+        if (lastComparison != 0) {
+          return lastComparison;
+        }
+      }
+      return 0;
+    }
+
+    public _Fields fieldForId(int fieldId) {
+      return _Fields.findByThriftId(fieldId);
+    }
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+      schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+      schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder sb = new StringBuilder("get_multi_slice_args(");
+      boolean first = true;
+
+      sb.append("request:");
+      if (this.request == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.request);
+      }
+      first = false;
+      sb.append(")");
+      return sb.toString();
+    }
+
+    public void validate() throws org.apache.thrift.TException {
+      // check for required fields
+      if (request == null) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'request' was not present! Struct: " + toString());
+      }
+      // check for sub-struct validity
+      if (request != null) {
+        request.validate();
+      }
+    }
+
+    private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+      try {
+        write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+      } catch (org.apache.thrift.TException te) {
+        throw new java.io.IOException(te);
+      }
+    }
+
+    private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+      try {
+        read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+      } catch (org.apache.thrift.TException te) {
+        throw new java.io.IOException(te);
+      }
+    }
+
+    private static class get_multi_slice_argsStandardSchemeFactory implements SchemeFactory {
+      public get_multi_slice_argsStandardScheme getScheme() {
+        return new get_multi_slice_argsStandardScheme();
+      }
+    }
+
+    private static class get_multi_slice_argsStandardScheme extends StandardScheme<get_multi_slice_args> {
+
+      public void read(org.apache.thrift.protocol.TProtocol iprot, get_multi_slice_args struct) throws org.apache.thrift.TException {
+        org.apache.thrift.protocol.TField schemeField;
+        iprot.readStructBegin();
+        while (true)
+        {
+          schemeField = iprot.readFieldBegin();
+          if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+            break;
+          }
+          switch (schemeField.id) {
+            case 1: // REQUEST
+              if (schemeField.type == org.apache.thrift.protocol.TType.STRUCT) {
+                struct.request = new MultiSliceRequest();
+                struct.request.read(iprot);
+                struct.setRequestIsSet(true);
+              } else { 
+                org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+              }
+              break;
+            default:
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+          }
+          iprot.readFieldEnd();
+        }
+        iprot.readStructEnd();
+
+        // check for required fields of primitive type, which can't be checked in the validate method
+        struct.validate();
+      }
+
+      public void write(org.apache.thrift.protocol.TProtocol oprot, get_multi_slice_args struct) throws org.apache.thrift.TException {
+        struct.validate();
+
+        oprot.writeStructBegin(STRUCT_DESC);
+        if (struct.request != null) {
+          oprot.writeFieldBegin(REQUEST_FIELD_DESC);
+          struct.request.write(oprot);
+          oprot.writeFieldEnd();
+        }
+        oprot.writeFieldStop();
+        oprot.writeStructEnd();
+      }
+
+    }
+
+    private static class get_multi_slice_argsTupleSchemeFactory implements SchemeFactory {
+      public get_multi_slice_argsTupleScheme getScheme() {
+        return new get_multi_slice_argsTupleScheme();
+      }
+    }
+
+    private static class get_multi_slice_argsTupleScheme extends TupleScheme<get_multi_slice_args> {
+
+      @Override
+      public void write(org.apache.thrift.protocol.TProtocol prot, get_multi_slice_args struct) throws org.apache.thrift.TException {
+        TTupleProtocol oprot = (TTupleProtocol) prot;
+        struct.request.write(oprot);
+      }
+
+      @Override
+      public void read(org.apache.thrift.protocol.TProtocol prot, get_multi_slice_args struct) throws org.apache.thrift.TException {
+        TTupleProtocol iprot = (TTupleProtocol) prot;
+        struct.request = new MultiSliceRequest();
+        struct.request.read(iprot);
+        struct.setRequestIsSet(true);
+      }
+    }
+
+  }
+
+  public static class get_multi_slice_result implements org.apache.thrift.TBase<get_multi_slice_result, get_multi_slice_result._Fields>, java.io.Serializable, Cloneable, Comparable<get_multi_slice_result>   {
+    private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("get_multi_slice_result");
+
+    private static final org.apache.thrift.protocol.TField SUCCESS_FIELD_DESC = new org.apache.thrift.protocol.TField("success", org.apache.thrift.protocol.TType.LIST, (short)0);
+    private static final org.apache.thrift.protocol.TField IRE_FIELD_DESC = new org.apache.thrift.protocol.TField("ire", org.apache.thrift.protocol.TType.STRUCT, (short)1);
+    private static final org.apache.thrift.protocol.TField UE_FIELD_DESC = new org.apache.thrift.protocol.TField("ue", org.apache.thrift.protocol.TType.STRUCT, (short)2);
+    private static final org.apache.thrift.protocol.TField TE_FIELD_DESC = new org.apache.thrift.protocol.TField("te", org.apache.thrift.protocol.TType.STRUCT, (short)3);
+
+    private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+    static {
+      schemes.put(StandardScheme.class, new get_multi_slice_resultStandardSchemeFactory());
+      schemes.put(TupleScheme.class, new get_multi_slice_resultTupleSchemeFactory());
+    }
+
+    public List<ColumnOrSuperColumn> success; // required
+    public InvalidRequestException ire; // required
+    public UnavailableException ue; // required
+    public TimedOutException te; // required
+
+    /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+    public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+      SUCCESS((short)0, "success"),
+      IRE((short)1, "ire"),
+      UE((short)2, "ue"),
+      TE((short)3, "te");
+
+      private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+      static {
+        for (_Fields field : EnumSet.allOf(_Fields.class)) {
+          byName.put(field.getFieldName(), field);
+        }
+      }
+
+      /**
+       * Find the _Fields constant that matches fieldId, or null if it's not found.
+       */
+      public static _Fields findByThriftId(int fieldId) {
+        switch(fieldId) {
+          case 0: // SUCCESS
+            return SUCCESS;
+          case 1: // IRE
+            return IRE;
+          case 2: // UE
+            return UE;
+          case 3: // TE
+            return TE;
+          default:
+            return null;
+        }
+      }
+
+      /**
+       * Find the _Fields constant that matches fieldId, throwing an exception
+       * if it is not found.
+       */
+      public static _Fields findByThriftIdOrThrow(int fieldId) {
+        _Fields fields = findByThriftId(fieldId);
+        if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+        return fields;
+      }
+
+      /**
+       * Find the _Fields constant that matches name, or null if it's not found.
+       */
+      public static _Fields findByName(String name) {
+        return byName.get(name);
+      }
+
+      private final short _thriftId;
+      private final String _fieldName;
+
+      _Fields(short thriftId, String fieldName) {
+        _thriftId = thriftId;
+        _fieldName = fieldName;
+      }
+
+      public short getThriftFieldId() {
+        return _thriftId;
+      }
+
+      public String getFieldName() {
+        return _fieldName;
+      }
+    }
+
+    // isset id assignments
+    public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+    static {
+      Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+      tmpMap.put(_Fields.SUCCESS, new org.apache.thrift.meta_data.FieldMetaData("success", org.apache.thrift.TFieldRequirementType.DEFAULT, 
+          new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
+              new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, ColumnOrSuperColumn.class))));
+      tmpMap.put(_Fields.IRE, new org.apache.thrift.meta_data.FieldMetaData("ire", org.apache.thrift.TFieldRequirementType.DEFAULT, 
+          new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRUCT)));
+      tmpMap.put(_Fields.UE, new org.apache.thrift.meta_data.FieldMetaData("ue", org.apache.thrift.TFieldRequirementType.DEFAULT, 
+          new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRUCT)));
+      tmpMap.put(_Fields.TE, new org.apache.thrift.meta_data.FieldMetaData("te", org.apache.thrift.TFieldRequirementType.DEFAULT, 
+          new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRUCT)));
+      metaDataMap = Collections.unmodifiableMap(tmpMap);
+      org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(get_multi_slice_result.class, metaDataMap);
+    }
+
+    public get_multi_slice_result() {
+    }
+
+    public get_multi_slice_result(
+      List<ColumnOrSuperColumn> success,
+      InvalidRequestException ire,
+      UnavailableException ue,
+      TimedOutException te)
+    {
+      this();
+      this.success = success;
+      this.ire = ire;
+      this.ue = ue;
+      this.te = te;
+    }
+
+    /**
+     * Performs a deep copy on <i>other</i>.
+     */
+    public get_multi_slice_result(get_multi_slice_result other) {
+      if (other.isSetSuccess()) {
+        List<ColumnOrSuperColumn> __this__success = new ArrayList<ColumnOrSuperColumn>(other.success.size());
+        for (ColumnOrSuperColumn other_element : other.success) {
+          __this__success.add(new ColumnOrSuperColumn(other_element));
+        }
+        this.success = __this__success;
+      }
+      if (other.isSetIre()) {
+        this.ire = new InvalidRequestException(other.ire);
+      }
+      if (other.isSetUe()) {
+        this.ue = new UnavailableException(other.ue);
+      }
+      if (other.isSetTe()) {
+        this.te = new TimedOutException(other.te);
+      }
+    }
+
+    public get_multi_slice_result deepCopy() {
+      return new get_multi_slice_result(this);
+    }
+
+    @Override
+    public void clear() {
+      this.success = null;
+      this.ire = null;
+      this.ue = null;
+      this.te = null;
+    }
+
+    public int getSuccessSize() {
+      return (this.success == null) ? 0 : this.success.size();
+    }
+
+    public java.util.Iterator<ColumnOrSuperColumn> getSuccessIterator() {
+      return (this.success == null) ? null : this.success.iterator();
+    }
+
+    public void addToSuccess(ColumnOrSuperColumn elem) {
+      if (this.success == null) {
+        this.success = new ArrayList<ColumnOrSuperColumn>();
+      }
+      this.success.add(elem);
+    }
+
+    public List<ColumnOrSuperColumn> getSuccess() {
+      return this.success;
+    }
+
+    public get_multi_slice_result setSuccess(List<ColumnOrSuperColumn> success) {
+      this.success = success;
+      return this;
+    }
+
+    public void unsetSuccess() {
+      this.success = null;
+    }
+
+    /** Returns true if field success is set (has been assigned a value) and false otherwise */
+    public boolean isSetSuccess() {
+      return this.success != null;
+    }
+
+    public void setSuccessIsSet(boolean value) {
+      if (!value) {
+        this.success = null;
+      }
+    }
+
+    public InvalidRequestException getIre() {
+      return this.ire;
+    }
+
+    public get_multi_slice_result setIre(InvalidRequestException ire) {
+      this.ire = ire;
+      return this;
+    }
+
+    public void unsetIre() {
+      this.ire = null;
+    }
+
+    /** Returns true if field ire is set (has been assigned a value) and false otherwise */
+    public boolean isSetIre() {
+      return this.ire != null;
+    }
+
+    public void setIreIsSet(boolean value) {
+      if (!value) {
+        this.ire = null;
+      }
+    }
+
+    public UnavailableException getUe() {
+      return this.ue;
+    }
+
+    public get_multi_slice_result setUe(UnavailableException ue) {
+      this.ue = ue;
+      return this;
+    }
+
+    public void unsetUe() {
+      this.ue = null;
+    }
+
+    /** Returns true if field ue is set (has been assigned a value) and false otherwise */
+    public boolean isSetUe() {
+      return this.ue != null;
+    }
+
+    public void setUeIsSet(boolean value) {
+      if (!value) {
+        this.ue = null;
+      }
+    }
+
+    public TimedOutException getTe() {
+      return this.te;
+    }
+
+    public get_multi_slice_result setTe(TimedOutException te) {
+      this.te = te;
+      return this;
+    }
+
+    public void unsetTe() {
+      this.te = null;
+    }
+
+    /** Returns true if field te is set (has been assigned a value) and false otherwise */
+    public boolean isSetTe() {
+      return this.te != null;
+    }
+
+    public void setTeIsSet(boolean value) {
+      if (!value) {
+        this.te = null;
+      }
+    }
+
+    public void setFieldValue(_Fields field, Object value) {
+      switch (field) {
+      case SUCCESS:
+        if (value == null) {
+          unsetSuccess();
+        } else {
+          setSuccess((List<ColumnOrSuperColumn>)value);
+        }
+        break;
+
+      case IRE:
+        if (value == null) {
+          unsetIre();
+        } else {
+          setIre((InvalidRequestException)value);
+        }
+        break;
+
+      case UE:
+        if (value == null) {
+          unsetUe();
+        } else {
+          setUe((UnavailableException)value);
+        }
+        break;
+
+      case TE:
+        if (value == null) {
+          unsetTe();
+        } else {
+          setTe((TimedOutException)value);
+        }
+        break;
+
+      }
+    }
+
+    public Object getFieldValue(_Fields field) {
+      switch (field) {
+      case SUCCESS:
+        return getSuccess();
+
+      case IRE:
+        return getIre();
+
+      case UE:
+        return getUe();
+
+      case TE:
+        return getTe();
+
+      }
+      throw new IllegalStateException();
+    }
+
+    /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+    public boolean isSet(_Fields field) {
+      if (field == null) {
+        throw new IllegalArgumentException();
+      }
+
+      switch (field) {
+      case SUCCESS:
+        return isSetSuccess();
+      case IRE:
+        return isSetIre();
+      case UE:
+        return isSetUe();
+      case TE:
+        return isSetTe();
+      }
+      throw new IllegalStateException();
+    }
+
+    @Override
+    public boolean equals(Object that) {
+      if (that == null)
+        return false;
+      if (that instanceof get_multi_slice_result)
+        return this.equals((get_multi_slice_result)that);
+      return false;
+    }
+
+    public boolean equals(get_multi_slice_result that) {
+      if (that == null)
+        return false;
+
+      boolean this_present_success = true && this.isSetSuccess();
+      boolean that_present_success = true && that.isSetSuccess();
+      if (this_present_success || that_present_success) {
+        if (!(this_present_success && that_present_success))
+          return false;
+        if (!this.success.equals(that.success))
+          return false;
+      }
+
+      boolean this_present_ire = true && this.isSetIre();
+      boolean that_present_ire = true && that.isSetIre();
+      if (this_present_ire || that_present_ire) {
+        if (!(this_present_ire && that_present_ire))
+          return false;
+        if (!this.ire.equals(that.ire))
+          return false;
+      }
+
+      boolean this_present_ue = true && this.isSetUe();
+      boolean that_present_ue = true && that.isSetUe();
+      if (this_present_ue || that_present_ue) {
+        if (!(this_present_ue && that_present_ue))
+          return false;
+        if (!this.ue.equals(that.ue))
+          return false;
+      }
+
+      boolean this_present_te = true && this.isSetTe();
+      boolean that_present_te = true && that.isSetTe();
+      if (this_present_te || that_present_te) {
+        if (!(this_present_te && that_present_te))
+          return false;
+        if (!this.te.equals(that.te))
+          return false;
+      }
+
+      return true;
+    }
+
+    @Override
+    public int hashCode() {
+      HashCodeBuilder builder = new HashCodeBuilder();
+
+      boolean present_success = true && (isSetSuccess());
+      builder.append(present_success);
+      if (present_success)
+        builder.append(success);
+
+      boolean present_ire = true && (isSetIre());
+      builder.append(present_ire);
+      if (present_ire)
+        builder.append(ire);
+
+      boolean present_ue = true && (isSetUe());
+      builder.append(present_ue);
+      if (present_ue)
+        builder.append(ue);
+
+      boolean present_te = true && (isSetTe());
+      builder.append(present_te);
+      if (present_te)
+        builder.append(te);
+
+      return builder.toHashCode();
+    }
+
+    @Override
+    public int compareTo(get_multi_slice_result other) {
+      if (!getClass().equals(other.getClass())) {
+        return getClass().getName().compareTo(other.getClass().getName());
+      }
+
+      int lastComparison = 0;
+
+      lastComparison = Boolean.valueOf(isSetSuccess()).compareTo(other.isSetSuccess());
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+      if (isSetSuccess()) {
+        lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.success, other.success);
+        if (lastComparison != 0) {
+          return lastComparison;
+        }
+      }
+      lastComparison = Boolean.valueOf(isSetIre()).compareTo(other.isSetIre());
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+      if (isSetIre()) {
+        lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.ire, other.ire);
+        if (lastComparison != 0) {
+          return lastComparison;
+        }
+      }
+      lastComparison = Boolean.valueOf(isSetUe()).compareTo(other.isSetUe());
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+      if (isSetUe()) {
+        lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.ue, other.ue);
+        if (lastComparison != 0) {
+          return lastComparison;
+        }
+      }
+      lastComparison = Boolean.valueOf(isSetTe()).compareTo(other.isSetTe());
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+      if (isSetTe()) {
+        lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.te, other.te);
+        if (lastComparison != 0) {
+          return lastComparison;
+        }
+      }
+      return 0;
+    }
+
+    public _Fields fieldForId(int fieldId) {
+      return _Fields.findByThriftId(fieldId);
+    }
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+      schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+      schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder sb = new StringBuilder("get_multi_slice_result(");
+      boolean first = true;
+
+      sb.append("success:");
+      if (this.success == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.success);
+      }
+      first = false;
+      if (!first) sb.append(", ");
+      sb.append("ire:");
+      if (this.ire == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.ire);
+      }
+      first = false;
+      if (!first) sb.append(", ");
+      sb.append("ue:");
+      if (this.ue == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.ue);
+      }
+      first = false;
+      if (!first) sb.append(", ");
+      sb.append("te:");
+      if (this.te == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.te);
+      }
+      first = false;
+      sb.append(")");
+      return sb.toString();
+    }
+
+    public void validate() throws org.apache.thrift.TException {
+      // check for required fields
+      // check for sub-struct validity
+    }
+
+    private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+      try {
+        write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+      } catch (org.apache.thrift.TException te) {
+        throw new java.io.IOException(te);
+      }
+    }
+
+    private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+      try {
+        read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+      } catch (org.apache.thrift.TException te) {
+        throw new java.io.IOException(te);
+      }
+    }
+
+    private static class get_multi_slice_resultStandardSchemeFactory implements SchemeFactory {
+      public get_multi_slice_resultStandardScheme getScheme() {
+        return new get_multi_slice_resultStandardScheme();
+      }
+    }
+
+    private static class get_multi_slice_resultStandardScheme extends StandardScheme<get_multi_slice_result> {
+
+      public void read(org.apache.thrift.protocol.TProtocol iprot, get_multi_slice_result struct) throws org.apache.thrift.TException {
+        org.apache.thrift.protocol.TField schemeField;
+        iprot.readStructBegin();
+        while (true)
+        {
+          schemeField = iprot.readFieldBegin();
+          if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+            break;
+          }
+          switch (schemeField.id) {
+            case 0: // SUCCESS
+              if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
+                {
+                  org.apache.thrift.protocol.TList _list372 = iprot.readListBegin();
+                  struct.success = new ArrayList<ColumnOrSuperColumn>(_list372.size);
+                  for (int _i373 = 0; _i373 < _list372.size; ++_i373)
+                  {
+                    ColumnOrSuperColumn _elem374;
+                    _elem374 = new ColumnOrSuperColumn();
+                    _elem374.read(iprot);
+                    struct.success.add(_elem374);
+                  }
+                  iprot.readListEnd();
+                }
+                struct.setSuccessIsSet(true);
+              } else { 
+                org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+              }
+              break;
+            case 1: // IRE
+              if (schemeField.type == org.apache.thrift.protocol.TType.STRUCT) {
+                struct.ire = new InvalidRequestException();
+                struct.ire.read(iprot);
+                struct.setIreIsSet(true);
+              } else { 
+                org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+              }
+              break;
+            case 2: // UE
+              if (schemeField.type == org.apache.thrift.protocol.TType.STRUCT) {
+                struct.ue = new UnavailableException();
+                struct.ue.read(iprot);
+                struct.setUeIsSet(true);
+              } else { 
+                org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+              }
+              break;
+            case 3: // TE
+              if (schemeField.type == org.apache.thrift.protocol.TType.STRUCT) {
+                struct.te = new TimedOutException();
+                struct.te.read(iprot);
+                struct.setTeIsSet(true);
+              } else { 
+                org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+              }
+              break;
+            default:
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+          }
+          iprot.readFieldEnd();
+        }
+        iprot.readStructEnd();
+
+        // check for required fields of primitive type, which can't be checked in the validate method
+        struct.validate();
+      }
+
+      public void write(org.apache.thrift.protocol.TProtocol oprot, get_multi_slice_result struct) throws org.apache.thrift.TException {
+        struct.validate();
+
+        oprot.writeStructBegin(STRUCT_DESC);
+        if (struct.success != null) {
+          oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
+          {
+            oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
+            for (ColumnOrSuperColumn _iter375 : struct.success)
+            {
+              _iter375.write(oprot);
+            }
+            oprot.writeListEnd();
+          }
+          oprot.writeFieldEnd();
+        }
+        if (struct.ire != null) {
+          oprot.writeFieldBegin(IRE_FIELD_DESC);
+          struct.ire.write(oprot);
+          oprot.writeFieldEnd();
+        }
+        if (struct.ue != null) {
+          oprot.writeFieldBegin(UE_FIELD_DESC);
+          struct.ue.write(oprot);
+          oprot.writeFieldEnd();
+        }
+        if (struct.te != null) {
+          oprot.writeFieldBegin(TE_FIELD_DESC);
+          struct.te.write(oprot);
+          oprot.writeFieldEnd();
+        }
+        oprot.writeFieldStop();
+        oprot.writeStructEnd();
+      }
+
+    }
+
+    private static class get_multi_slice_resultTupleSchemeFactory implements SchemeFactory {
+      public get_multi_slice_resultTupleScheme getScheme() {
+        return new get_multi_slice_resultTupleScheme();
+      }
+    }
+
+    private static class get_multi_slice_resultTupleScheme extends TupleScheme<get_multi_slice_result> {
+
+      @Override
+      public void write(org.apache.thrift.protocol.TProtocol prot, get_multi_slice_result struct) throws org.apache.thrift.TException {
+        TTupleProtocol oprot = (TTupleProtocol) prot;
+        BitSet optionals = new BitSet();
+        if (struct.isSetSuccess()) {
+          optionals.set(0);
+        }
+        if (struct.isSetIre()) {
+          optionals.set(1);
+        }
+        if (struct.isSetUe()) {
+          optionals.set(2);
+        }
+        if (struct.isSetTe()) {
+          optionals.set(3);
+        }
+        oprot.writeBitSet(optionals, 4);
+        if (struct.isSetSuccess()) {
+          {
+            oprot.writeI32(struct.success.size());
+            for (ColumnOrSuperColumn _iter376 : struct.success)
+            {
+              _iter376.write(oprot);
+            }
+          }
+        }
+        if (struct.isSetIre()) {
+          struct.ire.write(oprot);
+        }
+        if (struct.isSetUe()) {
+          struct.ue.write(oprot);
+        }
+        if (struct.isSetTe()) {
+          struct.te.write(oprot);
+        }
+      }
+
+      @Override
+      public void read(org.apache.thrift.protocol.TProtocol prot, get_multi_slice_result struct) throws org.apache.thrift.TException {
+        TTupleProtocol iprot = (TTupleProtocol) prot;
+        BitSet incoming = iprot.readBitSet(4);
+        if (incoming.get(0)) {
+          {
+            org.apache.thrift.protocol.TList _list377 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<ColumnOrSuperColumn>(_list377.size);
+            for (int _i378 = 0; _i378 < _list377.size; ++_i378)
+            {
+              ColumnOrSuperColumn _elem379;
+              _elem379 = new ColumnOrSuperColumn();
+              _elem379.read(iprot);
+              struct.success.add(_elem379);
+            }
+          }
+          struct.setSuccessIsSet(true);
+        }
+        if (incoming.get(1)) {
+          struct.ire = new InvalidRequestException();
+          struct.ire.read(iprot);
+          struct.setIreIsSet(true);
+        }
+        if (incoming.get(2)) {
+          struct.ue = new UnavailableException();
+          struct.ue.read(iprot);
+          struct.setUeIsSet(true);
+        }
+        if (incoming.get(3)) {
+          struct.te = new TimedOutException();
+          struct.te.read(iprot);
+          struct.setTeIsSet(true);
+        }
+      }
+    }
+
+  }
+
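The two structs above wire up a new get_multi_slice service method: the request is a single MultiSliceRequest, and the result is either a List<ColumnOrSuperColumn> or one of the ire/ue/te exceptions. A minimal client-side sketch of calling it follows; the MultiSliceRequest and ColumnSlice field names, the keyspace and column family names, and the transport setup are assumptions for illustration and are not taken from this diff.

import java.nio.ByteBuffer;
import java.util.Arrays;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ColumnSlice;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.MultiSliceRequest;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TSocket;

public class MultiSliceExample
{
    public static void main(String[] args) throws Exception
    {
        // Assumed transport/protocol setup for a node listening on the default Thrift port.
        TFramedTransport transport = new TFramedTransport(new TSocket("127.0.0.1", 9160));
        transport.open();
        Cassandra.Client client = new Cassandra.Client(new TBinaryProtocol(transport));
        client.set_keyspace("ks1");                          // assumed keyspace name

        // Two disjoint column ranges read from the same row in one round trip.
        ColumnSlice sliceA = new ColumnSlice();
        sliceA.setStart(ByteBuffer.wrap("a".getBytes("UTF-8")));
        sliceA.setFinish(ByteBuffer.wrap("c".getBytes("UTF-8")));
        ColumnSlice sliceB = new ColumnSlice();
        sliceB.setStart(ByteBuffer.wrap("e".getBytes("UTF-8")));
        sliceB.setFinish(ByteBuffer.wrap("g".getBytes("UTF-8")));

        // Assumed MultiSliceRequest fields: key, column_parent, column_slices,
        // count, reversed, consistency_level.
        MultiSliceRequest request = new MultiSliceRequest();
        request.setKey(ByteBuffer.wrap("row1".getBytes("UTF-8")));
        request.setColumn_parent(new ColumnParent("cf1"));   // assumed column family name
        request.setColumn_slices(Arrays.asList(sliceA, sliceB));
        request.setCount(100);
        request.setReversed(false);
        request.setConsistency_level(ConsistencyLevel.ONE);

        for (ColumnOrSuperColumn cosc : client.get_multi_slice(request))
            System.out.println(cosc);

        transport.close();
    }
}

Compared with plain get_slice, the point of this call is that several non-contiguous ranges of one row come back in a single request instead of one round trip per slice.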
   public static class describe_schema_versions_args implements org.apache.thrift.TBase<describe_schema_versions_args, describe_schema_versions_args._Fields>, java.io.Serializable, Cloneable, Comparable<describe_schema_versions_args>   {
     private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("describe_schema_versions_args");
 
@@ -31134,25 +32399,25 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map364 = iprot.readMapBegin();
-                  struct.success = new HashMap<String,List<String>>(2*_map364.size);
-                  for (int _i365 = 0; _i365 < _map364.size; ++_i365)
+                  org.apache.thrift.protocol.TMap _map380 = iprot.readMapBegin();
+                  struct.success = new HashMap<String,List<String>>(2*_map380.size);
+                  for (int _i381 = 0; _i381 < _map380.size; ++_i381)
                   {
-                    String _key366;
-                    List<String> _val367;
-                    _key366 = iprot.readString();
+                    String _key382;
+                    List<String> _val383;
+                    _key382 = iprot.readString();
                     {
-                      org.apache.thrift.protocol.TList _list368 = iprot.readListBegin();
-                      _val367 = new ArrayList<String>(_list368.size);
-                      for (int _i369 = 0; _i369 < _list368.size; ++_i369)
+                      org.apache.thrift.protocol.TList _list384 = iprot.readListBegin();
+                      _val383 = new ArrayList<String>(_list384.size);
+                      for (int _i385 = 0; _i385 < _list384.size; ++_i385)
                       {
-                        String _elem370;
-                        _elem370 = iprot.readString();
-                        _val367.add(_elem370);
+                        String _elem386;
+                        _elem386 = iprot.readString();
+                        _val383.add(_elem386);
                       }
                       iprot.readListEnd();
                     }
-                    struct.success.put(_key366, _val367);
+                    struct.success.put(_key382, _val383);
                   }
                   iprot.readMapEnd();
                 }
@@ -31189,14 +32454,14 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, struct.success.size()));
-            for (Map.Entry<String, List<String>> _iter371 : struct.success.entrySet())
+            for (Map.Entry<String, List<String>> _iter387 : struct.success.entrySet())
             {
-              oprot.writeString(_iter371.getKey());
+              oprot.writeString(_iter387.getKey());
               {
-                oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, _iter371.getValue().size()));
-                for (String _iter372 : _iter371.getValue())
+                oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, _iter387.getValue().size()));
+                for (String _iter388 : _iter387.getValue())
                 {
-                  oprot.writeString(_iter372);
+                  oprot.writeString(_iter388);
                 }
                 oprot.writeListEnd();
               }
@@ -31238,14 +32503,14 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (Map.Entry<String, List<String>> _iter373 : struct.success.entrySet())
+            for (Map.Entry<String, List<String>> _iter389 : struct.success.entrySet())
             {
-              oprot.writeString(_iter373.getKey());
+              oprot.writeString(_iter389.getKey());
               {
-                oprot.writeI32(_iter373.getValue().size());
-                for (String _iter374 : _iter373.getValue())
+                oprot.writeI32(_iter389.getValue().size());
+                for (String _iter390 : _iter389.getValue())
                 {
-                  oprot.writeString(_iter374);
+                  oprot.writeString(_iter390);
                 }
               }
             }
@@ -31262,24 +32527,24 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TMap _map375 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
-            struct.success = new HashMap<String,List<String>>(2*_map375.size);
-            for (int _i376 = 0; _i376 < _map375.size; ++_i376)
+            org.apache.thrift.protocol.TMap _map391 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
+            struct.success = new HashMap<String,List<String>>(2*_map391.size);
+            for (int _i392 = 0; _i392 < _map391.size; ++_i392)
             {
-              String _key377;
-              List<String> _val378;
-              _key377 = iprot.readString();
+              String _key393;
+              List<String> _val394;
+              _key393 = iprot.readString();
               {
-                org.apache.thrift.protocol.TList _list379 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-                _val378 = new ArrayList<String>(_list379.size);
-                for (int _i380 = 0; _i380 < _list379.size; ++_i380)
+                org.apache.thrift.protocol.TList _list395 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+                _val394 = new ArrayList<String>(_list395.size);
+                for (int _i396 = 0; _i396 < _list395.size; ++_i396)
                 {
-                  String _elem381;
-                  _elem381 = iprot.readString();
-                  _val378.add(_elem381);
+                  String _elem397;
+                  _elem397 = iprot.readString();
+                  _val394.add(_elem397);
                 }
               }
-              struct.success.put(_key377, _val378);
+              struct.success.put(_key393, _val394);
             }
           }
           struct.setSuccessIsSet(true);
@@ -31940,14 +33205,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list382 = iprot.readListBegin();
-                  struct.success = new ArrayList<KsDef>(_list382.size);
-                  for (int _i383 = 0; _i383 < _list382.size; ++_i383)
+                  org.apache.thrift.protocol.TList _list398 = iprot.readListBegin();
+                  struct.success = new ArrayList<KsDef>(_list398.size);
+                  for (int _i399 = 0; _i399 < _list398.size; ++_i399)
                   {
-                    KsDef _elem384;
-                    _elem384 = new KsDef();
-                    _elem384.read(iprot);
-                    struct.success.add(_elem384);
+                    KsDef _elem400;
+                    _elem400 = new KsDef();
+                    _elem400.read(iprot);
+                    struct.success.add(_elem400);
                   }
                   iprot.readListEnd();
                 }
@@ -31984,9 +33249,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (KsDef _iter385 : struct.success)
+            for (KsDef _iter401 : struct.success)
             {
-              _iter385.write(oprot);
+              _iter401.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -32025,9 +33290,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (KsDef _iter386 : struct.success)
+            for (KsDef _iter402 : struct.success)
             {
-              _iter386.write(oprot);
+              _iter402.write(oprot);
             }
           }
         }
@@ -32042,14 +33307,14 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list387 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<KsDef>(_list387.size);
-            for (int _i388 = 0; _i388 < _list387.size; ++_i388)
+            org.apache.thrift.protocol.TList _list403 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<KsDef>(_list403.size);
+            for (int _i404 = 0; _i404 < _list403.size; ++_i404)
             {
-              KsDef _elem389;
-              _elem389 = new KsDef();
-              _elem389.read(iprot);
-              struct.success.add(_elem389);
+              KsDef _elem405;
+              _elem405 = new KsDef();
+              _elem405.read(iprot);
+              struct.success.add(_elem405);
             }
           }
           struct.setSuccessIsSet(true);
@@ -34034,14 +35299,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list390 = iprot.readListBegin();
-                  struct.success = new ArrayList<TokenRange>(_list390.size);
-                  for (int _i391 = 0; _i391 < _list390.size; ++_i391)
+                  org.apache.thrift.protocol.TList _list406 = iprot.readListBegin();
+                  struct.success = new ArrayList<TokenRange>(_list406.size);
+                  for (int _i407 = 0; _i407 < _list406.size; ++_i407)
                   {
-                    TokenRange _elem392;
-                    _elem392 = new TokenRange();
-                    _elem392.read(iprot);
-                    struct.success.add(_elem392);
+                    TokenRange _elem408;
+                    _elem408 = new TokenRange();
+                    _elem408.read(iprot);
+                    struct.success.add(_elem408);
                   }
                   iprot.readListEnd();
                 }
@@ -34078,9 +35343,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (TokenRange _iter393 : struct.success)
+            for (TokenRange _iter409 : struct.success)
             {
-              _iter393.write(oprot);
+              _iter409.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -34119,9 +35384,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (TokenRange _iter394 : struct.success)
+            for (TokenRange _iter410 : struct.success)
             {
-              _iter394.write(oprot);
+              _iter410.write(oprot);
             }
           }
         }
@@ -34136,14 +35401,14 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list395 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<TokenRange>(_list395.size);
-            for (int _i396 = 0; _i396 < _list395.size; ++_i396)
+            org.apache.thrift.protocol.TList _list411 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<TokenRange>(_list411.size);
+            for (int _i412 = 0; _i412 < _list411.size; ++_i412)
             {
-              TokenRange _elem397;
-              _elem397 = new TokenRange();
-              _elem397.read(iprot);
-              struct.success.add(_elem397);
+              TokenRange _elem413;
+              _elem413 = new TokenRange();
+              _elem413.read(iprot);
+              struct.success.add(_elem413);
             }
           }
           struct.setSuccessIsSet(true);
@@ -34910,14 +36175,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list398 = iprot.readListBegin();
-                  struct.success = new ArrayList<TokenRange>(_list398.size);
-                  for (int _i399 = 0; _i399 < _list398.size; ++_i399)
+                  org.apache.thrift.protocol.TList _list414 = iprot.readListBegin();
+                  struct.success = new ArrayList<TokenRange>(_list414.size);
+                  for (int _i415 = 0; _i415 < _list414.size; ++_i415)
                   {
-                    TokenRange _elem400;
-                    _elem400 = new TokenRange();
-                    _elem400.read(iprot);
-                    struct.success.add(_elem400);
+                    TokenRange _elem416;
+                    _elem416 = new TokenRange();
+                    _elem416.read(iprot);
+                    struct.success.add(_elem416);
                   }
                   iprot.readListEnd();
                 }
@@ -34954,9 +36219,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (TokenRange _iter401 : struct.success)
+            for (TokenRange _iter417 : struct.success)
             {
-              _iter401.write(oprot);
+              _iter417.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -34995,9 +36260,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (TokenRange _iter402 : struct.success)
+            for (TokenRange _iter418 : struct.success)
             {
-              _iter402.write(oprot);
+              _iter418.write(oprot);
             }
           }
         }
@@ -35012,14 +36277,14 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list403 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<TokenRange>(_list403.size);
-            for (int _i404 = 0; _i404 < _list403.size; ++_i404)
+            org.apache.thrift.protocol.TList _list419 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<TokenRange>(_list419.size);
+            for (int _i420 = 0; _i420 < _list419.size; ++_i420)
             {
-              TokenRange _elem405;
-              _elem405 = new TokenRange();
-              _elem405.read(iprot);
-              struct.success.add(_elem405);
+              TokenRange _elem421;
+              _elem421 = new TokenRange();
+              _elem421.read(iprot);
+              struct.success.add(_elem421);
             }
           }
           struct.setSuccessIsSet(true);
@@ -35674,15 +36939,15 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
                 {
-                  org.apache.thrift.protocol.TMap _map406 = iprot.readMapBegin();
-                  struct.success = new HashMap<String,String>(2*_map406.size);
-                  for (int _i407 = 0; _i407 < _map406.size; ++_i407)
+                  org.apache.thrift.protocol.TMap _map422 = iprot.readMapBegin();
+                  struct.success = new HashMap<String,String>(2*_map422.size);
+                  for (int _i423 = 0; _i423 < _map422.size; ++_i423)
                   {
-                    String _key408;
-                    String _val409;
-                    _key408 = iprot.readString();
-                    _val409 = iprot.readString();
-                    struct.success.put(_key408, _val409);
+                    String _key424;
+                    String _val425;
+                    _key424 = iprot.readString();
+                    _val425 = iprot.readString();
+                    struct.success.put(_key424, _val425);
                   }
                   iprot.readMapEnd();
                 }
@@ -35719,10 +36984,10 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, struct.success.size()));
-            for (Map.Entry<String, String> _iter410 : struct.success.entrySet())
+            for (Map.Entry<String, String> _iter426 : struct.success.entrySet())
             {
-              oprot.writeString(_iter410.getKey());
-              oprot.writeString(_iter410.getValue());
+              oprot.writeString(_iter426.getKey());
+              oprot.writeString(_iter426.getValue());
             }
             oprot.writeMapEnd();
           }
@@ -35761,10 +37026,10 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (Map.Entry<String, String> _iter411 : struct.success.entrySet())
+            for (Map.Entry<String, String> _iter427 : struct.success.entrySet())
             {
-              oprot.writeString(_iter411.getKey());
-              oprot.writeString(_iter411.getValue());
+              oprot.writeString(_iter427.getKey());
+              oprot.writeString(_iter427.getValue());
             }
           }
         }
@@ -35779,15 +37044,15 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TMap _map412 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-            struct.success = new HashMap<String,String>(2*_map412.size);
-            for (int _i413 = 0; _i413 < _map412.size; ++_i413)
+            org.apache.thrift.protocol.TMap _map428 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+            struct.success = new HashMap<String,String>(2*_map428.size);
+            for (int _i429 = 0; _i429 < _map428.size; ++_i429)
             {
-              String _key414;
-              String _val415;
-              _key414 = iprot.readString();
-              _val415 = iprot.readString();
-              struct.success.put(_key414, _val415);
+              String _key430;
+              String _val431;
+              _key430 = iprot.readString();
+              _val431 = iprot.readString();
+              struct.success.put(_key430, _val431);
             }
           }
           struct.setSuccessIsSet(true);
@@ -39005,13 +40270,13 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list416 = iprot.readListBegin();
-                  struct.success = new ArrayList<String>(_list416.size);
-                  for (int _i417 = 0; _i417 < _list416.size; ++_i417)
+                  org.apache.thrift.protocol.TList _list432 = iprot.readListBegin();
+                  struct.success = new ArrayList<String>(_list432.size);
+                  for (int _i433 = 0; _i433 < _list432.size; ++_i433)
                   {
-                    String _elem418;
-                    _elem418 = iprot.readString();
-                    struct.success.add(_elem418);
+                    String _elem434;
+                    _elem434 = iprot.readString();
+                    struct.success.add(_elem434);
                   }
                   iprot.readListEnd();
                 }
@@ -39048,9 +40313,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.success.size()));
-            for (String _iter419 : struct.success)
+            for (String _iter435 : struct.success)
             {
-              oprot.writeString(_iter419);
+              oprot.writeString(_iter435);
             }
             oprot.writeListEnd();
           }
@@ -39089,9 +40354,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (String _iter420 : struct.success)
+            for (String _iter436 : struct.success)
             {
-              oprot.writeString(_iter420);
+              oprot.writeString(_iter436);
             }
           }
         }
@@ -39106,13 +40371,13 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list421 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-            struct.success = new ArrayList<String>(_list421.size);
-            for (int _i422 = 0; _i422 < _list421.size; ++_i422)
+            org.apache.thrift.protocol.TList _list437 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+            struct.success = new ArrayList<String>(_list437.size);
+            for (int _i438 = 0; _i438 < _list437.size; ++_i438)
             {
-              String _elem423;
-              _elem423 = iprot.readString();
-              struct.success.add(_elem423);
+              String _elem439;
+              _elem439 = iprot.readString();
+              struct.success.add(_elem439);
             }
           }
           struct.setSuccessIsSet(true);
@@ -40801,14 +42066,14 @@
             case 0: // SUCCESS
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list424 = iprot.readListBegin();
-                  struct.success = new ArrayList<CfSplit>(_list424.size);
-                  for (int _i425 = 0; _i425 < _list424.size; ++_i425)
+                  org.apache.thrift.protocol.TList _list440 = iprot.readListBegin();
+                  struct.success = new ArrayList<CfSplit>(_list440.size);
+                  for (int _i441 = 0; _i441 < _list440.size; ++_i441)
                   {
-                    CfSplit _elem426;
-                    _elem426 = new CfSplit();
-                    _elem426.read(iprot);
-                    struct.success.add(_elem426);
+                    CfSplit _elem442;
+                    _elem442 = new CfSplit();
+                    _elem442.read(iprot);
+                    struct.success.add(_elem442);
                   }
                   iprot.readListEnd();
                 }
@@ -40845,9 +42110,9 @@
           oprot.writeFieldBegin(SUCCESS_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.success.size()));
-            for (CfSplit _iter427 : struct.success)
+            for (CfSplit _iter443 : struct.success)
             {
-              _iter427.write(oprot);
+              _iter443.write(oprot);
             }
             oprot.writeListEnd();
           }
@@ -40886,9 +42151,9 @@
         if (struct.isSetSuccess()) {
           {
             oprot.writeI32(struct.success.size());
-            for (CfSplit _iter428 : struct.success)
+            for (CfSplit _iter444 : struct.success)
             {
-              _iter428.write(oprot);
+              _iter444.write(oprot);
             }
           }
         }
@@ -40903,14 +42168,14 @@
         BitSet incoming = iprot.readBitSet(2);
         if (incoming.get(0)) {
           {
-            org.apache.thrift.protocol.TList _list429 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            struct.success = new ArrayList<CfSplit>(_list429.size);
-            for (int _i430 = 0; _i430 < _list429.size; ++_i430)
+            org.apache.thrift.protocol.TList _list445 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            struct.success = new ArrayList<CfSplit>(_list445.size);
+            for (int _i446 = 0; _i446 < _list445.size; ++_i446)
             {
-              CfSplit _elem431;
-              _elem431 = new CfSplit();
-              _elem431.read(iprot);
-              struct.success.add(_elem431);
+              CfSplit _elem447;
+              _elem447 = new CfSplit();
+              _elem447.read(iprot);
+              struct.success.add(_elem447);
             }
           }
           struct.setSuccessIsSet(true);
@@ -51505,13 +52770,13 @@
             case 2: // VALUES
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list432 = iprot.readListBegin();
-                  struct.values = new ArrayList<ByteBuffer>(_list432.size);
-                  for (int _i433 = 0; _i433 < _list432.size; ++_i433)
+                  org.apache.thrift.protocol.TList _list448 = iprot.readListBegin();
+                  struct.values = new ArrayList<ByteBuffer>(_list448.size);
+                  for (int _i449 = 0; _i449 < _list448.size; ++_i449)
                   {
-                    ByteBuffer _elem434;
-                    _elem434 = iprot.readBinary();
-                    struct.values.add(_elem434);
+                    ByteBuffer _elem450;
+                    _elem450 = iprot.readBinary();
+                    struct.values.add(_elem450);
                   }
                   iprot.readListEnd();
                 }
@@ -51545,9 +52810,9 @@
           oprot.writeFieldBegin(VALUES_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.values.size()));
-            for (ByteBuffer _iter435 : struct.values)
+            for (ByteBuffer _iter451 : struct.values)
             {
-              oprot.writeBinary(_iter435);
+              oprot.writeBinary(_iter451);
             }
             oprot.writeListEnd();
           }
@@ -51573,9 +52838,9 @@
         oprot.writeI32(struct.itemId);
         {
           oprot.writeI32(struct.values.size());
-          for (ByteBuffer _iter436 : struct.values)
+          for (ByteBuffer _iter452 : struct.values)
           {
-            oprot.writeBinary(_iter436);
+            oprot.writeBinary(_iter452);
           }
         }
       }
@@ -51586,13 +52851,13 @@
         struct.itemId = iprot.readI32();
         struct.setItemIdIsSet(true);
         {
-          org.apache.thrift.protocol.TList _list437 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-          struct.values = new ArrayList<ByteBuffer>(_list437.size);
-          for (int _i438 = 0; _i438 < _list437.size; ++_i438)
+          org.apache.thrift.protocol.TList _list453 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+          struct.values = new ArrayList<ByteBuffer>(_list453.size);
+          for (int _i454 = 0; _i454 < _list453.size; ++_i454)
           {
-            ByteBuffer _elem439;
-            _elem439 = iprot.readBinary();
-            struct.values.add(_elem439);
+            ByteBuffer _elem455;
+            _elem455 = iprot.readBinary();
+            struct.values.add(_elem455);
           }
         }
         struct.setValuesIsSet(true);
@@ -52903,13 +54168,13 @@
             case 2: // VALUES
               if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
                 {
-                  org.apache.thrift.protocol.TList _list440 = iprot.readListBegin();
-                  struct.values = new ArrayList<ByteBuffer>(_list440.size);
-                  for (int _i441 = 0; _i441 < _list440.size; ++_i441)
+                  org.apache.thrift.protocol.TList _list456 = iprot.readListBegin();
+                  struct.values = new ArrayList<ByteBuffer>(_list456.size);
+                  for (int _i457 = 0; _i457 < _list456.size; ++_i457)
                   {
-                    ByteBuffer _elem442;
-                    _elem442 = iprot.readBinary();
-                    struct.values.add(_elem442);
+                    ByteBuffer _elem458;
+                    _elem458 = iprot.readBinary();
+                    struct.values.add(_elem458);
                   }
                   iprot.readListEnd();
                 }
@@ -52951,9 +54216,9 @@
           oprot.writeFieldBegin(VALUES_FIELD_DESC);
           {
             oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.values.size()));
-            for (ByteBuffer _iter443 : struct.values)
+            for (ByteBuffer _iter459 : struct.values)
             {
-              oprot.writeBinary(_iter443);
+              oprot.writeBinary(_iter459);
             }
             oprot.writeListEnd();
           }
@@ -52984,9 +54249,9 @@
         oprot.writeI32(struct.itemId);
         {
           oprot.writeI32(struct.values.size());
-          for (ByteBuffer _iter444 : struct.values)
+          for (ByteBuffer _iter460 : struct.values)
           {
-            oprot.writeBinary(_iter444);
+            oprot.writeBinary(_iter460);
           }
         }
         oprot.writeI32(struct.consistency.getValue());
@@ -52998,13 +54263,13 @@
         struct.itemId = iprot.readI32();
         struct.setItemIdIsSet(true);
         {
-          org.apache.thrift.protocol.TList _list445 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-          struct.values = new ArrayList<ByteBuffer>(_list445.size);
-          for (int _i446 = 0; _i446 < _list445.size; ++_i446)
+          org.apache.thrift.protocol.TList _list461 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+          struct.values = new ArrayList<ByteBuffer>(_list461.size);
+          for (int _i462 = 0; _i462 < _list461.size; ++_i462)
           {
-            ByteBuffer _elem447;
-            _elem447 = iprot.readBinary();
-            struct.values.add(_elem447);
+            ByteBuffer _elem463;
+            _elem463 = iprot.readBinary();
+            struct.values.add(_elem463);
           }
         }
         struct.setValuesIsSet(true);
diff --git a/interface/thrift/gen-java/org/apache/cassandra/thrift/CfDef.java b/interface/thrift/gen-java/org/apache/cassandra/thrift/CfDef.java
index 0bdb500..ec10050 100644
--- a/interface/thrift/gen-java/org/apache/cassandra/thrift/CfDef.java
+++ b/interface/thrift/gen-java/org/apache/cassandra/thrift/CfDef.java
@@ -70,7 +70,6 @@
   private static final org.apache.thrift.protocol.TField ID_FIELD_DESC = new org.apache.thrift.protocol.TField("id", org.apache.thrift.protocol.TType.I32, (short)16);
   private static final org.apache.thrift.protocol.TField MIN_COMPACTION_THRESHOLD_FIELD_DESC = new org.apache.thrift.protocol.TField("min_compaction_threshold", org.apache.thrift.protocol.TType.I32, (short)17);
   private static final org.apache.thrift.protocol.TField MAX_COMPACTION_THRESHOLD_FIELD_DESC = new org.apache.thrift.protocol.TField("max_compaction_threshold", org.apache.thrift.protocol.TType.I32, (short)18);
-  private static final org.apache.thrift.protocol.TField REPLICATE_ON_WRITE_FIELD_DESC = new org.apache.thrift.protocol.TField("replicate_on_write", org.apache.thrift.protocol.TType.BOOL, (short)24);
   private static final org.apache.thrift.protocol.TField KEY_VALIDATION_CLASS_FIELD_DESC = new org.apache.thrift.protocol.TField("key_validation_class", org.apache.thrift.protocol.TType.STRING, (short)26);
   private static final org.apache.thrift.protocol.TField KEY_ALIAS_FIELD_DESC = new org.apache.thrift.protocol.TField("key_alias", org.apache.thrift.protocol.TType.STRING, (short)28);
   private static final org.apache.thrift.protocol.TField COMPACTION_STRATEGY_FIELD_DESC = new org.apache.thrift.protocol.TField("compaction_strategy", org.apache.thrift.protocol.TType.STRING, (short)29);
@@ -79,12 +78,13 @@
   private static final org.apache.thrift.protocol.TField BLOOM_FILTER_FP_CHANCE_FIELD_DESC = new org.apache.thrift.protocol.TField("bloom_filter_fp_chance", org.apache.thrift.protocol.TType.DOUBLE, (short)33);
   private static final org.apache.thrift.protocol.TField CACHING_FIELD_DESC = new org.apache.thrift.protocol.TField("caching", org.apache.thrift.protocol.TType.STRING, (short)34);
   private static final org.apache.thrift.protocol.TField DCLOCAL_READ_REPAIR_CHANCE_FIELD_DESC = new org.apache.thrift.protocol.TField("dclocal_read_repair_chance", org.apache.thrift.protocol.TType.DOUBLE, (short)37);
-  private static final org.apache.thrift.protocol.TField POPULATE_IO_CACHE_ON_FLUSH_FIELD_DESC = new org.apache.thrift.protocol.TField("populate_io_cache_on_flush", org.apache.thrift.protocol.TType.BOOL, (short)38);
   private static final org.apache.thrift.protocol.TField MEMTABLE_FLUSH_PERIOD_IN_MS_FIELD_DESC = new org.apache.thrift.protocol.TField("memtable_flush_period_in_ms", org.apache.thrift.protocol.TType.I32, (short)39);
   private static final org.apache.thrift.protocol.TField DEFAULT_TIME_TO_LIVE_FIELD_DESC = new org.apache.thrift.protocol.TField("default_time_to_live", org.apache.thrift.protocol.TType.I32, (short)40);
-  private static final org.apache.thrift.protocol.TField INDEX_INTERVAL_FIELD_DESC = new org.apache.thrift.protocol.TField("index_interval", org.apache.thrift.protocol.TType.I32, (short)41);
   private static final org.apache.thrift.protocol.TField SPECULATIVE_RETRY_FIELD_DESC = new org.apache.thrift.protocol.TField("speculative_retry", org.apache.thrift.protocol.TType.STRING, (short)42);
   private static final org.apache.thrift.protocol.TField TRIGGERS_FIELD_DESC = new org.apache.thrift.protocol.TField("triggers", org.apache.thrift.protocol.TType.LIST, (short)43);
+  private static final org.apache.thrift.protocol.TField CELLS_PER_ROW_TO_CACHE_FIELD_DESC = new org.apache.thrift.protocol.TField("cells_per_row_to_cache", org.apache.thrift.protocol.TType.STRING, (short)44);
+  private static final org.apache.thrift.protocol.TField MIN_INDEX_INTERVAL_FIELD_DESC = new org.apache.thrift.protocol.TField("min_index_interval", org.apache.thrift.protocol.TType.I32, (short)45);
+  private static final org.apache.thrift.protocol.TField MAX_INDEX_INTERVAL_FIELD_DESC = new org.apache.thrift.protocol.TField("max_index_interval", org.apache.thrift.protocol.TType.I32, (short)46);
   private static final org.apache.thrift.protocol.TField ROW_CACHE_SIZE_FIELD_DESC = new org.apache.thrift.protocol.TField("row_cache_size", org.apache.thrift.protocol.TType.DOUBLE, (short)9);
   private static final org.apache.thrift.protocol.TField KEY_CACHE_SIZE_FIELD_DESC = new org.apache.thrift.protocol.TField("key_cache_size", org.apache.thrift.protocol.TType.DOUBLE, (short)11);
   private static final org.apache.thrift.protocol.TField ROW_CACHE_SAVE_PERIOD_IN_SECONDS_FIELD_DESC = new org.apache.thrift.protocol.TField("row_cache_save_period_in_seconds", org.apache.thrift.protocol.TType.I32, (short)19);
@@ -92,9 +92,12 @@
   private static final org.apache.thrift.protocol.TField MEMTABLE_FLUSH_AFTER_MINS_FIELD_DESC = new org.apache.thrift.protocol.TField("memtable_flush_after_mins", org.apache.thrift.protocol.TType.I32, (short)21);
   private static final org.apache.thrift.protocol.TField MEMTABLE_THROUGHPUT_IN_MB_FIELD_DESC = new org.apache.thrift.protocol.TField("memtable_throughput_in_mb", org.apache.thrift.protocol.TType.I32, (short)22);
   private static final org.apache.thrift.protocol.TField MEMTABLE_OPERATIONS_IN_MILLIONS_FIELD_DESC = new org.apache.thrift.protocol.TField("memtable_operations_in_millions", org.apache.thrift.protocol.TType.DOUBLE, (short)23);
+  private static final org.apache.thrift.protocol.TField REPLICATE_ON_WRITE_FIELD_DESC = new org.apache.thrift.protocol.TField("replicate_on_write", org.apache.thrift.protocol.TType.BOOL, (short)24);
   private static final org.apache.thrift.protocol.TField MERGE_SHARDS_CHANCE_FIELD_DESC = new org.apache.thrift.protocol.TField("merge_shards_chance", org.apache.thrift.protocol.TType.DOUBLE, (short)25);
   private static final org.apache.thrift.protocol.TField ROW_CACHE_PROVIDER_FIELD_DESC = new org.apache.thrift.protocol.TField("row_cache_provider", org.apache.thrift.protocol.TType.STRING, (short)27);
   private static final org.apache.thrift.protocol.TField ROW_CACHE_KEYS_TO_SAVE_FIELD_DESC = new org.apache.thrift.protocol.TField("row_cache_keys_to_save", org.apache.thrift.protocol.TType.I32, (short)31);
+  private static final org.apache.thrift.protocol.TField POPULATE_IO_CACHE_ON_FLUSH_FIELD_DESC = new org.apache.thrift.protocol.TField("populate_io_cache_on_flush", org.apache.thrift.protocol.TType.BOOL, (short)38);
+  private static final org.apache.thrift.protocol.TField INDEX_INTERVAL_FIELD_DESC = new org.apache.thrift.protocol.TField("index_interval", org.apache.thrift.protocol.TType.I32, (short)41);
 
   private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
   static {
@@ -115,7 +118,6 @@
   public int id; // optional
   public int min_compaction_threshold; // optional
   public int max_compaction_threshold; // optional
-  public boolean replicate_on_write; // optional
   public String key_validation_class; // optional
   public ByteBuffer key_alias; // optional
   public String compaction_strategy; // optional
@@ -124,12 +126,13 @@
   public double bloom_filter_fp_chance; // optional
   public String caching; // optional
   public double dclocal_read_repair_chance; // optional
-  public boolean populate_io_cache_on_flush; // optional
   public int memtable_flush_period_in_ms; // optional
   public int default_time_to_live; // optional
-  public int index_interval; // optional
   public String speculative_retry; // optional
   public List<TriggerDef> triggers; // optional
+  public String cells_per_row_to_cache; // optional
+  public int min_index_interval; // optional
+  public int max_index_interval; // optional
   /**
    * @deprecated
    */
@@ -161,6 +164,10 @@
   /**
    * @deprecated
    */
+  public boolean replicate_on_write; // optional
+  /**
+   * @deprecated
+   */
   public double merge_shards_chance; // optional
   /**
    * @deprecated
@@ -170,6 +177,14 @@
    * @deprecated
    */
   public int row_cache_keys_to_save; // optional
+  /**
+   * @deprecated
+   */
+  public boolean populate_io_cache_on_flush; // optional
+  /**
+   * @deprecated
+   */
+  public int index_interval; // optional
 
   /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
   public enum _Fields implements org.apache.thrift.TFieldIdEnum {
@@ -186,7 +201,6 @@
     ID((short)16, "id"),
     MIN_COMPACTION_THRESHOLD((short)17, "min_compaction_threshold"),
     MAX_COMPACTION_THRESHOLD((short)18, "max_compaction_threshold"),
-    REPLICATE_ON_WRITE((short)24, "replicate_on_write"),
     KEY_VALIDATION_CLASS((short)26, "key_validation_class"),
     KEY_ALIAS((short)28, "key_alias"),
     COMPACTION_STRATEGY((short)29, "compaction_strategy"),
@@ -195,12 +209,13 @@
     BLOOM_FILTER_FP_CHANCE((short)33, "bloom_filter_fp_chance"),
     CACHING((short)34, "caching"),
     DCLOCAL_READ_REPAIR_CHANCE((short)37, "dclocal_read_repair_chance"),
-    POPULATE_IO_CACHE_ON_FLUSH((short)38, "populate_io_cache_on_flush"),
     MEMTABLE_FLUSH_PERIOD_IN_MS((short)39, "memtable_flush_period_in_ms"),
     DEFAULT_TIME_TO_LIVE((short)40, "default_time_to_live"),
-    INDEX_INTERVAL((short)41, "index_interval"),
     SPECULATIVE_RETRY((short)42, "speculative_retry"),
     TRIGGERS((short)43, "triggers"),
+    CELLS_PER_ROW_TO_CACHE((short)44, "cells_per_row_to_cache"),
+    MIN_INDEX_INTERVAL((short)45, "min_index_interval"),
+    MAX_INDEX_INTERVAL((short)46, "max_index_interval"),
     /**
      * @deprecated
      */
@@ -232,6 +247,10 @@
     /**
      * @deprecated
      */
+    REPLICATE_ON_WRITE((short)24, "replicate_on_write"),
+    /**
+     * @deprecated
+     */
     MERGE_SHARDS_CHANCE((short)25, "merge_shards_chance"),
     /**
      * @deprecated
@@ -240,7 +259,15 @@
     /**
      * @deprecated
      */
-    ROW_CACHE_KEYS_TO_SAVE((short)31, "row_cache_keys_to_save");
+    ROW_CACHE_KEYS_TO_SAVE((short)31, "row_cache_keys_to_save"),
+    /**
+     * @deprecated
+     */
+    POPULATE_IO_CACHE_ON_FLUSH((short)38, "populate_io_cache_on_flush"),
+    /**
+     * @deprecated
+     */
+    INDEX_INTERVAL((short)41, "index_interval");
 
     private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
 
@@ -281,8 +308,6 @@
           return MIN_COMPACTION_THRESHOLD;
         case 18: // MAX_COMPACTION_THRESHOLD
           return MAX_COMPACTION_THRESHOLD;
-        case 24: // REPLICATE_ON_WRITE
-          return REPLICATE_ON_WRITE;
         case 26: // KEY_VALIDATION_CLASS
           return KEY_VALIDATION_CLASS;
         case 28: // KEY_ALIAS
@@ -299,18 +324,20 @@
           return CACHING;
         case 37: // DCLOCAL_READ_REPAIR_CHANCE
           return DCLOCAL_READ_REPAIR_CHANCE;
-        case 38: // POPULATE_IO_CACHE_ON_FLUSH
-          return POPULATE_IO_CACHE_ON_FLUSH;
         case 39: // MEMTABLE_FLUSH_PERIOD_IN_MS
           return MEMTABLE_FLUSH_PERIOD_IN_MS;
         case 40: // DEFAULT_TIME_TO_LIVE
           return DEFAULT_TIME_TO_LIVE;
-        case 41: // INDEX_INTERVAL
-          return INDEX_INTERVAL;
         case 42: // SPECULATIVE_RETRY
           return SPECULATIVE_RETRY;
         case 43: // TRIGGERS
           return TRIGGERS;
+        case 44: // CELLS_PER_ROW_TO_CACHE
+          return CELLS_PER_ROW_TO_CACHE;
+        case 45: // MIN_INDEX_INTERVAL
+          return MIN_INDEX_INTERVAL;
+        case 46: // MAX_INDEX_INTERVAL
+          return MAX_INDEX_INTERVAL;
         case 9: // ROW_CACHE_SIZE
           return ROW_CACHE_SIZE;
         case 11: // KEY_CACHE_SIZE
@@ -325,12 +352,18 @@
           return MEMTABLE_THROUGHPUT_IN_MB;
         case 23: // MEMTABLE_OPERATIONS_IN_MILLIONS
           return MEMTABLE_OPERATIONS_IN_MILLIONS;
+        case 24: // REPLICATE_ON_WRITE
+          return REPLICATE_ON_WRITE;
         case 25: // MERGE_SHARDS_CHANCE
           return MERGE_SHARDS_CHANCE;
         case 27: // ROW_CACHE_PROVIDER
           return ROW_CACHE_PROVIDER;
         case 31: // ROW_CACHE_KEYS_TO_SAVE
           return ROW_CACHE_KEYS_TO_SAVE;
+        case 38: // POPULATE_IO_CACHE_ON_FLUSH
+          return POPULATE_IO_CACHE_ON_FLUSH;
+        case 41: // INDEX_INTERVAL
+          return INDEX_INTERVAL;
         default:
           return null;
       }
@@ -376,24 +409,26 @@
   private static final int __ID_ISSET_ID = 2;
   private static final int __MIN_COMPACTION_THRESHOLD_ISSET_ID = 3;
   private static final int __MAX_COMPACTION_THRESHOLD_ISSET_ID = 4;
-  private static final int __REPLICATE_ON_WRITE_ISSET_ID = 5;
-  private static final int __BLOOM_FILTER_FP_CHANCE_ISSET_ID = 6;
-  private static final int __DCLOCAL_READ_REPAIR_CHANCE_ISSET_ID = 7;
-  private static final int __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID = 8;
-  private static final int __MEMTABLE_FLUSH_PERIOD_IN_MS_ISSET_ID = 9;
-  private static final int __DEFAULT_TIME_TO_LIVE_ISSET_ID = 10;
-  private static final int __INDEX_INTERVAL_ISSET_ID = 11;
-  private static final int __ROW_CACHE_SIZE_ISSET_ID = 12;
-  private static final int __KEY_CACHE_SIZE_ISSET_ID = 13;
-  private static final int __ROW_CACHE_SAVE_PERIOD_IN_SECONDS_ISSET_ID = 14;
-  private static final int __KEY_CACHE_SAVE_PERIOD_IN_SECONDS_ISSET_ID = 15;
-  private static final int __MEMTABLE_FLUSH_AFTER_MINS_ISSET_ID = 16;
-  private static final int __MEMTABLE_THROUGHPUT_IN_MB_ISSET_ID = 17;
-  private static final int __MEMTABLE_OPERATIONS_IN_MILLIONS_ISSET_ID = 18;
+  private static final int __BLOOM_FILTER_FP_CHANCE_ISSET_ID = 5;
+  private static final int __DCLOCAL_READ_REPAIR_CHANCE_ISSET_ID = 6;
+  private static final int __MEMTABLE_FLUSH_PERIOD_IN_MS_ISSET_ID = 7;
+  private static final int __DEFAULT_TIME_TO_LIVE_ISSET_ID = 8;
+  private static final int __MIN_INDEX_INTERVAL_ISSET_ID = 9;
+  private static final int __MAX_INDEX_INTERVAL_ISSET_ID = 10;
+  private static final int __ROW_CACHE_SIZE_ISSET_ID = 11;
+  private static final int __KEY_CACHE_SIZE_ISSET_ID = 12;
+  private static final int __ROW_CACHE_SAVE_PERIOD_IN_SECONDS_ISSET_ID = 13;
+  private static final int __KEY_CACHE_SAVE_PERIOD_IN_SECONDS_ISSET_ID = 14;
+  private static final int __MEMTABLE_FLUSH_AFTER_MINS_ISSET_ID = 15;
+  private static final int __MEMTABLE_THROUGHPUT_IN_MB_ISSET_ID = 16;
+  private static final int __MEMTABLE_OPERATIONS_IN_MILLIONS_ISSET_ID = 17;
+  private static final int __REPLICATE_ON_WRITE_ISSET_ID = 18;
   private static final int __MERGE_SHARDS_CHANCE_ISSET_ID = 19;
   private static final int __ROW_CACHE_KEYS_TO_SAVE_ISSET_ID = 20;
+  private static final int __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID = 21;
+  private static final int __INDEX_INTERVAL_ISSET_ID = 22;
   private int __isset_bitfield = 0;
-  private _Fields optionals[] = {_Fields.COLUMN_TYPE,_Fields.COMPARATOR_TYPE,_Fields.SUBCOMPARATOR_TYPE,_Fields.COMMENT,_Fields.READ_REPAIR_CHANCE,_Fields.COLUMN_METADATA,_Fields.GC_GRACE_SECONDS,_Fields.DEFAULT_VALIDATION_CLASS,_Fields.ID,_Fields.MIN_COMPACTION_THRESHOLD,_Fields.MAX_COMPACTION_THRESHOLD,_Fields.REPLICATE_ON_WRITE,_Fields.KEY_VALIDATION_CLASS,_Fields.KEY_ALIAS,_Fields.COMPACTION_STRATEGY,_Fields.COMPACTION_STRATEGY_OPTIONS,_Fields.COMPRESSION_OPTIONS,_Fields.BLOOM_FILTER_FP_CHANCE,_Fields.CACHING,_Fields.DCLOCAL_READ_REPAIR_CHANCE,_Fields.POPULATE_IO_CACHE_ON_FLUSH,_Fields.MEMTABLE_FLUSH_PERIOD_IN_MS,_Fields.DEFAULT_TIME_TO_LIVE,_Fields.INDEX_INTERVAL,_Fields.SPECULATIVE_RETRY,_Fields.TRIGGERS,_Fields.ROW_CACHE_SIZE,_Fields.KEY_CACHE_SIZE,_Fields.ROW_CACHE_SAVE_PERIOD_IN_SECONDS,_Fields.KEY_CACHE_SAVE_PERIOD_IN_SECONDS,_Fields.MEMTABLE_FLUSH_AFTER_MINS,_Fields.MEMTABLE_THROUGHPUT_IN_MB,_Fields.MEMTABLE_OPERATIONS_IN_MILLIONS,_Fields.MERGE_SHARDS_CHANCE,_Fields.ROW_CACHE_PROVIDER,_Fields.ROW_CACHE_KEYS_TO_SAVE};
+  private _Fields optionals[] = {_Fields.COLUMN_TYPE,_Fields.COMPARATOR_TYPE,_Fields.SUBCOMPARATOR_TYPE,_Fields.COMMENT,_Fields.READ_REPAIR_CHANCE,_Fields.COLUMN_METADATA,_Fields.GC_GRACE_SECONDS,_Fields.DEFAULT_VALIDATION_CLASS,_Fields.ID,_Fields.MIN_COMPACTION_THRESHOLD,_Fields.MAX_COMPACTION_THRESHOLD,_Fields.KEY_VALIDATION_CLASS,_Fields.KEY_ALIAS,_Fields.COMPACTION_STRATEGY,_Fields.COMPACTION_STRATEGY_OPTIONS,_Fields.COMPRESSION_OPTIONS,_Fields.BLOOM_FILTER_FP_CHANCE,_Fields.CACHING,_Fields.DCLOCAL_READ_REPAIR_CHANCE,_Fields.MEMTABLE_FLUSH_PERIOD_IN_MS,_Fields.DEFAULT_TIME_TO_LIVE,_Fields.SPECULATIVE_RETRY,_Fields.TRIGGERS,_Fields.CELLS_PER_ROW_TO_CACHE,_Fields.MIN_INDEX_INTERVAL,_Fields.MAX_INDEX_INTERVAL,_Fields.ROW_CACHE_SIZE,_Fields.KEY_CACHE_SIZE,_Fields.ROW_CACHE_SAVE_PERIOD_IN_SECONDS,_Fields.KEY_CACHE_SAVE_PERIOD_IN_SECONDS,_Fields.MEMTABLE_FLUSH_AFTER_MINS,_Fields.MEMTABLE_THROUGHPUT_IN_MB,_Fields.MEMTABLE_OPERATIONS_IN_MILLIONS,_Fields.REPLICATE_ON_WRITE,_Fields.MERGE_SHARDS_CHANCE,_Fields.ROW_CACHE_PROVIDER,_Fields.ROW_CACHE_KEYS_TO_SAVE,_Fields.POPULATE_IO_CACHE_ON_FLUSH,_Fields.INDEX_INTERVAL};
   public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
   static {
     Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
@@ -424,8 +459,6 @@
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     tmpMap.put(_Fields.MAX_COMPACTION_THRESHOLD, new org.apache.thrift.meta_data.FieldMetaData("max_compaction_threshold", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
-    tmpMap.put(_Fields.REPLICATE_ON_WRITE, new org.apache.thrift.meta_data.FieldMetaData("replicate_on_write", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
     tmpMap.put(_Fields.KEY_VALIDATION_CLASS, new org.apache.thrift.meta_data.FieldMetaData("key_validation_class", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
     tmpMap.put(_Fields.KEY_ALIAS, new org.apache.thrift.meta_data.FieldMetaData("key_alias", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
@@ -446,19 +479,21 @@
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
     tmpMap.put(_Fields.DCLOCAL_READ_REPAIR_CHANCE, new org.apache.thrift.meta_data.FieldMetaData("dclocal_read_repair_chance", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
-    tmpMap.put(_Fields.POPULATE_IO_CACHE_ON_FLUSH, new org.apache.thrift.meta_data.FieldMetaData("populate_io_cache_on_flush", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
     tmpMap.put(_Fields.MEMTABLE_FLUSH_PERIOD_IN_MS, new org.apache.thrift.meta_data.FieldMetaData("memtable_flush_period_in_ms", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     tmpMap.put(_Fields.DEFAULT_TIME_TO_LIVE, new org.apache.thrift.meta_data.FieldMetaData("default_time_to_live", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
-    tmpMap.put(_Fields.INDEX_INTERVAL, new org.apache.thrift.meta_data.FieldMetaData("index_interval", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     tmpMap.put(_Fields.SPECULATIVE_RETRY, new org.apache.thrift.meta_data.FieldMetaData("speculative_retry", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
     tmpMap.put(_Fields.TRIGGERS, new org.apache.thrift.meta_data.FieldMetaData("triggers", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
             new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, TriggerDef.class))));
+    tmpMap.put(_Fields.CELLS_PER_ROW_TO_CACHE, new org.apache.thrift.meta_data.FieldMetaData("cells_per_row_to_cache", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
+    tmpMap.put(_Fields.MIN_INDEX_INTERVAL, new org.apache.thrift.meta_data.FieldMetaData("min_index_interval", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+    tmpMap.put(_Fields.MAX_INDEX_INTERVAL, new org.apache.thrift.meta_data.FieldMetaData("max_index_interval", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     tmpMap.put(_Fields.ROW_CACHE_SIZE, new org.apache.thrift.meta_data.FieldMetaData("row_cache_size", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
     tmpMap.put(_Fields.KEY_CACHE_SIZE, new org.apache.thrift.meta_data.FieldMetaData("key_cache_size", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
@@ -473,12 +508,18 @@
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     tmpMap.put(_Fields.MEMTABLE_OPERATIONS_IN_MILLIONS, new org.apache.thrift.meta_data.FieldMetaData("memtable_operations_in_millions", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
+    tmpMap.put(_Fields.REPLICATE_ON_WRITE, new org.apache.thrift.meta_data.FieldMetaData("replicate_on_write", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
     tmpMap.put(_Fields.MERGE_SHARDS_CHANCE, new org.apache.thrift.meta_data.FieldMetaData("merge_shards_chance", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
     tmpMap.put(_Fields.ROW_CACHE_PROVIDER, new org.apache.thrift.meta_data.FieldMetaData("row_cache_provider", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
     tmpMap.put(_Fields.ROW_CACHE_KEYS_TO_SAVE, new org.apache.thrift.meta_data.FieldMetaData("row_cache_keys_to_save", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
         new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+    tmpMap.put(_Fields.POPULATE_IO_CACHE_ON_FLUSH, new org.apache.thrift.meta_data.FieldMetaData("populate_io_cache_on_flush", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
+    tmpMap.put(_Fields.INDEX_INTERVAL, new org.apache.thrift.meta_data.FieldMetaData("index_interval", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
     metaDataMap = Collections.unmodifiableMap(tmpMap);
     org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(CfDef.class, metaDataMap);
   }
@@ -494,6 +535,8 @@
 
     this.speculative_retry = "NONE";
 
+    this.cells_per_row_to_cache = "100";
+
   }
 
   public CfDef(
@@ -543,7 +586,6 @@
     this.id = other.id;
     this.min_compaction_threshold = other.min_compaction_threshold;
     this.max_compaction_threshold = other.max_compaction_threshold;
-    this.replicate_on_write = other.replicate_on_write;
     if (other.isSetKey_validation_class()) {
       this.key_validation_class = other.key_validation_class;
     }
@@ -567,10 +609,8 @@
       this.caching = other.caching;
     }
     this.dclocal_read_repair_chance = other.dclocal_read_repair_chance;
-    this.populate_io_cache_on_flush = other.populate_io_cache_on_flush;
     this.memtable_flush_period_in_ms = other.memtable_flush_period_in_ms;
     this.default_time_to_live = other.default_time_to_live;
-    this.index_interval = other.index_interval;
     if (other.isSetSpeculative_retry()) {
       this.speculative_retry = other.speculative_retry;
     }
@@ -581,6 +621,11 @@
       }
       this.triggers = __this__triggers;
     }
+    if (other.isSetCells_per_row_to_cache()) {
+      this.cells_per_row_to_cache = other.cells_per_row_to_cache;
+    }
+    this.min_index_interval = other.min_index_interval;
+    this.max_index_interval = other.max_index_interval;
     this.row_cache_size = other.row_cache_size;
     this.key_cache_size = other.key_cache_size;
     this.row_cache_save_period_in_seconds = other.row_cache_save_period_in_seconds;
@@ -588,11 +633,14 @@
     this.memtable_flush_after_mins = other.memtable_flush_after_mins;
     this.memtable_throughput_in_mb = other.memtable_throughput_in_mb;
     this.memtable_operations_in_millions = other.memtable_operations_in_millions;
+    this.replicate_on_write = other.replicate_on_write;
     this.merge_shards_chance = other.merge_shards_chance;
     if (other.isSetRow_cache_provider()) {
       this.row_cache_provider = other.row_cache_provider;
     }
     this.row_cache_keys_to_save = other.row_cache_keys_to_save;
+    this.populate_io_cache_on_flush = other.populate_io_cache_on_flush;
+    this.index_interval = other.index_interval;
   }
 
   public CfDef deepCopy() {
@@ -621,8 +669,6 @@
     this.min_compaction_threshold = 0;
     setMax_compaction_thresholdIsSet(false);
     this.max_compaction_threshold = 0;
-    setReplicate_on_writeIsSet(false);
-    this.replicate_on_write = false;
     this.key_validation_class = null;
     this.key_alias = null;
     this.compaction_strategy = null;
@@ -634,17 +680,19 @@
 
     this.dclocal_read_repair_chance = 0;
 
-    setPopulate_io_cache_on_flushIsSet(false);
-    this.populate_io_cache_on_flush = false;
     setMemtable_flush_period_in_msIsSet(false);
     this.memtable_flush_period_in_ms = 0;
     setDefault_time_to_liveIsSet(false);
     this.default_time_to_live = 0;
-    setIndex_intervalIsSet(false);
-    this.index_interval = 0;
     this.speculative_retry = "NONE";
 
     this.triggers = null;
+    this.cells_per_row_to_cache = "100";
+
+    setMin_index_intervalIsSet(false);
+    this.min_index_interval = 0;
+    setMax_index_intervalIsSet(false);
+    this.max_index_interval = 0;
     setRow_cache_sizeIsSet(false);
     this.row_cache_size = 0.0;
     setKey_cache_sizeIsSet(false);
@@ -659,11 +707,17 @@
     this.memtable_throughput_in_mb = 0;
     setMemtable_operations_in_millionsIsSet(false);
     this.memtable_operations_in_millions = 0.0;
+    setReplicate_on_writeIsSet(false);
+    this.replicate_on_write = false;
     setMerge_shards_chanceIsSet(false);
     this.merge_shards_chance = 0.0;
     this.row_cache_provider = null;
     setRow_cache_keys_to_saveIsSet(false);
     this.row_cache_keys_to_save = 0;
+    setPopulate_io_cache_on_flushIsSet(false);
+    this.populate_io_cache_on_flush = false;
+    setIndex_intervalIsSet(false);
+    this.index_interval = 0;
   }
 
   public String getKeyspace() {
@@ -988,29 +1042,6 @@
     __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAX_COMPACTION_THRESHOLD_ISSET_ID, value);
   }
 
-  public boolean isReplicate_on_write() {
-    return this.replicate_on_write;
-  }
-
-  public CfDef setReplicate_on_write(boolean replicate_on_write) {
-    this.replicate_on_write = replicate_on_write;
-    setReplicate_on_writeIsSet(true);
-    return this;
-  }
-
-  public void unsetReplicate_on_write() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID);
-  }
-
-  /** Returns true if field replicate_on_write is set (has been assigned a value) and false otherwise */
-  public boolean isSetReplicate_on_write() {
-    return EncodingUtils.testBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID);
-  }
-
-  public void setReplicate_on_writeIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID, value);
-  }
-
   public String getKey_validation_class() {
     return this.key_validation_class;
   }
@@ -1233,29 +1264,6 @@
     __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __DCLOCAL_READ_REPAIR_CHANCE_ISSET_ID, value);
   }
 
-  public boolean isPopulate_io_cache_on_flush() {
-    return this.populate_io_cache_on_flush;
-  }
-
-  public CfDef setPopulate_io_cache_on_flush(boolean populate_io_cache_on_flush) {
-    this.populate_io_cache_on_flush = populate_io_cache_on_flush;
-    setPopulate_io_cache_on_flushIsSet(true);
-    return this;
-  }
-
-  public void unsetPopulate_io_cache_on_flush() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID);
-  }
-
-  /** Returns true if field populate_io_cache_on_flush is set (has been assigned a value) and false otherwise */
-  public boolean isSetPopulate_io_cache_on_flush() {
-    return EncodingUtils.testBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID);
-  }
-
-  public void setPopulate_io_cache_on_flushIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID, value);
-  }
-
   public int getMemtable_flush_period_in_ms() {
     return this.memtable_flush_period_in_ms;
   }
@@ -1302,29 +1310,6 @@
     __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __DEFAULT_TIME_TO_LIVE_ISSET_ID, value);
   }
 
-  public int getIndex_interval() {
-    return this.index_interval;
-  }
-
-  public CfDef setIndex_interval(int index_interval) {
-    this.index_interval = index_interval;
-    setIndex_intervalIsSet(true);
-    return this;
-  }
-
-  public void unsetIndex_interval() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID);
-  }
-
-  /** Returns true if field index_interval is set (has been assigned a value) and false otherwise */
-  public boolean isSetIndex_interval() {
-    return EncodingUtils.testBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID);
-  }
-
-  public void setIndex_intervalIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID, value);
-  }
-
   public String getSpeculative_retry() {
     return this.speculative_retry;
   }
@@ -1388,6 +1373,76 @@
     }
   }
 
+  public String getCells_per_row_to_cache() {
+    return this.cells_per_row_to_cache;
+  }
+
+  public CfDef setCells_per_row_to_cache(String cells_per_row_to_cache) {
+    this.cells_per_row_to_cache = cells_per_row_to_cache;
+    return this;
+  }
+
+  public void unsetCells_per_row_to_cache() {
+    this.cells_per_row_to_cache = null;
+  }
+
+  /** Returns true if field cells_per_row_to_cache is set (has been assigned a value) and false otherwise */
+  public boolean isSetCells_per_row_to_cache() {
+    return this.cells_per_row_to_cache != null;
+  }
+
+  public void setCells_per_row_to_cacheIsSet(boolean value) {
+    if (!value) {
+      this.cells_per_row_to_cache = null;
+    }
+  }
+
+  public int getMin_index_interval() {
+    return this.min_index_interval;
+  }
+
+  public CfDef setMin_index_interval(int min_index_interval) {
+    this.min_index_interval = min_index_interval;
+    setMin_index_intervalIsSet(true);
+    return this;
+  }
+
+  public void unsetMin_index_interval() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MIN_INDEX_INTERVAL_ISSET_ID);
+  }
+
+  /** Returns true if field min_index_interval is set (has been assigned a value) and false otherwise */
+  public boolean isSetMin_index_interval() {
+    return EncodingUtils.testBit(__isset_bitfield, __MIN_INDEX_INTERVAL_ISSET_ID);
+  }
+
+  public void setMin_index_intervalIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MIN_INDEX_INTERVAL_ISSET_ID, value);
+  }
+
+  public int getMax_index_interval() {
+    return this.max_index_interval;
+  }
+
+  public CfDef setMax_index_interval(int max_index_interval) {
+    this.max_index_interval = max_index_interval;
+    setMax_index_intervalIsSet(true);
+    return this;
+  }
+
+  public void unsetMax_index_interval() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAX_INDEX_INTERVAL_ISSET_ID);
+  }
+
+  /** Returns true if field max_index_interval is set (has been assigned a value) and false otherwise */
+  public boolean isSetMax_index_interval() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAX_INDEX_INTERVAL_ISSET_ID);
+  }
+
+  public void setMax_index_intervalIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAX_INDEX_INTERVAL_ISSET_ID, value);
+  }
+
   /**
    * @deprecated
    */
@@ -1594,6 +1649,35 @@
   /**
    * @deprecated
    */
+  public boolean isReplicate_on_write() {
+    return this.replicate_on_write;
+  }
+
+  /**
+   * @deprecated
+   */
+  public CfDef setReplicate_on_write(boolean replicate_on_write) {
+    this.replicate_on_write = replicate_on_write;
+    setReplicate_on_writeIsSet(true);
+    return this;
+  }
+
+  public void unsetReplicate_on_write() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID);
+  }
+
+  /** Returns true if field replicate_on_write is set (has been assigned a value) and false otherwise */
+  public boolean isSetReplicate_on_write() {
+    return EncodingUtils.testBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID);
+  }
+
+  public void setReplicate_on_writeIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __REPLICATE_ON_WRITE_ISSET_ID, value);
+  }
+
+  /**
+   * @deprecated
+   */
   public double getMerge_shards_chance() {
     return this.merge_shards_chance;
   }
@@ -1679,6 +1763,64 @@
     __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __ROW_CACHE_KEYS_TO_SAVE_ISSET_ID, value);
   }
 
+  /**
+   * @deprecated
+   */
+  public boolean isPopulate_io_cache_on_flush() {
+    return this.populate_io_cache_on_flush;
+  }
+
+  /**
+   * @deprecated
+   */
+  public CfDef setPopulate_io_cache_on_flush(boolean populate_io_cache_on_flush) {
+    this.populate_io_cache_on_flush = populate_io_cache_on_flush;
+    setPopulate_io_cache_on_flushIsSet(true);
+    return this;
+  }
+
+  public void unsetPopulate_io_cache_on_flush() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID);
+  }
+
+  /** Returns true if field populate_io_cache_on_flush is set (has been assigned a value) and false otherwise */
+  public boolean isSetPopulate_io_cache_on_flush() {
+    return EncodingUtils.testBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID);
+  }
+
+  public void setPopulate_io_cache_on_flushIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __POPULATE_IO_CACHE_ON_FLUSH_ISSET_ID, value);
+  }
+
+  /**
+   * @deprecated
+   */
+  public int getIndex_interval() {
+    return this.index_interval;
+  }
+
+  /**
+   * @deprecated
+   */
+  public CfDef setIndex_interval(int index_interval) {
+    this.index_interval = index_interval;
+    setIndex_intervalIsSet(true);
+    return this;
+  }
+
+  public void unsetIndex_interval() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID);
+  }
+
+  /** Returns true if field index_interval is set (has been assigned a value) and false otherwise */
+  public boolean isSetIndex_interval() {
+    return EncodingUtils.testBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID);
+  }
+
+  public void setIndex_intervalIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __INDEX_INTERVAL_ISSET_ID, value);
+  }
+
   public void setFieldValue(_Fields field, Object value) {
     switch (field) {
     case KEYSPACE:
@@ -1785,14 +1927,6 @@
       }
       break;
 
-    case REPLICATE_ON_WRITE:
-      if (value == null) {
-        unsetReplicate_on_write();
-      } else {
-        setReplicate_on_write((Boolean)value);
-      }
-      break;
-
     case KEY_VALIDATION_CLASS:
       if (value == null) {
         unsetKey_validation_class();
@@ -1857,14 +1991,6 @@
       }
       break;
 
-    case POPULATE_IO_CACHE_ON_FLUSH:
-      if (value == null) {
-        unsetPopulate_io_cache_on_flush();
-      } else {
-        setPopulate_io_cache_on_flush((Boolean)value);
-      }
-      break;
-
     case MEMTABLE_FLUSH_PERIOD_IN_MS:
       if (value == null) {
         unsetMemtable_flush_period_in_ms();
@@ -1881,14 +2007,6 @@
       }
       break;
 
-    case INDEX_INTERVAL:
-      if (value == null) {
-        unsetIndex_interval();
-      } else {
-        setIndex_interval((Integer)value);
-      }
-      break;
-
     case SPECULATIVE_RETRY:
       if (value == null) {
         unsetSpeculative_retry();
@@ -1905,6 +2023,30 @@
       }
       break;
 
+    case CELLS_PER_ROW_TO_CACHE:
+      if (value == null) {
+        unsetCells_per_row_to_cache();
+      } else {
+        setCells_per_row_to_cache((String)value);
+      }
+      break;
+
+    case MIN_INDEX_INTERVAL:
+      if (value == null) {
+        unsetMin_index_interval();
+      } else {
+        setMin_index_interval((Integer)value);
+      }
+      break;
+
+    case MAX_INDEX_INTERVAL:
+      if (value == null) {
+        unsetMax_index_interval();
+      } else {
+        setMax_index_interval((Integer)value);
+      }
+      break;
+
     case ROW_CACHE_SIZE:
       if (value == null) {
         unsetRow_cache_size();
@@ -1961,6 +2103,14 @@
       }
       break;
 
+    case REPLICATE_ON_WRITE:
+      if (value == null) {
+        unsetReplicate_on_write();
+      } else {
+        setReplicate_on_write((Boolean)value);
+      }
+      break;
+
     case MERGE_SHARDS_CHANCE:
       if (value == null) {
         unsetMerge_shards_chance();
@@ -1985,6 +2135,22 @@
       }
       break;
 
+    case POPULATE_IO_CACHE_ON_FLUSH:
+      if (value == null) {
+        unsetPopulate_io_cache_on_flush();
+      } else {
+        setPopulate_io_cache_on_flush((Boolean)value);
+      }
+      break;
+
+    case INDEX_INTERVAL:
+      if (value == null) {
+        unsetIndex_interval();
+      } else {
+        setIndex_interval((Integer)value);
+      }
+      break;
+
     }
   }
 
@@ -2029,9 +2195,6 @@
     case MAX_COMPACTION_THRESHOLD:
       return Integer.valueOf(getMax_compaction_threshold());
 
-    case REPLICATE_ON_WRITE:
-      return Boolean.valueOf(isReplicate_on_write());
-
     case KEY_VALIDATION_CLASS:
       return getKey_validation_class();
 
@@ -2056,24 +2219,27 @@
     case DCLOCAL_READ_REPAIR_CHANCE:
       return Double.valueOf(getDclocal_read_repair_chance());
 
-    case POPULATE_IO_CACHE_ON_FLUSH:
-      return Boolean.valueOf(isPopulate_io_cache_on_flush());
-
     case MEMTABLE_FLUSH_PERIOD_IN_MS:
       return Integer.valueOf(getMemtable_flush_period_in_ms());
 
     case DEFAULT_TIME_TO_LIVE:
       return Integer.valueOf(getDefault_time_to_live());
 
-    case INDEX_INTERVAL:
-      return Integer.valueOf(getIndex_interval());
-
     case SPECULATIVE_RETRY:
       return getSpeculative_retry();
 
     case TRIGGERS:
       return getTriggers();
 
+    case CELLS_PER_ROW_TO_CACHE:
+      return getCells_per_row_to_cache();
+
+    case MIN_INDEX_INTERVAL:
+      return Integer.valueOf(getMin_index_interval());
+
+    case MAX_INDEX_INTERVAL:
+      return Integer.valueOf(getMax_index_interval());
+
     case ROW_CACHE_SIZE:
       return Double.valueOf(getRow_cache_size());
 
@@ -2095,6 +2261,9 @@
     case MEMTABLE_OPERATIONS_IN_MILLIONS:
       return Double.valueOf(getMemtable_operations_in_millions());
 
+    case REPLICATE_ON_WRITE:
+      return Boolean.valueOf(isReplicate_on_write());
+
     case MERGE_SHARDS_CHANCE:
       return Double.valueOf(getMerge_shards_chance());
 
@@ -2104,6 +2273,12 @@
     case ROW_CACHE_KEYS_TO_SAVE:
       return Integer.valueOf(getRow_cache_keys_to_save());
 
+    case POPULATE_IO_CACHE_ON_FLUSH:
+      return Boolean.valueOf(isPopulate_io_cache_on_flush());
+
+    case INDEX_INTERVAL:
+      return Integer.valueOf(getIndex_interval());
+
     }
     throw new IllegalStateException();
   }
@@ -2141,8 +2316,6 @@
       return isSetMin_compaction_threshold();
     case MAX_COMPACTION_THRESHOLD:
       return isSetMax_compaction_threshold();
-    case REPLICATE_ON_WRITE:
-      return isSetReplicate_on_write();
     case KEY_VALIDATION_CLASS:
       return isSetKey_validation_class();
     case KEY_ALIAS:
@@ -2159,18 +2332,20 @@
       return isSetCaching();
     case DCLOCAL_READ_REPAIR_CHANCE:
       return isSetDclocal_read_repair_chance();
-    case POPULATE_IO_CACHE_ON_FLUSH:
-      return isSetPopulate_io_cache_on_flush();
     case MEMTABLE_FLUSH_PERIOD_IN_MS:
       return isSetMemtable_flush_period_in_ms();
     case DEFAULT_TIME_TO_LIVE:
       return isSetDefault_time_to_live();
-    case INDEX_INTERVAL:
-      return isSetIndex_interval();
     case SPECULATIVE_RETRY:
       return isSetSpeculative_retry();
     case TRIGGERS:
       return isSetTriggers();
+    case CELLS_PER_ROW_TO_CACHE:
+      return isSetCells_per_row_to_cache();
+    case MIN_INDEX_INTERVAL:
+      return isSetMin_index_interval();
+    case MAX_INDEX_INTERVAL:
+      return isSetMax_index_interval();
     case ROW_CACHE_SIZE:
       return isSetRow_cache_size();
     case KEY_CACHE_SIZE:
@@ -2185,12 +2360,18 @@
       return isSetMemtable_throughput_in_mb();
     case MEMTABLE_OPERATIONS_IN_MILLIONS:
       return isSetMemtable_operations_in_millions();
+    case REPLICATE_ON_WRITE:
+      return isSetReplicate_on_write();
     case MERGE_SHARDS_CHANCE:
       return isSetMerge_shards_chance();
     case ROW_CACHE_PROVIDER:
       return isSetRow_cache_provider();
     case ROW_CACHE_KEYS_TO_SAVE:
       return isSetRow_cache_keys_to_save();
+    case POPULATE_IO_CACHE_ON_FLUSH:
+      return isSetPopulate_io_cache_on_flush();
+    case INDEX_INTERVAL:
+      return isSetIndex_interval();
     }
     throw new IllegalStateException();
   }
@@ -2325,15 +2506,6 @@
         return false;
     }
 
-    boolean this_present_replicate_on_write = true && this.isSetReplicate_on_write();
-    boolean that_present_replicate_on_write = true && that.isSetReplicate_on_write();
-    if (this_present_replicate_on_write || that_present_replicate_on_write) {
-      if (!(this_present_replicate_on_write && that_present_replicate_on_write))
-        return false;
-      if (this.replicate_on_write != that.replicate_on_write)
-        return false;
-    }
-
     boolean this_present_key_validation_class = true && this.isSetKey_validation_class();
     boolean that_present_key_validation_class = true && that.isSetKey_validation_class();
     if (this_present_key_validation_class || that_present_key_validation_class) {
@@ -2406,15 +2578,6 @@
         return false;
     }
 
-    boolean this_present_populate_io_cache_on_flush = true && this.isSetPopulate_io_cache_on_flush();
-    boolean that_present_populate_io_cache_on_flush = true && that.isSetPopulate_io_cache_on_flush();
-    if (this_present_populate_io_cache_on_flush || that_present_populate_io_cache_on_flush) {
-      if (!(this_present_populate_io_cache_on_flush && that_present_populate_io_cache_on_flush))
-        return false;
-      if (this.populate_io_cache_on_flush != that.populate_io_cache_on_flush)
-        return false;
-    }
-
     boolean this_present_memtable_flush_period_in_ms = true && this.isSetMemtable_flush_period_in_ms();
     boolean that_present_memtable_flush_period_in_ms = true && that.isSetMemtable_flush_period_in_ms();
     if (this_present_memtable_flush_period_in_ms || that_present_memtable_flush_period_in_ms) {
@@ -2433,15 +2596,6 @@
         return false;
     }
 
-    boolean this_present_index_interval = true && this.isSetIndex_interval();
-    boolean that_present_index_interval = true && that.isSetIndex_interval();
-    if (this_present_index_interval || that_present_index_interval) {
-      if (!(this_present_index_interval && that_present_index_interval))
-        return false;
-      if (this.index_interval != that.index_interval)
-        return false;
-    }
-
     boolean this_present_speculative_retry = true && this.isSetSpeculative_retry();
     boolean that_present_speculative_retry = true && that.isSetSpeculative_retry();
     if (this_present_speculative_retry || that_present_speculative_retry) {
@@ -2460,6 +2614,33 @@
         return false;
     }
 
+    boolean this_present_cells_per_row_to_cache = true && this.isSetCells_per_row_to_cache();
+    boolean that_present_cells_per_row_to_cache = true && that.isSetCells_per_row_to_cache();
+    if (this_present_cells_per_row_to_cache || that_present_cells_per_row_to_cache) {
+      if (!(this_present_cells_per_row_to_cache && that_present_cells_per_row_to_cache))
+        return false;
+      if (!this.cells_per_row_to_cache.equals(that.cells_per_row_to_cache))
+        return false;
+    }
+
+    boolean this_present_min_index_interval = true && this.isSetMin_index_interval();
+    boolean that_present_min_index_interval = true && that.isSetMin_index_interval();
+    if (this_present_min_index_interval || that_present_min_index_interval) {
+      if (!(this_present_min_index_interval && that_present_min_index_interval))
+        return false;
+      if (this.min_index_interval != that.min_index_interval)
+        return false;
+    }
+
+    boolean this_present_max_index_interval = true && this.isSetMax_index_interval();
+    boolean that_present_max_index_interval = true && that.isSetMax_index_interval();
+    if (this_present_max_index_interval || that_present_max_index_interval) {
+      if (!(this_present_max_index_interval && that_present_max_index_interval))
+        return false;
+      if (this.max_index_interval != that.max_index_interval)
+        return false;
+    }
+
     boolean this_present_row_cache_size = true && this.isSetRow_cache_size();
     boolean that_present_row_cache_size = true && that.isSetRow_cache_size();
     if (this_present_row_cache_size || that_present_row_cache_size) {
@@ -2523,6 +2704,15 @@
         return false;
     }
 
+    boolean this_present_replicate_on_write = true && this.isSetReplicate_on_write();
+    boolean that_present_replicate_on_write = true && that.isSetReplicate_on_write();
+    if (this_present_replicate_on_write || that_present_replicate_on_write) {
+      if (!(this_present_replicate_on_write && that_present_replicate_on_write))
+        return false;
+      if (this.replicate_on_write != that.replicate_on_write)
+        return false;
+    }
+
     boolean this_present_merge_shards_chance = true && this.isSetMerge_shards_chance();
     boolean that_present_merge_shards_chance = true && that.isSetMerge_shards_chance();
     if (this_present_merge_shards_chance || that_present_merge_shards_chance) {
@@ -2550,6 +2740,24 @@
         return false;
     }
 
+    boolean this_present_populate_io_cache_on_flush = true && this.isSetPopulate_io_cache_on_flush();
+    boolean that_present_populate_io_cache_on_flush = true && that.isSetPopulate_io_cache_on_flush();
+    if (this_present_populate_io_cache_on_flush || that_present_populate_io_cache_on_flush) {
+      if (!(this_present_populate_io_cache_on_flush && that_present_populate_io_cache_on_flush))
+        return false;
+      if (this.populate_io_cache_on_flush != that.populate_io_cache_on_flush)
+        return false;
+    }
+
+    boolean this_present_index_interval = true && this.isSetIndex_interval();
+    boolean that_present_index_interval = true && that.isSetIndex_interval();
+    if (this_present_index_interval || that_present_index_interval) {
+      if (!(this_present_index_interval && that_present_index_interval))
+        return false;
+      if (this.index_interval != that.index_interval)
+        return false;
+    }
+
     return true;
   }
 
@@ -2622,11 +2830,6 @@
     if (present_max_compaction_threshold)
       builder.append(max_compaction_threshold);
 
-    boolean present_replicate_on_write = true && (isSetReplicate_on_write());
-    builder.append(present_replicate_on_write);
-    if (present_replicate_on_write)
-      builder.append(replicate_on_write);
-
     boolean present_key_validation_class = true && (isSetKey_validation_class());
     builder.append(present_key_validation_class);
     if (present_key_validation_class)
@@ -2667,11 +2870,6 @@
     if (present_dclocal_read_repair_chance)
       builder.append(dclocal_read_repair_chance);
 
-    boolean present_populate_io_cache_on_flush = true && (isSetPopulate_io_cache_on_flush());
-    builder.append(present_populate_io_cache_on_flush);
-    if (present_populate_io_cache_on_flush)
-      builder.append(populate_io_cache_on_flush);
-
     boolean present_memtable_flush_period_in_ms = true && (isSetMemtable_flush_period_in_ms());
     builder.append(present_memtable_flush_period_in_ms);
     if (present_memtable_flush_period_in_ms)
@@ -2682,11 +2880,6 @@
     if (present_default_time_to_live)
       builder.append(default_time_to_live);
 
-    boolean present_index_interval = true && (isSetIndex_interval());
-    builder.append(present_index_interval);
-    if (present_index_interval)
-      builder.append(index_interval);
-
     boolean present_speculative_retry = true && (isSetSpeculative_retry());
     builder.append(present_speculative_retry);
     if (present_speculative_retry)
@@ -2697,6 +2890,21 @@
     if (present_triggers)
       builder.append(triggers);
 
+    boolean present_cells_per_row_to_cache = true && (isSetCells_per_row_to_cache());
+    builder.append(present_cells_per_row_to_cache);
+    if (present_cells_per_row_to_cache)
+      builder.append(cells_per_row_to_cache);
+
+    boolean present_min_index_interval = true && (isSetMin_index_interval());
+    builder.append(present_min_index_interval);
+    if (present_min_index_interval)
+      builder.append(min_index_interval);
+
+    boolean present_max_index_interval = true && (isSetMax_index_interval());
+    builder.append(present_max_index_interval);
+    if (present_max_index_interval)
+      builder.append(max_index_interval);
+
     boolean present_row_cache_size = true && (isSetRow_cache_size());
     builder.append(present_row_cache_size);
     if (present_row_cache_size)
@@ -2732,6 +2940,11 @@
     if (present_memtable_operations_in_millions)
       builder.append(memtable_operations_in_millions);
 
+    boolean present_replicate_on_write = true && (isSetReplicate_on_write());
+    builder.append(present_replicate_on_write);
+    if (present_replicate_on_write)
+      builder.append(replicate_on_write);
+
     boolean present_merge_shards_chance = true && (isSetMerge_shards_chance());
     builder.append(present_merge_shards_chance);
     if (present_merge_shards_chance)
@@ -2747,6 +2960,16 @@
     if (present_row_cache_keys_to_save)
       builder.append(row_cache_keys_to_save);
 
+    boolean present_populate_io_cache_on_flush = true && (isSetPopulate_io_cache_on_flush());
+    builder.append(present_populate_io_cache_on_flush);
+    if (present_populate_io_cache_on_flush)
+      builder.append(populate_io_cache_on_flush);
+
+    boolean present_index_interval = true && (isSetIndex_interval());
+    builder.append(present_index_interval);
+    if (present_index_interval)
+      builder.append(index_interval);
+
     return builder.toHashCode();
   }
 
@@ -2888,16 +3111,6 @@
         return lastComparison;
       }
     }
-    lastComparison = Boolean.valueOf(isSetReplicate_on_write()).compareTo(other.isSetReplicate_on_write());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetReplicate_on_write()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.replicate_on_write, other.replicate_on_write);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
     lastComparison = Boolean.valueOf(isSetKey_validation_class()).compareTo(other.isSetKey_validation_class());
     if (lastComparison != 0) {
       return lastComparison;
@@ -2978,16 +3191,6 @@
         return lastComparison;
       }
     }
-    lastComparison = Boolean.valueOf(isSetPopulate_io_cache_on_flush()).compareTo(other.isSetPopulate_io_cache_on_flush());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetPopulate_io_cache_on_flush()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.populate_io_cache_on_flush, other.populate_io_cache_on_flush);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
     lastComparison = Boolean.valueOf(isSetMemtable_flush_period_in_ms()).compareTo(other.isSetMemtable_flush_period_in_ms());
     if (lastComparison != 0) {
       return lastComparison;
@@ -3008,16 +3211,6 @@
         return lastComparison;
       }
     }
-    lastComparison = Boolean.valueOf(isSetIndex_interval()).compareTo(other.isSetIndex_interval());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetIndex_interval()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.index_interval, other.index_interval);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
     lastComparison = Boolean.valueOf(isSetSpeculative_retry()).compareTo(other.isSetSpeculative_retry());
     if (lastComparison != 0) {
       return lastComparison;
@@ -3038,6 +3231,36 @@
         return lastComparison;
       }
     }
+    lastComparison = Boolean.valueOf(isSetCells_per_row_to_cache()).compareTo(other.isSetCells_per_row_to_cache());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetCells_per_row_to_cache()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.cells_per_row_to_cache, other.cells_per_row_to_cache);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMin_index_interval()).compareTo(other.isSetMin_index_interval());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMin_index_interval()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.min_index_interval, other.min_index_interval);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMax_index_interval()).compareTo(other.isSetMax_index_interval());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMax_index_interval()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.max_index_interval, other.max_index_interval);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
     lastComparison = Boolean.valueOf(isSetRow_cache_size()).compareTo(other.isSetRow_cache_size());
     if (lastComparison != 0) {
       return lastComparison;
@@ -3108,6 +3331,16 @@
         return lastComparison;
       }
     }
+    lastComparison = Boolean.valueOf(isSetReplicate_on_write()).compareTo(other.isSetReplicate_on_write());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetReplicate_on_write()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.replicate_on_write, other.replicate_on_write);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
     lastComparison = Boolean.valueOf(isSetMerge_shards_chance()).compareTo(other.isSetMerge_shards_chance());
     if (lastComparison != 0) {
       return lastComparison;
@@ -3138,6 +3371,26 @@
         return lastComparison;
       }
     }
+    lastComparison = Boolean.valueOf(isSetPopulate_io_cache_on_flush()).compareTo(other.isSetPopulate_io_cache_on_flush());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetPopulate_io_cache_on_flush()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.populate_io_cache_on_flush, other.populate_io_cache_on_flush);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetIndex_interval()).compareTo(other.isSetIndex_interval());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetIndex_interval()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.index_interval, other.index_interval);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
     return 0;
   }
 
@@ -3263,12 +3516,6 @@
       sb.append(this.max_compaction_threshold);
       first = false;
     }
-    if (isSetReplicate_on_write()) {
-      if (!first) sb.append(", ");
-      sb.append("replicate_on_write:");
-      sb.append(this.replicate_on_write);
-      first = false;
-    }
     if (isSetKey_validation_class()) {
       if (!first) sb.append(", ");
       sb.append("key_validation_class:");
@@ -3341,12 +3588,6 @@
       sb.append(this.dclocal_read_repair_chance);
       first = false;
     }
-    if (isSetPopulate_io_cache_on_flush()) {
-      if (!first) sb.append(", ");
-      sb.append("populate_io_cache_on_flush:");
-      sb.append(this.populate_io_cache_on_flush);
-      first = false;
-    }
     if (isSetMemtable_flush_period_in_ms()) {
       if (!first) sb.append(", ");
       sb.append("memtable_flush_period_in_ms:");
@@ -3359,12 +3600,6 @@
       sb.append(this.default_time_to_live);
       first = false;
     }
-    if (isSetIndex_interval()) {
-      if (!first) sb.append(", ");
-      sb.append("index_interval:");
-      sb.append(this.index_interval);
-      first = false;
-    }
     if (isSetSpeculative_retry()) {
       if (!first) sb.append(", ");
       sb.append("speculative_retry:");
@@ -3385,6 +3620,28 @@
       }
       first = false;
     }
+    if (isSetCells_per_row_to_cache()) {
+      if (!first) sb.append(", ");
+      sb.append("cells_per_row_to_cache:");
+      if (this.cells_per_row_to_cache == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.cells_per_row_to_cache);
+      }
+      first = false;
+    }
+    if (isSetMin_index_interval()) {
+      if (!first) sb.append(", ");
+      sb.append("min_index_interval:");
+      sb.append(this.min_index_interval);
+      first = false;
+    }
+    if (isSetMax_index_interval()) {
+      if (!first) sb.append(", ");
+      sb.append("max_index_interval:");
+      sb.append(this.max_index_interval);
+      first = false;
+    }
     if (isSetRow_cache_size()) {
       if (!first) sb.append(", ");
       sb.append("row_cache_size:");
@@ -3427,6 +3684,12 @@
       sb.append(this.memtable_operations_in_millions);
       first = false;
     }
+    if (isSetReplicate_on_write()) {
+      if (!first) sb.append(", ");
+      sb.append("replicate_on_write:");
+      sb.append(this.replicate_on_write);
+      first = false;
+    }
     if (isSetMerge_shards_chance()) {
       if (!first) sb.append(", ");
       sb.append("merge_shards_chance:");
@@ -3449,6 +3712,18 @@
       sb.append(this.row_cache_keys_to_save);
       first = false;
     }
+    if (isSetPopulate_io_cache_on_flush()) {
+      if (!first) sb.append(", ");
+      sb.append("populate_io_cache_on_flush:");
+      sb.append(this.populate_io_cache_on_flush);
+      first = false;
+    }
+    if (isSetIndex_interval()) {
+      if (!first) sb.append(", ");
+      sb.append("index_interval:");
+      sb.append(this.index_interval);
+      first = false;
+    }
     sb.append(")");
     return sb.toString();
   }
@@ -3615,14 +3890,6 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
-          case 24: // REPLICATE_ON_WRITE
-            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
-              struct.replicate_on_write = iprot.readBool();
-              struct.setReplicate_on_writeIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
           case 26: // KEY_VALIDATION_CLASS
             if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
               struct.key_validation_class = iprot.readString();
@@ -3711,14 +3978,6 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
-          case 38: // POPULATE_IO_CACHE_ON_FLUSH
-            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
-              struct.populate_io_cache_on_flush = iprot.readBool();
-              struct.setPopulate_io_cache_on_flushIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
           case 39: // MEMTABLE_FLUSH_PERIOD_IN_MS
             if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
               struct.memtable_flush_period_in_ms = iprot.readI32();
@@ -3735,14 +3994,6 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
-          case 41: // INDEX_INTERVAL
-            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
-              struct.index_interval = iprot.readI32();
-              struct.setIndex_intervalIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
           case 42: // SPECULATIVE_RETRY
             if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
               struct.speculative_retry = iprot.readString();
@@ -3770,6 +4021,30 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
+          case 44: // CELLS_PER_ROW_TO_CACHE
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.cells_per_row_to_cache = iprot.readString();
+              struct.setCells_per_row_to_cacheIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 45: // MIN_INDEX_INTERVAL
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.min_index_interval = iprot.readI32();
+              struct.setMin_index_intervalIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 46: // MAX_INDEX_INTERVAL
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.max_index_interval = iprot.readI32();
+              struct.setMax_index_intervalIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
           case 9: // ROW_CACHE_SIZE
             if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
               struct.row_cache_size = iprot.readDouble();
@@ -3826,6 +4101,14 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
+          case 24: // REPLICATE_ON_WRITE
+            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
+              struct.replicate_on_write = iprot.readBool();
+              struct.setReplicate_on_writeIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
           case 25: // MERGE_SHARDS_CHANCE
             if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
               struct.merge_shards_chance = iprot.readDouble();
@@ -3850,6 +4133,22 @@
               org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
             }
             break;
+          case 38: // POPULATE_IO_CACHE_ON_FLUSH
+            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
+              struct.populate_io_cache_on_flush = iprot.readBool();
+              struct.setPopulate_io_cache_on_flushIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 41: // INDEX_INTERVAL
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.index_interval = iprot.readI32();
+              struct.setIndex_intervalIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
           default:
             org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
         }
@@ -4115,6 +4414,23 @@
           oprot.writeFieldEnd();
         }
       }
+      if (struct.cells_per_row_to_cache != null) {
+        if (struct.isSetCells_per_row_to_cache()) {
+          oprot.writeFieldBegin(CELLS_PER_ROW_TO_CACHE_FIELD_DESC);
+          oprot.writeString(struct.cells_per_row_to_cache);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.isSetMin_index_interval()) {
+        oprot.writeFieldBegin(MIN_INDEX_INTERVAL_FIELD_DESC);
+        oprot.writeI32(struct.min_index_interval);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMax_index_interval()) {
+        oprot.writeFieldBegin(MAX_INDEX_INTERVAL_FIELD_DESC);
+        oprot.writeI32(struct.max_index_interval);
+        oprot.writeFieldEnd();
+      }
       oprot.writeFieldStop();
       oprot.writeStructEnd();
     }
@@ -4168,49 +4484,49 @@
       if (struct.isSetMax_compaction_threshold()) {
         optionals.set(10);
       }
-      if (struct.isSetReplicate_on_write()) {
+      if (struct.isSetKey_validation_class()) {
         optionals.set(11);
       }
-      if (struct.isSetKey_validation_class()) {
+      if (struct.isSetKey_alias()) {
         optionals.set(12);
       }
-      if (struct.isSetKey_alias()) {
+      if (struct.isSetCompaction_strategy()) {
         optionals.set(13);
       }
-      if (struct.isSetCompaction_strategy()) {
+      if (struct.isSetCompaction_strategy_options()) {
         optionals.set(14);
       }
-      if (struct.isSetCompaction_strategy_options()) {
+      if (struct.isSetCompression_options()) {
         optionals.set(15);
       }
-      if (struct.isSetCompression_options()) {
+      if (struct.isSetBloom_filter_fp_chance()) {
         optionals.set(16);
       }
-      if (struct.isSetBloom_filter_fp_chance()) {
+      if (struct.isSetCaching()) {
         optionals.set(17);
       }
-      if (struct.isSetCaching()) {
+      if (struct.isSetDclocal_read_repair_chance()) {
         optionals.set(18);
       }
-      if (struct.isSetDclocal_read_repair_chance()) {
+      if (struct.isSetMemtable_flush_period_in_ms()) {
         optionals.set(19);
       }
-      if (struct.isSetPopulate_io_cache_on_flush()) {
+      if (struct.isSetDefault_time_to_live()) {
         optionals.set(20);
       }
-      if (struct.isSetMemtable_flush_period_in_ms()) {
+      if (struct.isSetSpeculative_retry()) {
         optionals.set(21);
       }
-      if (struct.isSetDefault_time_to_live()) {
+      if (struct.isSetTriggers()) {
         optionals.set(22);
       }
-      if (struct.isSetIndex_interval()) {
+      if (struct.isSetCells_per_row_to_cache()) {
         optionals.set(23);
       }
-      if (struct.isSetSpeculative_retry()) {
+      if (struct.isSetMin_index_interval()) {
         optionals.set(24);
       }
-      if (struct.isSetTriggers()) {
+      if (struct.isSetMax_index_interval()) {
         optionals.set(25);
       }
       if (struct.isSetRow_cache_size()) {
@@ -4234,16 +4550,25 @@
       if (struct.isSetMemtable_operations_in_millions()) {
         optionals.set(32);
       }
-      if (struct.isSetMerge_shards_chance()) {
+      if (struct.isSetReplicate_on_write()) {
         optionals.set(33);
       }
-      if (struct.isSetRow_cache_provider()) {
+      if (struct.isSetMerge_shards_chance()) {
         optionals.set(34);
       }
-      if (struct.isSetRow_cache_keys_to_save()) {
+      if (struct.isSetRow_cache_provider()) {
         optionals.set(35);
       }
-      oprot.writeBitSet(optionals, 36);
+      if (struct.isSetRow_cache_keys_to_save()) {
+        optionals.set(36);
+      }
+      if (struct.isSetPopulate_io_cache_on_flush()) {
+        optionals.set(37);
+      }
+      if (struct.isSetIndex_interval()) {
+        optionals.set(38);
+      }
+      oprot.writeBitSet(optionals, 39);
       if (struct.isSetColumn_type()) {
         oprot.writeString(struct.column_type);
       }
@@ -4283,9 +4608,6 @@
       if (struct.isSetMax_compaction_threshold()) {
         oprot.writeI32(struct.max_compaction_threshold);
       }
-      if (struct.isSetReplicate_on_write()) {
-        oprot.writeBool(struct.replicate_on_write);
-      }
       if (struct.isSetKey_validation_class()) {
         oprot.writeString(struct.key_validation_class);
       }
@@ -4324,18 +4646,12 @@
       if (struct.isSetDclocal_read_repair_chance()) {
         oprot.writeDouble(struct.dclocal_read_repair_chance);
       }
-      if (struct.isSetPopulate_io_cache_on_flush()) {
-        oprot.writeBool(struct.populate_io_cache_on_flush);
-      }
       if (struct.isSetMemtable_flush_period_in_ms()) {
         oprot.writeI32(struct.memtable_flush_period_in_ms);
       }
       if (struct.isSetDefault_time_to_live()) {
         oprot.writeI32(struct.default_time_to_live);
       }
-      if (struct.isSetIndex_interval()) {
-        oprot.writeI32(struct.index_interval);
-      }
       if (struct.isSetSpeculative_retry()) {
         oprot.writeString(struct.speculative_retry);
       }
@@ -4348,6 +4664,15 @@
           }
         }
       }
+      if (struct.isSetCells_per_row_to_cache()) {
+        oprot.writeString(struct.cells_per_row_to_cache);
+      }
+      if (struct.isSetMin_index_interval()) {
+        oprot.writeI32(struct.min_index_interval);
+      }
+      if (struct.isSetMax_index_interval()) {
+        oprot.writeI32(struct.max_index_interval);
+      }
       if (struct.isSetRow_cache_size()) {
         oprot.writeDouble(struct.row_cache_size);
       }
@@ -4369,6 +4694,9 @@
       if (struct.isSetMemtable_operations_in_millions()) {
         oprot.writeDouble(struct.memtable_operations_in_millions);
       }
+      if (struct.isSetReplicate_on_write()) {
+        oprot.writeBool(struct.replicate_on_write);
+      }
       if (struct.isSetMerge_shards_chance()) {
         oprot.writeDouble(struct.merge_shards_chance);
       }
@@ -4378,6 +4706,12 @@
       if (struct.isSetRow_cache_keys_to_save()) {
         oprot.writeI32(struct.row_cache_keys_to_save);
       }
+      if (struct.isSetPopulate_io_cache_on_flush()) {
+        oprot.writeBool(struct.populate_io_cache_on_flush);
+      }
+      if (struct.isSetIndex_interval()) {
+        oprot.writeI32(struct.index_interval);
+      }
     }
 
     @Override
@@ -4387,7 +4721,7 @@
       struct.setKeyspaceIsSet(true);
       struct.name = iprot.readString();
       struct.setNameIsSet(true);
-      BitSet incoming = iprot.readBitSet(36);
+      BitSet incoming = iprot.readBitSet(39);
       if (incoming.get(0)) {
         struct.column_type = iprot.readString();
         struct.setColumn_typeIsSet(true);
@@ -4443,22 +4777,18 @@
         struct.setMax_compaction_thresholdIsSet(true);
       }
       if (incoming.get(11)) {
-        struct.replicate_on_write = iprot.readBool();
-        struct.setReplicate_on_writeIsSet(true);
-      }
-      if (incoming.get(12)) {
         struct.key_validation_class = iprot.readString();
         struct.setKey_validation_classIsSet(true);
       }
-      if (incoming.get(13)) {
+      if (incoming.get(12)) {
         struct.key_alias = iprot.readBinary();
         struct.setKey_aliasIsSet(true);
       }
-      if (incoming.get(14)) {
+      if (incoming.get(13)) {
         struct.compaction_strategy = iprot.readString();
         struct.setCompaction_strategyIsSet(true);
       }
-      if (incoming.get(15)) {
+      if (incoming.get(14)) {
         {
           org.apache.thrift.protocol.TMap _map135 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
           struct.compaction_strategy_options = new HashMap<String,String>(2*_map135.size);
@@ -4473,7 +4803,7 @@
         }
         struct.setCompaction_strategy_optionsIsSet(true);
       }
-      if (incoming.get(16)) {
+      if (incoming.get(15)) {
         {
           org.apache.thrift.protocol.TMap _map139 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.STRING, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
           struct.compression_options = new HashMap<String,String>(2*_map139.size);
@@ -4488,39 +4818,31 @@
         }
         struct.setCompression_optionsIsSet(true);
       }
-      if (incoming.get(17)) {
+      if (incoming.get(16)) {
         struct.bloom_filter_fp_chance = iprot.readDouble();
         struct.setBloom_filter_fp_chanceIsSet(true);
       }
-      if (incoming.get(18)) {
+      if (incoming.get(17)) {
         struct.caching = iprot.readString();
         struct.setCachingIsSet(true);
       }
-      if (incoming.get(19)) {
+      if (incoming.get(18)) {
         struct.dclocal_read_repair_chance = iprot.readDouble();
         struct.setDclocal_read_repair_chanceIsSet(true);
       }
-      if (incoming.get(20)) {
-        struct.populate_io_cache_on_flush = iprot.readBool();
-        struct.setPopulate_io_cache_on_flushIsSet(true);
-      }
-      if (incoming.get(21)) {
+      if (incoming.get(19)) {
         struct.memtable_flush_period_in_ms = iprot.readI32();
         struct.setMemtable_flush_period_in_msIsSet(true);
       }
-      if (incoming.get(22)) {
+      if (incoming.get(20)) {
         struct.default_time_to_live = iprot.readI32();
         struct.setDefault_time_to_liveIsSet(true);
       }
-      if (incoming.get(23)) {
-        struct.index_interval = iprot.readI32();
-        struct.setIndex_intervalIsSet(true);
-      }
-      if (incoming.get(24)) {
+      if (incoming.get(21)) {
         struct.speculative_retry = iprot.readString();
         struct.setSpeculative_retryIsSet(true);
       }
-      if (incoming.get(25)) {
+      if (incoming.get(22)) {
         {
           org.apache.thrift.protocol.TList _list143 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
           struct.triggers = new ArrayList<TriggerDef>(_list143.size);
@@ -4534,6 +4856,18 @@
         }
         struct.setTriggersIsSet(true);
       }
+      if (incoming.get(23)) {
+        struct.cells_per_row_to_cache = iprot.readString();
+        struct.setCells_per_row_to_cacheIsSet(true);
+      }
+      if (incoming.get(24)) {
+        struct.min_index_interval = iprot.readI32();
+        struct.setMin_index_intervalIsSet(true);
+      }
+      if (incoming.get(25)) {
+        struct.max_index_interval = iprot.readI32();
+        struct.setMax_index_intervalIsSet(true);
+      }
       if (incoming.get(26)) {
         struct.row_cache_size = iprot.readDouble();
         struct.setRow_cache_sizeIsSet(true);
@@ -4563,17 +4897,29 @@
         struct.setMemtable_operations_in_millionsIsSet(true);
       }
       if (incoming.get(33)) {
+        struct.replicate_on_write = iprot.readBool();
+        struct.setReplicate_on_writeIsSet(true);
+      }
+      if (incoming.get(34)) {
         struct.merge_shards_chance = iprot.readDouble();
         struct.setMerge_shards_chanceIsSet(true);
       }
-      if (incoming.get(34)) {
+      if (incoming.get(35)) {
         struct.row_cache_provider = iprot.readString();
         struct.setRow_cache_providerIsSet(true);
       }
-      if (incoming.get(35)) {
+      if (incoming.get(36)) {
         struct.row_cache_keys_to_save = iprot.readI32();
         struct.setRow_cache_keys_to_saveIsSet(true);
       }
+      if (incoming.get(37)) {
+        struct.populate_io_cache_on_flush = iprot.readBool();
+        struct.setPopulate_io_cache_on_flushIsSet(true);
+      }
+      if (incoming.get(38)) {
+        struct.index_interval = iprot.readI32();
+        struct.setIndex_intervalIsSet(true);
+      }
     }
   }
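
For reference, the CfDef hunks above deprecate replicate_on_write, populate_io_cache_on_flush and index_interval (moving them to the tail of the struct) and add cells_per_row_to_cache, min_index_interval and max_index_interval, which is why the TupleScheme optionals BitSet grows from 36 to 39 entries. Below is a minimal sketch of how a Thrift client might exercise the new setters; it only uses methods visible in this diff, and the keyspace/table names and values are placeholders rather than anything mandated by the patch:

    import org.apache.cassandra.thrift.CfDef;

    public class CfDefExample {
        // New-style configuration using the fields introduced in this change.
        public static CfDef newStyle() {
            return new CfDef("Keyspace1", "Standard1")     // required keyspace + table name
                    .setMin_index_interval(128)            // supersedes the deprecated index_interval
                    .setMax_index_interval(2048)           // new upper bound on index sampling
                    .setCells_per_row_to_cache("100");     // carried as a String ("100" matches the default set in clear())
        }

        // The old single knob still round-trips over the wire, but the setter is now @deprecated.
        @SuppressWarnings("deprecation")
        public static CfDef oldStyle() {
            return new CfDef("Keyspace1", "Standard1").setIndex_interval(128);
        }
    }
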
 
diff --git a/interface/thrift/gen-java/org/apache/cassandra/thrift/ColumnSlice.java b/interface/thrift/gen-java/org/apache/cassandra/thrift/ColumnSlice.java
new file mode 100644
index 0000000..67b88a3
--- /dev/null
+++ b/interface/thrift/gen-java/org/apache/cassandra/thrift/ColumnSlice.java
@@ -0,0 +1,551 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.1)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+package org.apache.cassandra.thrift;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.apache.thrift.scheme.IScheme;
+import org.apache.thrift.scheme.SchemeFactory;
+import org.apache.thrift.scheme.StandardScheme;
+
+import org.apache.thrift.scheme.TupleScheme;
+import org.apache.thrift.protocol.TTupleProtocol;
+import org.apache.thrift.protocol.TProtocolException;
+import org.apache.thrift.EncodingUtils;
+import org.apache.thrift.TException;
+import org.apache.thrift.async.AsyncMethodCallback;
+import org.apache.thrift.server.AbstractNonblockingServer.*;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.EnumMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.EnumSet;
+import java.util.Collections;
+import java.util.BitSet;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The ColumnSlice is used to select a set of columns from inside a row.
+ * If start or finish is unspecified, it defaults to the start-of-row or
+ * end-of-row value respectively.
+ * @param start The start of the ColumnSlice, inclusive
+ * @param finish The end of the ColumnSlice, inclusive
+ */
+public class ColumnSlice implements org.apache.thrift.TBase<ColumnSlice, ColumnSlice._Fields>, java.io.Serializable, Cloneable, Comparable<ColumnSlice> {
+  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("ColumnSlice");
+
+  private static final org.apache.thrift.protocol.TField START_FIELD_DESC = new org.apache.thrift.protocol.TField("start", org.apache.thrift.protocol.TType.STRING, (short)1);
+  private static final org.apache.thrift.protocol.TField FINISH_FIELD_DESC = new org.apache.thrift.protocol.TField("finish", org.apache.thrift.protocol.TType.STRING, (short)2);
+
+  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+  static {
+    schemes.put(StandardScheme.class, new ColumnSliceStandardSchemeFactory());
+    schemes.put(TupleScheme.class, new ColumnSliceTupleSchemeFactory());
+  }
+
+  public ByteBuffer start; // optional
+  public ByteBuffer finish; // optional
+
+  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+    START((short)1, "start"),
+    FINISH((short)2, "finish");
+
+    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+    static {
+      for (_Fields field : EnumSet.allOf(_Fields.class)) {
+        byName.put(field.getFieldName(), field);
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, or null if its not found.
+     */
+    public static _Fields findByThriftId(int fieldId) {
+      switch(fieldId) {
+        case 1: // START
+          return START;
+        case 2: // FINISH
+          return FINISH;
+        default:
+          return null;
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, throwing an exception
+     * if it is not found.
+     */
+    public static _Fields findByThriftIdOrThrow(int fieldId) {
+      _Fields fields = findByThriftId(fieldId);
+      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+      return fields;
+    }
+
+    /**
+     * Find the _Fields constant that matches name, or null if its not found.
+     */
+    public static _Fields findByName(String name) {
+      return byName.get(name);
+    }
+
+    private final short _thriftId;
+    private final String _fieldName;
+
+    _Fields(short thriftId, String fieldName) {
+      _thriftId = thriftId;
+      _fieldName = fieldName;
+    }
+
+    public short getThriftFieldId() {
+      return _thriftId;
+    }
+
+    public String getFieldName() {
+      return _fieldName;
+    }
+  }
+
+  // isset id assignments
+  private _Fields optionals[] = {_Fields.START,_Fields.FINISH};
+  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+  static {
+    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+    tmpMap.put(_Fields.START, new org.apache.thrift.meta_data.FieldMetaData("start", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
+    tmpMap.put(_Fields.FINISH, new org.apache.thrift.meta_data.FieldMetaData("finish", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
+    metaDataMap = Collections.unmodifiableMap(tmpMap);
+    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(ColumnSlice.class, metaDataMap);
+  }
+
+  public ColumnSlice() {
+  }
+
+  /**
+   * Performs a deep copy on <i>other</i>.
+   */
+  public ColumnSlice(ColumnSlice other) {
+    if (other.isSetStart()) {
+      this.start = org.apache.thrift.TBaseHelper.copyBinary(other.start);
+;
+    }
+    if (other.isSetFinish()) {
+      this.finish = org.apache.thrift.TBaseHelper.copyBinary(other.finish);
+;
+    }
+  }
+
+  public ColumnSlice deepCopy() {
+    return new ColumnSlice(this);
+  }
+
+  @Override
+  public void clear() {
+    this.start = null;
+    this.finish = null;
+  }
+
+  public byte[] getStart() {
+    setStart(org.apache.thrift.TBaseHelper.rightSize(start));
+    return start == null ? null : start.array();
+  }
+
+  public ByteBuffer bufferForStart() {
+    return start;
+  }
+
+  public ColumnSlice setStart(byte[] start) {
+    setStart(start == null ? (ByteBuffer)null : ByteBuffer.wrap(start));
+    return this;
+  }
+
+  public ColumnSlice setStart(ByteBuffer start) {
+    this.start = start;
+    return this;
+  }
+
+  public void unsetStart() {
+    this.start = null;
+  }
+
+  /** Returns true if field start is set (has been assigned a value) and false otherwise */
+  public boolean isSetStart() {
+    return this.start != null;
+  }
+
+  public void setStartIsSet(boolean value) {
+    if (!value) {
+      this.start = null;
+    }
+  }
+
+  public byte[] getFinish() {
+    setFinish(org.apache.thrift.TBaseHelper.rightSize(finish));
+    return finish == null ? null : finish.array();
+  }
+
+  public ByteBuffer bufferForFinish() {
+    return finish;
+  }
+
+  public ColumnSlice setFinish(byte[] finish) {
+    setFinish(finish == null ? (ByteBuffer)null : ByteBuffer.wrap(finish));
+    return this;
+  }
+
+  public ColumnSlice setFinish(ByteBuffer finish) {
+    this.finish = finish;
+    return this;
+  }
+
+  public void unsetFinish() {
+    this.finish = null;
+  }
+
+  /** Returns true if field finish is set (has been assigned a value) and false otherwise */
+  public boolean isSetFinish() {
+    return this.finish != null;
+  }
+
+  public void setFinishIsSet(boolean value) {
+    if (!value) {
+      this.finish = null;
+    }
+  }
+
+  public void setFieldValue(_Fields field, Object value) {
+    switch (field) {
+    case START:
+      if (value == null) {
+        unsetStart();
+      } else {
+        setStart((ByteBuffer)value);
+      }
+      break;
+
+    case FINISH:
+      if (value == null) {
+        unsetFinish();
+      } else {
+        setFinish((ByteBuffer)value);
+      }
+      break;
+
+    }
+  }
+
+  public Object getFieldValue(_Fields field) {
+    switch (field) {
+    case START:
+      return getStart();
+
+    case FINISH:
+      return getFinish();
+
+    }
+    throw new IllegalStateException();
+  }
+
+  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+  public boolean isSet(_Fields field) {
+    if (field == null) {
+      throw new IllegalArgumentException();
+    }
+
+    switch (field) {
+    case START:
+      return isSetStart();
+    case FINISH:
+      return isSetFinish();
+    }
+    throw new IllegalStateException();
+  }
+
+  @Override
+  public boolean equals(Object that) {
+    if (that == null)
+      return false;
+    if (that instanceof ColumnSlice)
+      return this.equals((ColumnSlice)that);
+    return false;
+  }
+
+  public boolean equals(ColumnSlice that) {
+    if (that == null)
+      return false;
+
+    boolean this_present_start = true && this.isSetStart();
+    boolean that_present_start = true && that.isSetStart();
+    if (this_present_start || that_present_start) {
+      if (!(this_present_start && that_present_start))
+        return false;
+      if (!this.start.equals(that.start))
+        return false;
+    }
+
+    boolean this_present_finish = true && this.isSetFinish();
+    boolean that_present_finish = true && that.isSetFinish();
+    if (this_present_finish || that_present_finish) {
+      if (!(this_present_finish && that_present_finish))
+        return false;
+      if (!this.finish.equals(that.finish))
+        return false;
+    }
+
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    HashCodeBuilder builder = new HashCodeBuilder();
+
+    boolean present_start = true && (isSetStart());
+    builder.append(present_start);
+    if (present_start)
+      builder.append(start);
+
+    boolean present_finish = true && (isSetFinish());
+    builder.append(present_finish);
+    if (present_finish)
+      builder.append(finish);
+
+    return builder.toHashCode();
+  }
+
+  @Override
+  public int compareTo(ColumnSlice other) {
+    if (!getClass().equals(other.getClass())) {
+      return getClass().getName().compareTo(other.getClass().getName());
+    }
+
+    int lastComparison = 0;
+
+    lastComparison = Boolean.valueOf(isSetStart()).compareTo(other.isSetStart());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetStart()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.start, other.start);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetFinish()).compareTo(other.isSetFinish());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetFinish()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.finish, other.finish);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    return 0;
+  }
+
+  public _Fields fieldForId(int fieldId) {
+    return _Fields.findByThriftId(fieldId);
+  }
+
+  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+  }
+
+  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("ColumnSlice(");
+    boolean first = true;
+
+    if (isSetStart()) {
+      sb.append("start:");
+      if (this.start == null) {
+        sb.append("null");
+      } else {
+        org.apache.thrift.TBaseHelper.toString(this.start, sb);
+      }
+      first = false;
+    }
+    if (isSetFinish()) {
+      if (!first) sb.append(", ");
+      sb.append("finish:");
+      if (this.finish == null) {
+        sb.append("null");
+      } else {
+        org.apache.thrift.TBaseHelper.toString(this.finish, sb);
+      }
+      first = false;
+    }
+    sb.append(")");
+    return sb.toString();
+  }
+
+  public void validate() throws org.apache.thrift.TException {
+    // check for required fields
+    // check for sub-struct validity
+  }
+
+  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+    try {
+      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+    try {
+      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private static class ColumnSliceStandardSchemeFactory implements SchemeFactory {
+    public ColumnSliceStandardScheme getScheme() {
+      return new ColumnSliceStandardScheme();
+    }
+  }
+
+  private static class ColumnSliceStandardScheme extends StandardScheme<ColumnSlice> {
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot, ColumnSlice struct) throws org.apache.thrift.TException {
+      org.apache.thrift.protocol.TField schemeField;
+      iprot.readStructBegin();
+      while (true)
+      {
+        schemeField = iprot.readFieldBegin();
+        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+          break;
+        }
+        switch (schemeField.id) {
+          case 1: // START
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.start = iprot.readBinary();
+              struct.setStartIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 2: // FINISH
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.finish = iprot.readBinary();
+              struct.setFinishIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          default:
+            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+        }
+        iprot.readFieldEnd();
+      }
+      iprot.readStructEnd();
+
+      // check for required fields of primitive type, which can't be checked in the validate method
+      struct.validate();
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot, ColumnSlice struct) throws org.apache.thrift.TException {
+      struct.validate();
+
+      oprot.writeStructBegin(STRUCT_DESC);
+      if (struct.start != null) {
+        if (struct.isSetStart()) {
+          oprot.writeFieldBegin(START_FIELD_DESC);
+          oprot.writeBinary(struct.start);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.finish != null) {
+        if (struct.isSetFinish()) {
+          oprot.writeFieldBegin(FINISH_FIELD_DESC);
+          oprot.writeBinary(struct.finish);
+          oprot.writeFieldEnd();
+        }
+      }
+      oprot.writeFieldStop();
+      oprot.writeStructEnd();
+    }
+
+  }
+
+  private static class ColumnSliceTupleSchemeFactory implements SchemeFactory {
+    public ColumnSliceTupleScheme getScheme() {
+      return new ColumnSliceTupleScheme();
+    }
+  }
+
+  private static class ColumnSliceTupleScheme extends TupleScheme<ColumnSlice> {
+
+    @Override
+    public void write(org.apache.thrift.protocol.TProtocol prot, ColumnSlice struct) throws org.apache.thrift.TException {
+      TTupleProtocol oprot = (TTupleProtocol) prot;
+      BitSet optionals = new BitSet();
+      if (struct.isSetStart()) {
+        optionals.set(0);
+      }
+      if (struct.isSetFinish()) {
+        optionals.set(1);
+      }
+      oprot.writeBitSet(optionals, 2);
+      if (struct.isSetStart()) {
+        oprot.writeBinary(struct.start);
+      }
+      if (struct.isSetFinish()) {
+        oprot.writeBinary(struct.finish);
+      }
+    }
+
+    @Override
+    public void read(org.apache.thrift.protocol.TProtocol prot, ColumnSlice struct) throws org.apache.thrift.TException {
+      TTupleProtocol iprot = (TTupleProtocol) prot;
+      BitSet incoming = iprot.readBitSet(2);
+      if (incoming.get(0)) {
+        struct.start = iprot.readBinary();
+        struct.setStartIsSet(true);
+      }
+      if (incoming.get(1)) {
+        struct.finish = iprot.readBinary();
+        struct.setFinishIsSet(true);
+      }
+    }
+  }
+
+}
+
diff --git a/interface/thrift/gen-java/org/apache/cassandra/thrift/CqlRow.java b/interface/thrift/gen-java/org/apache/cassandra/thrift/CqlRow.java
index bc0cf77..7487ed7 100644
--- a/interface/thrift/gen-java/org/apache/cassandra/thrift/CqlRow.java
+++ b/interface/thrift/gen-java/org/apache/cassandra/thrift/CqlRow.java
@@ -55,7 +55,12 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Row returned from a CQL query
+ * Row returned from a CQL query.
+ * 
+ * This struct is used for both CQL2 and CQL3 queries.  For CQL2, the partition key
+ * is special-cased and is always returned.  For CQL3, it is not special-cased;
+ * it will be included in the columns list only if it was included in the SELECT,
+ * and the key field is always null.
  */
 public class CqlRow implements org.apache.thrift.TBase<CqlRow, CqlRow._Fields>, java.io.Serializable, Cloneable, Comparable<CqlRow> {
   private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("CqlRow");
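As a minimal sketch (not part of the patch; the class and method names below are illustrative only), this is how a Thrift client reading CqlRow results might honour the two behaviours the javadoc above describes:

    import java.nio.charset.StandardCharsets;
    import org.apache.cassandra.thrift.Column;
    import org.apache.cassandra.thrift.CqlRow;

    public class CqlRowKeyExample
    {
        /** CQL2: the partition key is special-cased into the key field and is always present. */
        static void printCql2Key(CqlRow row)
        {
            System.out.println("partition key: " + new String(row.getKey(), StandardCharsets.UTF_8));
        }

        /** CQL3: the key field is always null; the partition key, if selected, appears as an ordinary column. */
        static void printCql3Columns(CqlRow row)
        {
            assert row.getKey() == null;
            for (Column c : row.getColumns())
                System.out.println(new String(c.getName(), StandardCharsets.UTF_8));
        }
    }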
diff --git a/interface/thrift/gen-java/org/apache/cassandra/thrift/MultiSliceRequest.java b/interface/thrift/gen-java/org/apache/cassandra/thrift/MultiSliceRequest.java
new file mode 100644
index 0000000..9d4878c
--- /dev/null
+++ b/interface/thrift/gen-java/org/apache/cassandra/thrift/MultiSliceRequest.java
@@ -0,0 +1,1042 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.1)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+package org.apache.cassandra.thrift;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.apache.thrift.scheme.IScheme;
+import org.apache.thrift.scheme.SchemeFactory;
+import org.apache.thrift.scheme.StandardScheme;
+
+import org.apache.thrift.scheme.TupleScheme;
+import org.apache.thrift.protocol.TTupleProtocol;
+import org.apache.thrift.protocol.TProtocolException;
+import org.apache.thrift.EncodingUtils;
+import org.apache.thrift.TException;
+import org.apache.thrift.async.AsyncMethodCallback;
+import org.apache.thrift.server.AbstractNonblockingServer.*;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.EnumMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.EnumSet;
+import java.util.Collections;
+import java.util.BitSet;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Used to perform multiple slices on a single row key in one RPC operation.
+ * @param key. The row key to be multi-sliced
+ * @param column_parent. The column family (super columns are unsupported)
+ * @param column_slices. Zero or more ColumnSlice objects, each used to select columns
+ * @param reversed. Direction of the slice
+ * @param count. Maximum number of columns to return
+ * @param consistency_level. Consistency level to perform the operation at
+ */
+public class MultiSliceRequest implements org.apache.thrift.TBase<MultiSliceRequest, MultiSliceRequest._Fields>, java.io.Serializable, Cloneable, Comparable<MultiSliceRequest> {
+  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("MultiSliceRequest");
+
+  private static final org.apache.thrift.protocol.TField KEY_FIELD_DESC = new org.apache.thrift.protocol.TField("key", org.apache.thrift.protocol.TType.STRING, (short)1);
+  private static final org.apache.thrift.protocol.TField COLUMN_PARENT_FIELD_DESC = new org.apache.thrift.protocol.TField("column_parent", org.apache.thrift.protocol.TType.STRUCT, (short)2);
+  private static final org.apache.thrift.protocol.TField COLUMN_SLICES_FIELD_DESC = new org.apache.thrift.protocol.TField("column_slices", org.apache.thrift.protocol.TType.LIST, (short)3);
+  private static final org.apache.thrift.protocol.TField REVERSED_FIELD_DESC = new org.apache.thrift.protocol.TField("reversed", org.apache.thrift.protocol.TType.BOOL, (short)4);
+  private static final org.apache.thrift.protocol.TField COUNT_FIELD_DESC = new org.apache.thrift.protocol.TField("count", org.apache.thrift.protocol.TType.I32, (short)5);
+  private static final org.apache.thrift.protocol.TField CONSISTENCY_LEVEL_FIELD_DESC = new org.apache.thrift.protocol.TField("consistency_level", org.apache.thrift.protocol.TType.I32, (short)6);
+
+  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+  static {
+    schemes.put(StandardScheme.class, new MultiSliceRequestStandardSchemeFactory());
+    schemes.put(TupleScheme.class, new MultiSliceRequestTupleSchemeFactory());
+  }
+
+  public ByteBuffer key; // optional
+  public ColumnParent column_parent; // optional
+  public List<ColumnSlice> column_slices; // optional
+  public boolean reversed; // optional
+  public int count; // optional
+  /**
+   * 
+   * @see ConsistencyLevel
+   */
+  public ConsistencyLevel consistency_level; // optional
+
+  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+    KEY((short)1, "key"),
+    COLUMN_PARENT((short)2, "column_parent"),
+    COLUMN_SLICES((short)3, "column_slices"),
+    REVERSED((short)4, "reversed"),
+    COUNT((short)5, "count"),
+    /**
+     * 
+     * @see ConsistencyLevel
+     */
+    CONSISTENCY_LEVEL((short)6, "consistency_level");
+
+    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+    static {
+      for (_Fields field : EnumSet.allOf(_Fields.class)) {
+        byName.put(field.getFieldName(), field);
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, or null if its not found.
+     */
+    public static _Fields findByThriftId(int fieldId) {
+      switch(fieldId) {
+        case 1: // KEY
+          return KEY;
+        case 2: // COLUMN_PARENT
+          return COLUMN_PARENT;
+        case 3: // COLUMN_SLICES
+          return COLUMN_SLICES;
+        case 4: // REVERSED
+          return REVERSED;
+        case 5: // COUNT
+          return COUNT;
+        case 6: // CONSISTENCY_LEVEL
+          return CONSISTENCY_LEVEL;
+        default:
+          return null;
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, throwing an exception
+     * if it is not found.
+     */
+    public static _Fields findByThriftIdOrThrow(int fieldId) {
+      _Fields fields = findByThriftId(fieldId);
+      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+      return fields;
+    }
+
+    /**
+     * Find the _Fields constant that matches name, or null if its not found.
+     */
+    public static _Fields findByName(String name) {
+      return byName.get(name);
+    }
+
+    private final short _thriftId;
+    private final String _fieldName;
+
+    _Fields(short thriftId, String fieldName) {
+      _thriftId = thriftId;
+      _fieldName = fieldName;
+    }
+
+    public short getThriftFieldId() {
+      return _thriftId;
+    }
+
+    public String getFieldName() {
+      return _fieldName;
+    }
+  }
+
+  // isset id assignments
+  private static final int __REVERSED_ISSET_ID = 0;
+  private static final int __COUNT_ISSET_ID = 1;
+  private byte __isset_bitfield = 0;
+  private _Fields optionals[] = {_Fields.KEY,_Fields.COLUMN_PARENT,_Fields.COLUMN_SLICES,_Fields.REVERSED,_Fields.COUNT,_Fields.CONSISTENCY_LEVEL};
+  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+  static {
+    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+    tmpMap.put(_Fields.KEY, new org.apache.thrift.meta_data.FieldMetaData("key", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
+    tmpMap.put(_Fields.COLUMN_PARENT, new org.apache.thrift.meta_data.FieldMetaData("column_parent", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, ColumnParent.class)));
+    tmpMap.put(_Fields.COLUMN_SLICES, new org.apache.thrift.meta_data.FieldMetaData("column_slices", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
+            new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, ColumnSlice.class))));
+    tmpMap.put(_Fields.REVERSED, new org.apache.thrift.meta_data.FieldMetaData("reversed", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
+    tmpMap.put(_Fields.COUNT, new org.apache.thrift.meta_data.FieldMetaData("count", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+    tmpMap.put(_Fields.CONSISTENCY_LEVEL, new org.apache.thrift.meta_data.FieldMetaData("consistency_level", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.EnumMetaData(org.apache.thrift.protocol.TType.ENUM, ConsistencyLevel.class)));
+    metaDataMap = Collections.unmodifiableMap(tmpMap);
+    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(MultiSliceRequest.class, metaDataMap);
+  }
+
+  public MultiSliceRequest() {
+    this.reversed = false;
+
+    this.count = 1000;
+
+    this.consistency_level = org.apache.cassandra.thrift.ConsistencyLevel.ONE;
+
+  }
+
+  /**
+   * Performs a deep copy on <i>other</i>.
+   */
+  public MultiSliceRequest(MultiSliceRequest other) {
+    __isset_bitfield = other.__isset_bitfield;
+    if (other.isSetKey()) {
+      this.key = org.apache.thrift.TBaseHelper.copyBinary(other.key);
+;
+    }
+    if (other.isSetColumn_parent()) {
+      this.column_parent = new ColumnParent(other.column_parent);
+    }
+    if (other.isSetColumn_slices()) {
+      List<ColumnSlice> __this__column_slices = new ArrayList<ColumnSlice>(other.column_slices.size());
+      for (ColumnSlice other_element : other.column_slices) {
+        __this__column_slices.add(new ColumnSlice(other_element));
+      }
+      this.column_slices = __this__column_slices;
+    }
+    this.reversed = other.reversed;
+    this.count = other.count;
+    if (other.isSetConsistency_level()) {
+      this.consistency_level = other.consistency_level;
+    }
+  }
+
+  public MultiSliceRequest deepCopy() {
+    return new MultiSliceRequest(this);
+  }
+
+  @Override
+  public void clear() {
+    this.key = null;
+    this.column_parent = null;
+    this.column_slices = null;
+    this.reversed = false;
+
+    this.count = 1000;
+
+    this.consistency_level = org.apache.cassandra.thrift.ConsistencyLevel.ONE;
+
+  }
+
+  public byte[] getKey() {
+    setKey(org.apache.thrift.TBaseHelper.rightSize(key));
+    return key == null ? null : key.array();
+  }
+
+  public ByteBuffer bufferForKey() {
+    return key;
+  }
+
+  public MultiSliceRequest setKey(byte[] key) {
+    setKey(key == null ? (ByteBuffer)null : ByteBuffer.wrap(key));
+    return this;
+  }
+
+  public MultiSliceRequest setKey(ByteBuffer key) {
+    this.key = key;
+    return this;
+  }
+
+  public void unsetKey() {
+    this.key = null;
+  }
+
+  /** Returns true if field key is set (has been assigned a value) and false otherwise */
+  public boolean isSetKey() {
+    return this.key != null;
+  }
+
+  public void setKeyIsSet(boolean value) {
+    if (!value) {
+      this.key = null;
+    }
+  }
+
+  public ColumnParent getColumn_parent() {
+    return this.column_parent;
+  }
+
+  public MultiSliceRequest setColumn_parent(ColumnParent column_parent) {
+    this.column_parent = column_parent;
+    return this;
+  }
+
+  public void unsetColumn_parent() {
+    this.column_parent = null;
+  }
+
+  /** Returns true if field column_parent is set (has been assigned a value) and false otherwise */
+  public boolean isSetColumn_parent() {
+    return this.column_parent != null;
+  }
+
+  public void setColumn_parentIsSet(boolean value) {
+    if (!value) {
+      this.column_parent = null;
+    }
+  }
+
+  public int getColumn_slicesSize() {
+    return (this.column_slices == null) ? 0 : this.column_slices.size();
+  }
+
+  public java.util.Iterator<ColumnSlice> getColumn_slicesIterator() {
+    return (this.column_slices == null) ? null : this.column_slices.iterator();
+  }
+
+  public void addToColumn_slices(ColumnSlice elem) {
+    if (this.column_slices == null) {
+      this.column_slices = new ArrayList<ColumnSlice>();
+    }
+    this.column_slices.add(elem);
+  }
+
+  public List<ColumnSlice> getColumn_slices() {
+    return this.column_slices;
+  }
+
+  public MultiSliceRequest setColumn_slices(List<ColumnSlice> column_slices) {
+    this.column_slices = column_slices;
+    return this;
+  }
+
+  public void unsetColumn_slices() {
+    this.column_slices = null;
+  }
+
+  /** Returns true if field column_slices is set (has been assigned a value) and false otherwise */
+  public boolean isSetColumn_slices() {
+    return this.column_slices != null;
+  }
+
+  public void setColumn_slicesIsSet(boolean value) {
+    if (!value) {
+      this.column_slices = null;
+    }
+  }
+
+  public boolean isReversed() {
+    return this.reversed;
+  }
+
+  public MultiSliceRequest setReversed(boolean reversed) {
+    this.reversed = reversed;
+    setReversedIsSet(true);
+    return this;
+  }
+
+  public void unsetReversed() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __REVERSED_ISSET_ID);
+  }
+
+  /** Returns true if field reversed is set (has been assigned a value) and false otherwise */
+  public boolean isSetReversed() {
+    return EncodingUtils.testBit(__isset_bitfield, __REVERSED_ISSET_ID);
+  }
+
+  public void setReversedIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __REVERSED_ISSET_ID, value);
+  }
+
+  public int getCount() {
+    return this.count;
+  }
+
+  public MultiSliceRequest setCount(int count) {
+    this.count = count;
+    setCountIsSet(true);
+    return this;
+  }
+
+  public void unsetCount() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __COUNT_ISSET_ID);
+  }
+
+  /** Returns true if field count is set (has been assigned a value) and false otherwise */
+  public boolean isSetCount() {
+    return EncodingUtils.testBit(__isset_bitfield, __COUNT_ISSET_ID);
+  }
+
+  public void setCountIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __COUNT_ISSET_ID, value);
+  }
+
+  /**
+   * 
+   * @see ConsistencyLevel
+   */
+  public ConsistencyLevel getConsistency_level() {
+    return this.consistency_level;
+  }
+
+  /**
+   * 
+   * @see ConsistencyLevel
+   */
+  public MultiSliceRequest setConsistency_level(ConsistencyLevel consistency_level) {
+    this.consistency_level = consistency_level;
+    return this;
+  }
+
+  public void unsetConsistency_level() {
+    this.consistency_level = null;
+  }
+
+  /** Returns true if field consistency_level is set (has been assigned a value) and false otherwise */
+  public boolean isSetConsistency_level() {
+    return this.consistency_level != null;
+  }
+
+  public void setConsistency_levelIsSet(boolean value) {
+    if (!value) {
+      this.consistency_level = null;
+    }
+  }
+
+  public void setFieldValue(_Fields field, Object value) {
+    switch (field) {
+    case KEY:
+      if (value == null) {
+        unsetKey();
+      } else {
+        setKey((ByteBuffer)value);
+      }
+      break;
+
+    case COLUMN_PARENT:
+      if (value == null) {
+        unsetColumn_parent();
+      } else {
+        setColumn_parent((ColumnParent)value);
+      }
+      break;
+
+    case COLUMN_SLICES:
+      if (value == null) {
+        unsetColumn_slices();
+      } else {
+        setColumn_slices((List<ColumnSlice>)value);
+      }
+      break;
+
+    case REVERSED:
+      if (value == null) {
+        unsetReversed();
+      } else {
+        setReversed((Boolean)value);
+      }
+      break;
+
+    case COUNT:
+      if (value == null) {
+        unsetCount();
+      } else {
+        setCount((Integer)value);
+      }
+      break;
+
+    case CONSISTENCY_LEVEL:
+      if (value == null) {
+        unsetConsistency_level();
+      } else {
+        setConsistency_level((ConsistencyLevel)value);
+      }
+      break;
+
+    }
+  }
+
+  public Object getFieldValue(_Fields field) {
+    switch (field) {
+    case KEY:
+      return getKey();
+
+    case COLUMN_PARENT:
+      return getColumn_parent();
+
+    case COLUMN_SLICES:
+      return getColumn_slices();
+
+    case REVERSED:
+      return Boolean.valueOf(isReversed());
+
+    case COUNT:
+      return Integer.valueOf(getCount());
+
+    case CONSISTENCY_LEVEL:
+      return getConsistency_level();
+
+    }
+    throw new IllegalStateException();
+  }
+
+  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+  public boolean isSet(_Fields field) {
+    if (field == null) {
+      throw new IllegalArgumentException();
+    }
+
+    switch (field) {
+    case KEY:
+      return isSetKey();
+    case COLUMN_PARENT:
+      return isSetColumn_parent();
+    case COLUMN_SLICES:
+      return isSetColumn_slices();
+    case REVERSED:
+      return isSetReversed();
+    case COUNT:
+      return isSetCount();
+    case CONSISTENCY_LEVEL:
+      return isSetConsistency_level();
+    }
+    throw new IllegalStateException();
+  }
+
+  @Override
+  public boolean equals(Object that) {
+    if (that == null)
+      return false;
+    if (that instanceof MultiSliceRequest)
+      return this.equals((MultiSliceRequest)that);
+    return false;
+  }
+
+  public boolean equals(MultiSliceRequest that) {
+    if (that == null)
+      return false;
+
+    boolean this_present_key = true && this.isSetKey();
+    boolean that_present_key = true && that.isSetKey();
+    if (this_present_key || that_present_key) {
+      if (!(this_present_key && that_present_key))
+        return false;
+      if (!this.key.equals(that.key))
+        return false;
+    }
+
+    boolean this_present_column_parent = true && this.isSetColumn_parent();
+    boolean that_present_column_parent = true && that.isSetColumn_parent();
+    if (this_present_column_parent || that_present_column_parent) {
+      if (!(this_present_column_parent && that_present_column_parent))
+        return false;
+      if (!this.column_parent.equals(that.column_parent))
+        return false;
+    }
+
+    boolean this_present_column_slices = true && this.isSetColumn_slices();
+    boolean that_present_column_slices = true && that.isSetColumn_slices();
+    if (this_present_column_slices || that_present_column_slices) {
+      if (!(this_present_column_slices && that_present_column_slices))
+        return false;
+      if (!this.column_slices.equals(that.column_slices))
+        return false;
+    }
+
+    boolean this_present_reversed = true && this.isSetReversed();
+    boolean that_present_reversed = true && that.isSetReversed();
+    if (this_present_reversed || that_present_reversed) {
+      if (!(this_present_reversed && that_present_reversed))
+        return false;
+      if (this.reversed != that.reversed)
+        return false;
+    }
+
+    boolean this_present_count = true && this.isSetCount();
+    boolean that_present_count = true && that.isSetCount();
+    if (this_present_count || that_present_count) {
+      if (!(this_present_count && that_present_count))
+        return false;
+      if (this.count != that.count)
+        return false;
+    }
+
+    boolean this_present_consistency_level = true && this.isSetConsistency_level();
+    boolean that_present_consistency_level = true && that.isSetConsistency_level();
+    if (this_present_consistency_level || that_present_consistency_level) {
+      if (!(this_present_consistency_level && that_present_consistency_level))
+        return false;
+      if (!this.consistency_level.equals(that.consistency_level))
+        return false;
+    }
+
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    HashCodeBuilder builder = new HashCodeBuilder();
+
+    boolean present_key = true && (isSetKey());
+    builder.append(present_key);
+    if (present_key)
+      builder.append(key);
+
+    boolean present_column_parent = true && (isSetColumn_parent());
+    builder.append(present_column_parent);
+    if (present_column_parent)
+      builder.append(column_parent);
+
+    boolean present_column_slices = true && (isSetColumn_slices());
+    builder.append(present_column_slices);
+    if (present_column_slices)
+      builder.append(column_slices);
+
+    boolean present_reversed = true && (isSetReversed());
+    builder.append(present_reversed);
+    if (present_reversed)
+      builder.append(reversed);
+
+    boolean present_count = true && (isSetCount());
+    builder.append(present_count);
+    if (present_count)
+      builder.append(count);
+
+    boolean present_consistency_level = true && (isSetConsistency_level());
+    builder.append(present_consistency_level);
+    if (present_consistency_level)
+      builder.append(consistency_level.getValue());
+
+    return builder.toHashCode();
+  }
+
+  @Override
+  public int compareTo(MultiSliceRequest other) {
+    if (!getClass().equals(other.getClass())) {
+      return getClass().getName().compareTo(other.getClass().getName());
+    }
+
+    int lastComparison = 0;
+
+    lastComparison = Boolean.valueOf(isSetKey()).compareTo(other.isSetKey());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetKey()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.key, other.key);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetColumn_parent()).compareTo(other.isSetColumn_parent());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetColumn_parent()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.column_parent, other.column_parent);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetColumn_slices()).compareTo(other.isSetColumn_slices());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetColumn_slices()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.column_slices, other.column_slices);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetReversed()).compareTo(other.isSetReversed());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetReversed()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.reversed, other.reversed);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetCount()).compareTo(other.isSetCount());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetCount()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.count, other.count);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetConsistency_level()).compareTo(other.isSetConsistency_level());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetConsistency_level()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.consistency_level, other.consistency_level);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    return 0;
+  }
+
+  public _Fields fieldForId(int fieldId) {
+    return _Fields.findByThriftId(fieldId);
+  }
+
+  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+  }
+
+  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("MultiSliceRequest(");
+    boolean first = true;
+
+    if (isSetKey()) {
+      sb.append("key:");
+      if (this.key == null) {
+        sb.append("null");
+      } else {
+        org.apache.thrift.TBaseHelper.toString(this.key, sb);
+      }
+      first = false;
+    }
+    if (isSetColumn_parent()) {
+      if (!first) sb.append(", ");
+      sb.append("column_parent:");
+      if (this.column_parent == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.column_parent);
+      }
+      first = false;
+    }
+    if (isSetColumn_slices()) {
+      if (!first) sb.append(", ");
+      sb.append("column_slices:");
+      if (this.column_slices == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.column_slices);
+      }
+      first = false;
+    }
+    if (isSetReversed()) {
+      if (!first) sb.append(", ");
+      sb.append("reversed:");
+      sb.append(this.reversed);
+      first = false;
+    }
+    if (isSetCount()) {
+      if (!first) sb.append(", ");
+      sb.append("count:");
+      sb.append(this.count);
+      first = false;
+    }
+    if (isSetConsistency_level()) {
+      if (!first) sb.append(", ");
+      sb.append("consistency_level:");
+      if (this.consistency_level == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.consistency_level);
+      }
+      first = false;
+    }
+    sb.append(")");
+    return sb.toString();
+  }
+
+  public void validate() throws org.apache.thrift.TException {
+    // check for required fields
+    // check for sub-struct validity
+    if (column_parent != null) {
+      column_parent.validate();
+    }
+  }
+
+  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+    try {
+      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+    try {
+      // it doesn't seem like you should have to do this, but java serialization is wacky, and doesn't call the default constructor.
+      __isset_bitfield = 0;
+      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private static class MultiSliceRequestStandardSchemeFactory implements SchemeFactory {
+    public MultiSliceRequestStandardScheme getScheme() {
+      return new MultiSliceRequestStandardScheme();
+    }
+  }
+
+  private static class MultiSliceRequestStandardScheme extends StandardScheme<MultiSliceRequest> {
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot, MultiSliceRequest struct) throws org.apache.thrift.TException {
+      org.apache.thrift.protocol.TField schemeField;
+      iprot.readStructBegin();
+      while (true)
+      {
+        schemeField = iprot.readFieldBegin();
+        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+          break;
+        }
+        switch (schemeField.id) {
+          case 1: // KEY
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.key = iprot.readBinary();
+              struct.setKeyIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 2: // COLUMN_PARENT
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRUCT) {
+              struct.column_parent = new ColumnParent();
+              struct.column_parent.read(iprot);
+              struct.setColumn_parentIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 3: // COLUMN_SLICES
+            if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
+              {
+                org.apache.thrift.protocol.TList _list216 = iprot.readListBegin();
+                struct.column_slices = new ArrayList<ColumnSlice>(_list216.size);
+                for (int _i217 = 0; _i217 < _list216.size; ++_i217)
+                {
+                  ColumnSlice _elem218;
+                  _elem218 = new ColumnSlice();
+                  _elem218.read(iprot);
+                  struct.column_slices.add(_elem218);
+                }
+                iprot.readListEnd();
+              }
+              struct.setColumn_slicesIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 4: // REVERSED
+            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
+              struct.reversed = iprot.readBool();
+              struct.setReversedIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 5: // COUNT
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.count = iprot.readI32();
+              struct.setCountIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 6: // CONSISTENCY_LEVEL
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.consistency_level = ConsistencyLevel.findByValue(iprot.readI32());
+              struct.setConsistency_levelIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          default:
+            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+        }
+        iprot.readFieldEnd();
+      }
+      iprot.readStructEnd();
+
+      // check for required fields of primitive type, which can't be checked in the validate method
+      struct.validate();
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot, MultiSliceRequest struct) throws org.apache.thrift.TException {
+      struct.validate();
+
+      oprot.writeStructBegin(STRUCT_DESC);
+      if (struct.key != null) {
+        if (struct.isSetKey()) {
+          oprot.writeFieldBegin(KEY_FIELD_DESC);
+          oprot.writeBinary(struct.key);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.column_parent != null) {
+        if (struct.isSetColumn_parent()) {
+          oprot.writeFieldBegin(COLUMN_PARENT_FIELD_DESC);
+          struct.column_parent.write(oprot);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.column_slices != null) {
+        if (struct.isSetColumn_slices()) {
+          oprot.writeFieldBegin(COLUMN_SLICES_FIELD_DESC);
+          {
+            oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, struct.column_slices.size()));
+            for (ColumnSlice _iter219 : struct.column_slices)
+            {
+              _iter219.write(oprot);
+            }
+            oprot.writeListEnd();
+          }
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.isSetReversed()) {
+        oprot.writeFieldBegin(REVERSED_FIELD_DESC);
+        oprot.writeBool(struct.reversed);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetCount()) {
+        oprot.writeFieldBegin(COUNT_FIELD_DESC);
+        oprot.writeI32(struct.count);
+        oprot.writeFieldEnd();
+      }
+      if (struct.consistency_level != null) {
+        if (struct.isSetConsistency_level()) {
+          oprot.writeFieldBegin(CONSISTENCY_LEVEL_FIELD_DESC);
+          oprot.writeI32(struct.consistency_level.getValue());
+          oprot.writeFieldEnd();
+        }
+      }
+      oprot.writeFieldStop();
+      oprot.writeStructEnd();
+    }
+
+  }
+
+  private static class MultiSliceRequestTupleSchemeFactory implements SchemeFactory {
+    public MultiSliceRequestTupleScheme getScheme() {
+      return new MultiSliceRequestTupleScheme();
+    }
+  }
+
+  private static class MultiSliceRequestTupleScheme extends TupleScheme<MultiSliceRequest> {
+
+    @Override
+    public void write(org.apache.thrift.protocol.TProtocol prot, MultiSliceRequest struct) throws org.apache.thrift.TException {
+      TTupleProtocol oprot = (TTupleProtocol) prot;
+      BitSet optionals = new BitSet();
+      if (struct.isSetKey()) {
+        optionals.set(0);
+      }
+      if (struct.isSetColumn_parent()) {
+        optionals.set(1);
+      }
+      if (struct.isSetColumn_slices()) {
+        optionals.set(2);
+      }
+      if (struct.isSetReversed()) {
+        optionals.set(3);
+      }
+      if (struct.isSetCount()) {
+        optionals.set(4);
+      }
+      if (struct.isSetConsistency_level()) {
+        optionals.set(5);
+      }
+      oprot.writeBitSet(optionals, 6);
+      if (struct.isSetKey()) {
+        oprot.writeBinary(struct.key);
+      }
+      if (struct.isSetColumn_parent()) {
+        struct.column_parent.write(oprot);
+      }
+      if (struct.isSetColumn_slices()) {
+        {
+          oprot.writeI32(struct.column_slices.size());
+          for (ColumnSlice _iter220 : struct.column_slices)
+          {
+            _iter220.write(oprot);
+          }
+        }
+      }
+      if (struct.isSetReversed()) {
+        oprot.writeBool(struct.reversed);
+      }
+      if (struct.isSetCount()) {
+        oprot.writeI32(struct.count);
+      }
+      if (struct.isSetConsistency_level()) {
+        oprot.writeI32(struct.consistency_level.getValue());
+      }
+    }
+
+    @Override
+    public void read(org.apache.thrift.protocol.TProtocol prot, MultiSliceRequest struct) throws org.apache.thrift.TException {
+      TTupleProtocol iprot = (TTupleProtocol) prot;
+      BitSet incoming = iprot.readBitSet(6);
+      if (incoming.get(0)) {
+        struct.key = iprot.readBinary();
+        struct.setKeyIsSet(true);
+      }
+      if (incoming.get(1)) {
+        struct.column_parent = new ColumnParent();
+        struct.column_parent.read(iprot);
+        struct.setColumn_parentIsSet(true);
+      }
+      if (incoming.get(2)) {
+        {
+          org.apache.thrift.protocol.TList _list221 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+          struct.column_slices = new ArrayList<ColumnSlice>(_list221.size);
+          for (int _i222 = 0; _i222 < _list221.size; ++_i222)
+          {
+            ColumnSlice _elem223;
+            _elem223 = new ColumnSlice();
+            _elem223.read(iprot);
+            struct.column_slices.add(_elem223);
+          }
+        }
+        struct.setColumn_slicesIsSet(true);
+      }
+      if (incoming.get(3)) {
+        struct.reversed = iprot.readBool();
+        struct.setReversedIsSet(true);
+      }
+      if (incoming.get(4)) {
+        struct.count = iprot.readI32();
+        struct.setCountIsSet(true);
+      }
+      if (incoming.get(5)) {
+        struct.consistency_level = ConsistencyLevel.findByValue(iprot.readI32());
+        struct.setConsistency_levelIsSet(true);
+      }
+    }
+  }
+
+}
+
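For context, here is a minimal sketch (not part of the patch) of how a client might populate the struct defined above before handing it to the get_multi_slice RPC this type was added for; the column family name, row key and slice bounds are purely illustrative:

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import org.apache.cassandra.thrift.ColumnParent;
    import org.apache.cassandra.thrift.ColumnSlice;
    import org.apache.cassandra.thrift.ConsistencyLevel;
    import org.apache.cassandra.thrift.MultiSliceRequest;

    public class MultiSliceRequestExample
    {
        static MultiSliceRequest buildRequest()
        {
            // Two disjoint slices over the same row: columns [a, c] and [m, q].
            ColumnSlice first = new ColumnSlice()
                .setStart("a".getBytes(StandardCharsets.UTF_8))
                .setFinish("c".getBytes(StandardCharsets.UTF_8));
            ColumnSlice second = new ColumnSlice()
                .setStart("m".getBytes(StandardCharsets.UTF_8))
                .setFinish("q".getBytes(StandardCharsets.UTF_8));

            return new MultiSliceRequest()
                .setKey(ByteBuffer.wrap("row1".getBytes(StandardCharsets.UTF_8)))
                .setColumn_parent(new ColumnParent("Standard1")) // super columns are unsupported
                .setColumn_slices(Arrays.asList(first, second))
                .setReversed(false)                              // slice in forward comparator order
                .setCount(1000)                                  // maximum number of columns to return
                .setConsistency_level(ConsistencyLevel.ONE);
        }
    }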
diff --git a/lib/airline-0.6.jar b/lib/airline-0.6.jar
new file mode 100644
index 0000000..a35ae79
--- /dev/null
+++ b/lib/airline-0.6.jar
Binary files differ
diff --git a/lib/antlr-3.2.jar b/lib/antlr-3.2.jar
deleted file mode 100644
index fdd167d..0000000
--- a/lib/antlr-3.2.jar
+++ /dev/null
Binary files differ
diff --git a/lib/antlr-runtime-3.5.2.jar b/lib/antlr-runtime-3.5.2.jar
new file mode 100644
index 0000000..d48e3e8
--- /dev/null
+++ b/lib/antlr-runtime-3.5.2.jar
Binary files differ
diff --git a/lib/cassandra-driver-internal-only-2.1.1.post.zip b/lib/cassandra-driver-internal-only-2.1.1.post.zip
new file mode 100644
index 0000000..94b5d4c
--- /dev/null
+++ b/lib/cassandra-driver-internal-only-2.1.1.post.zip
Binary files differ
diff --git a/lib/commons-math3-3.2.jar b/lib/commons-math3-3.2.jar
new file mode 100644
index 0000000..f8b7db2
--- /dev/null
+++ b/lib/commons-math3-3.2.jar
Binary files differ
diff --git a/lib/concurrentlinkedhashmap-lru-1.3.jar b/lib/concurrentlinkedhashmap-lru-1.3.jar
deleted file mode 100644
index 7fbdf9f..0000000
--- a/lib/concurrentlinkedhashmap-lru-1.3.jar
+++ /dev/null
Binary files differ
diff --git a/lib/concurrentlinkedhashmap-lru-1.4.jar b/lib/concurrentlinkedhashmap-lru-1.4.jar
new file mode 100644
index 0000000..572b258
--- /dev/null
+++ b/lib/concurrentlinkedhashmap-lru-1.4.jar
Binary files differ
diff --git a/lib/cql-internal-only-1.4.1.zip b/lib/cql-internal-only-1.4.1.zip
deleted file mode 100644
index fa33a3a..0000000
--- a/lib/cql-internal-only-1.4.1.zip
+++ /dev/null
Binary files differ
diff --git a/lib/futures-2.1.6-py2.py3-none-any.zip b/lib/futures-2.1.6-py2.py3-none-any.zip
new file mode 100644
index 0000000..50bd78b
--- /dev/null
+++ b/lib/futures-2.1.6-py2.py3-none-any.zip
Binary files differ
diff --git a/lib/guava-15.0.jar b/lib/guava-15.0.jar
deleted file mode 100644
index eb9ef8a..0000000
--- a/lib/guava-15.0.jar
+++ /dev/null
Binary files differ
diff --git a/lib/guava-16.0.jar b/lib/guava-16.0.jar
new file mode 100644
index 0000000..7afcb10
--- /dev/null
+++ b/lib/guava-16.0.jar
Binary files differ
diff --git a/lib/high-scale-lib-1.0.6.jar b/lib/high-scale-lib-1.0.6.jar
new file mode 100644
index 0000000..5269294
--- /dev/null
+++ b/lib/high-scale-lib-1.0.6.jar
Binary files differ
diff --git a/lib/high-scale-lib-1.1.2.jar b/lib/high-scale-lib-1.1.2.jar
deleted file mode 100644
index 6343a70..0000000
--- a/lib/high-scale-lib-1.1.2.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jamm-0.2.5.jar b/lib/jamm-0.2.5.jar
deleted file mode 100644
index ef8750d..0000000
--- a/lib/jamm-0.2.5.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jamm-0.2.6.jar b/lib/jamm-0.2.6.jar
new file mode 100644
index 0000000..04e5c2e
--- /dev/null
+++ b/lib/jamm-0.2.6.jar
Binary files differ
diff --git a/lib/javax.inject.jar b/lib/javax.inject.jar
new file mode 100644
index 0000000..1ff61ce
--- /dev/null
+++ b/lib/javax.inject.jar
Binary files differ
diff --git a/lib/jna-4.0.0.jar b/lib/jna-4.0.0.jar
new file mode 100644
index 0000000..9038048
--- /dev/null
+++ b/lib/jna-4.0.0.jar
Binary files differ
diff --git a/lib/licenses/netty-3.6.6.txt b/lib/licenses/airline-0.6.txt
similarity index 100%
rename from lib/licenses/netty-3.6.6.txt
rename to lib/licenses/airline-0.6.txt
diff --git a/lib/licenses/antlr-3.2.txt b/lib/licenses/antlr-runtime-3.5.2.txt
similarity index 100%
rename from lib/licenses/antlr-3.2.txt
rename to lib/licenses/antlr-runtime-3.5.2.txt
diff --git a/lib/licenses/guava-15.0.txt b/lib/licenses/cassandra-driver-2.1.0b1.post.txt
similarity index 89%
copy from lib/licenses/guava-15.0.txt
copy to lib/licenses/cassandra-driver-2.1.0b1.post.txt
index d645695..f433b1a 100644
--- a/lib/licenses/guava-15.0.txt
+++ b/lib/licenses/cassandra-driver-2.1.0b1.post.txt
@@ -175,28 +175,3 @@
       of your accepting any such warranty or additional liability.
 
    END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/lib/licenses/cql-1.4.0.txt b/lib/licenses/commons-math3-3.2.txt
similarity index 100%
copy from lib/licenses/cql-1.4.0.txt
copy to lib/licenses/commons-math3-3.2.txt
diff --git a/lib/licenses/concurrentlinkedhashmap-lru-1.3.txt b/lib/licenses/concurrentlinkedhashmap-lru-1.4.txt
similarity index 100%
rename from lib/licenses/concurrentlinkedhashmap-lru-1.3.txt
rename to lib/licenses/concurrentlinkedhashmap-lru-1.4.txt
diff --git a/lib/licenses/futures-2.1.6.txt b/lib/licenses/futures-2.1.6.txt
new file mode 100644
index 0000000..c430db0
--- /dev/null
+++ b/lib/licenses/futures-2.1.6.txt
@@ -0,0 +1,21 @@
+Copyright 2009 Brian Quinlan. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY BRIAN QUINLAN "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/lib/licenses/cql-1.4.0.txt b/lib/licenses/guava-16.0.txt
similarity index 100%
rename from lib/licenses/cql-1.4.0.txt
rename to lib/licenses/guava-16.0.txt
diff --git a/lib/licenses/high-scale-lib-1.1.2.txt b/lib/licenses/high-scale-lib-1.0.6.txt
similarity index 100%
rename from lib/licenses/high-scale-lib-1.1.2.txt
rename to lib/licenses/high-scale-lib-1.0.6.txt
diff --git a/lib/licenses/netty-3.6.6.txt b/lib/licenses/jamm-0.2.6.txt
similarity index 100%
copy from lib/licenses/netty-3.6.6.txt
copy to lib/licenses/jamm-0.2.6.txt
diff --git a/lib/licenses/cql-1.4.0.txt b/lib/licenses/javax.inject.txt
similarity index 100%
copy from lib/licenses/cql-1.4.0.txt
copy to lib/licenses/javax.inject.txt
diff --git a/lib/licenses/guava-15.0.txt b/lib/licenses/jna-4.0.0.txt
similarity index 99%
rename from lib/licenses/guava-15.0.txt
rename to lib/licenses/jna-4.0.0.txt
index d645695..7a4a3ea 100644
--- a/lib/licenses/guava-15.0.txt
+++ b/lib/licenses/jna-4.0.0.txt
@@ -199,4 +199,4 @@
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
\ No newline at end of file
diff --git a/lib/licenses/log4j-1.2.16.txt b/lib/licenses/log4j-1.2.16.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/log4j-1.2.16.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/lib/licenses/logback-classic-1.1.2.txt b/lib/licenses/logback-classic-1.1.2.txt
new file mode 100644
index 0000000..79e486c
--- /dev/null
+++ b/lib/licenses/logback-classic-1.1.2.txt
@@ -0,0 +1,70 @@
+Eclipse Public License - v 1.0
+
+THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
+
+1. DEFINITIONS
+
+"Contribution" means:
+
+a) in the case of the initial Contributor, the initial code and documentation distributed under this Agreement, and
+b) in the case of each subsequent Contributor:
+i) changes to the Program, and
+ii) additions to the Program;
+where such changes and/or additions to the Program originate from and are distributed by that particular Contributor. A Contribution 'originates' from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include additions to the Program which: (i) are separate modules of software distributed in conjunction with the Program under their own license agreement, and (ii) are not derivative works of the Program.
+"Contributor" means any person or entity that distributes the Program.
+
+"Licensed Patents" mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program.
+
+"Program" means the Contributions distributed in accordance with this Agreement.
+
+"Recipient" means anyone who receives the Program under this Agreement, including all Contributors.
+
+2. GRANT OF RIGHTS
+
+a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, distribute and sublicense the Contribution of such Contributor, if any, and such derivative works, in source code and object code form.
+b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in source code and object code form. This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The patent license shall not apply to any other combinations which include the Contribution. No hardware per se is licensed hereunder.
+c) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program.
+d) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement.
+3. REQUIREMENTS
+
+A Contributor may choose to distribute the Program in object code form under its own license agreement, provided that:
+
+a) it complies with the terms and conditions of this Agreement; and
+b) its license agreement:
+i) effectively disclaims on behalf of all Contributors all warranties and conditions, express and implied, including warranties or conditions of title and non-infringement, and implied warranties or conditions of merchantability and fitness for a particular purpose;
+ii) effectively excludes on behalf of all Contributors all liability for damages, including direct, indirect, special, incidental and consequential damages, such as lost profits;
+iii) states that any provisions which differ from this Agreement are offered by that Contributor alone and not by any other party; and
+iv) states that source code for the Program is available from such Contributor, and informs licensees how to obtain it in a reasonable manner on or through a medium customarily used for software exchange.
+When the Program is made available in source code form:
+
+a) it must be made available under this Agreement; and
+b) a copy of this Agreement must be included with each copy of the Program.
+Contributors may not remove or alter any copyright notices contained within the Program.
+
+Each Contributor must identify itself as the originator of its Contribution, if any, in a manner that reasonably allows subsequent Recipients to identify the originator of the Contribution.
+
+4. COMMERCIAL DISTRIBUTION
+
+Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like. While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense.
+
+For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages.
+
+5. NO WARRANTY
+
+EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement , including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations.
+
+6. DISCLAIMER OF LIABILITY
+
+EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+7. GENERAL
+
+If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
+
+If Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed.
+
+All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive.
+
+Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. The Eclipse Foundation is the initial Agreement Steward. The Eclipse Foundation may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved.
+
+This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation.
diff --git a/lib/licenses/logback-core-1.1.2.txt b/lib/licenses/logback-core-1.1.2.txt
new file mode 100644
index 0000000..79e486c
--- /dev/null
+++ b/lib/licenses/logback-core-1.1.2.txt
@@ -0,0 +1,70 @@
+Eclipse Public License - v 1.0
+
+THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
+
+1. DEFINITIONS
+
+"Contribution" means:
+
+a) in the case of the initial Contributor, the initial code and documentation distributed under this Agreement, and
+b) in the case of each subsequent Contributor:
+i) changes to the Program, and
+ii) additions to the Program;
+where such changes and/or additions to the Program originate from and are distributed by that particular Contributor. A Contribution 'originates' from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include additions to the Program which: (i) are separate modules of software distributed in conjunction with the Program under their own license agreement, and (ii) are not derivative works of the Program.
+"Contributor" means any person or entity that distributes the Program.
+
+"Licensed Patents" mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program.
+
+"Program" means the Contributions distributed in accordance with this Agreement.
+
+"Recipient" means anyone who receives the Program under this Agreement, including all Contributors.
+
+2. GRANT OF RIGHTS
+
+a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, distribute and sublicense the Contribution of such Contributor, if any, and such derivative works, in source code and object code form.
+b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in source code and object code form. This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The patent license shall not apply to any other combinations which include the Contribution. No hardware per se is licensed hereunder.
+c) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program.
+d) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement.
+3. REQUIREMENTS
+
+A Contributor may choose to distribute the Program in object code form under its own license agreement, provided that:
+
+a) it complies with the terms and conditions of this Agreement; and
+b) its license agreement:
+i) effectively disclaims on behalf of all Contributors all warranties and conditions, express and implied, including warranties or conditions of title and non-infringement, and implied warranties or conditions of merchantability and fitness for a particular purpose;
+ii) effectively excludes on behalf of all Contributors all liability for damages, including direct, indirect, special, incidental and consequential damages, such as lost profits;
+iii) states that any provisions which differ from this Agreement are offered by that Contributor alone and not by any other party; and
+iv) states that source code for the Program is available from such Contributor, and informs licensees how to obtain it in a reasonable manner on or through a medium customarily used for software exchange.
+When the Program is made available in source code form:
+
+a) it must be made available under this Agreement; and
+b) a copy of this Agreement must be included with each copy of the Program.
+Contributors may not remove or alter any copyright notices contained within the Program.
+
+Each Contributor must identify itself as the originator of its Contribution, if any, in a manner that reasonably allows subsequent Recipients to identify the originator of the Contribution.
+
+4. COMMERCIAL DISTRIBUTION
+
+Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like. While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense.
+
+For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages.
+
+5. NO WARRANTY
+
+EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement , including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations.
+
+6. DISCLAIMER OF LIABILITY
+
+EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+7. GENERAL
+
+If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
+
+If Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed.
+
+All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive.
+
+Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. The Eclipse Foundation is the initial Agreement Steward. The Eclipse Foundation may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved.
+
+This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation.
diff --git a/lib/licenses/netty-3.5.9.txt b/lib/licenses/netty-3.5.9.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/netty-3.5.9.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/lib/licenses/cql-1.4.0.txt b/lib/licenses/netty-all-4.0.23.Final.txt
similarity index 100%
copy from lib/licenses/cql-1.4.0.txt
copy to lib/licenses/netty-all-4.0.23.Final.txt
diff --git a/lib/licenses/servlet-api-2.5-20081211.txt b/lib/licenses/servlet-api-2.5-20081211.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/servlet-api-2.5-20081211.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/lib/licenses/six-1.7.3.txt b/lib/licenses/six-1.7.3.txt
new file mode 100644
index 0000000..d76e024
--- /dev/null
+++ b/lib/licenses/six-1.7.3.txt
@@ -0,0 +1,18 @@
+Copyright (c) 2010-2014 Benjamin Peterson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/licenses/slf4j-log4j12-1.7.2.txt b/lib/licenses/slf4j-log4j12-1.7.2.txt
deleted file mode 100644
index dbfc769..0000000
--- a/lib/licenses/slf4j-log4j12-1.7.2.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Copyright (c) 2004-2008 QOS.ch
-All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-the rights to use, copy, modify, merge, publish, distribute, sublicense,
-and/or sell copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. 
diff --git a/lib/licenses/snaptree-0.1.txt b/lib/licenses/snaptree-0.1.txt
deleted file mode 100644
index 07324c5..0000000
--- a/lib/licenses/snaptree-0.1.txt
+++ /dev/null
@@ -1,776 +0,0 @@
-SNAPTREE LICENSE
-
-Copyright (c) 2009 Stanford University, unless otherwise specified.
-All rights reserved.
-
-This software was developed by the Pervasive Parallelism Laboratory of
-Stanford University, California, USA.
-
-Permission to use, copy, modify, and distribute this software in source
-or binary form for any purpose with or without fee is hereby granted,
-provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-   3. Neither the name of Stanford University nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-
-THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGE.
-
diff --git a/lib/licenses/guava-15.0.txt b/lib/licenses/stream-2.5.2.txt
similarity index 99%
copy from lib/licenses/guava-15.0.txt
copy to lib/licenses/stream-2.5.2.txt
index d645695..c8dc677 100644
--- a/lib/licenses/guava-15.0.txt
+++ b/lib/licenses/stream-2.5.2.txt
@@ -187,7 +187,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2011 Clearspring Technologies
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/lib/licenses/antlr-3.2.txt b/lib/licenses/stringtemplate-4.0.2.txt
similarity index 100%
copy from lib/licenses/antlr-3.2.txt
copy to lib/licenses/stringtemplate-4.0.2.txt
diff --git a/lib/licenses/thrift-python-0.9.1.txt b/lib/licenses/thrift-python-0.9.1.txt
deleted file mode 100644
index 9d189ef..0000000
--- a/lib/licenses/thrift-python-0.9.1.txt
+++ /dev/null
@@ -1,324 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
---------------------------------------------------
-SOFTWARE DISTRIBUTED WITH THRIFT:
-
-The Apache Thrift software includes a number of subcomponents with
-separate copyright notices and license terms. Your use of the source
-code for the these subcomponents is subject to the terms and
-conditions of the following licenses.
-
---------------------------------------------------
-Portions of the following files are licensed under the MIT License:
-
-  lib/erl/src/Makefile.am
-
-Please see doc/otp-base-license.txt for the full terms of this license.
-
-
---------------------------------------------------
-The following files contain some portions of code contributed under
-the Thrift Software License (see doc/old-thrift-license.txt), and relicensed
-under the Apache 2.0 License:
-
-  compiler/cpp/Makefile.am
-  compiler/cpp/src/generate/t_cocoa_generator.cc
-  compiler/cpp/src/generate/t_cpp_generator.cc
-  compiler/cpp/src/generate/t_csharp_generator.cc
-  compiler/cpp/src/generate/t_erl_generator.cc
-  compiler/cpp/src/generate/t_hs_generator.cc
-  compiler/cpp/src/generate/t_java_generator.cc
-  compiler/cpp/src/generate/t_ocaml_generator.cc
-  compiler/cpp/src/generate/t_perl_generator.cc
-  compiler/cpp/src/generate/t_php_generator.cc
-  compiler/cpp/src/generate/t_py_generator.cc
-  compiler/cpp/src/generate/t_rb_generator.cc
-  compiler/cpp/src/generate/t_st_generator.cc
-  compiler/cpp/src/generate/t_xsd_generator.cc
-  compiler/cpp/src/main.cc
-  compiler/cpp/src/parse/t_field.h
-  compiler/cpp/src/parse/t_program.h
-  compiler/cpp/src/platform.h
-  compiler/cpp/src/thriftl.ll
-  compiler/cpp/src/thrifty.yy
-  lib/csharp/src/Protocol/TBinaryProtocol.cs
-  lib/csharp/src/Protocol/TField.cs
-  lib/csharp/src/Protocol/TList.cs
-  lib/csharp/src/Protocol/TMap.cs
-  lib/csharp/src/Protocol/TMessage.cs
-  lib/csharp/src/Protocol/TMessageType.cs
-  lib/csharp/src/Protocol/TProtocol.cs
-  lib/csharp/src/Protocol/TProtocolException.cs
-  lib/csharp/src/Protocol/TProtocolFactory.cs
-  lib/csharp/src/Protocol/TProtocolUtil.cs
-  lib/csharp/src/Protocol/TSet.cs
-  lib/csharp/src/Protocol/TStruct.cs
-  lib/csharp/src/Protocol/TType.cs
-  lib/csharp/src/Server/TServer.cs
-  lib/csharp/src/Server/TSimpleServer.cs
-  lib/csharp/src/Server/TThreadPoolServer.cs
-  lib/csharp/src/TApplicationException.cs
-  lib/csharp/src/Thrift.csproj
-  lib/csharp/src/Thrift.sln
-  lib/csharp/src/TProcessor.cs
-  lib/csharp/src/Transport/TServerSocket.cs
-  lib/csharp/src/Transport/TServerTransport.cs
-  lib/csharp/src/Transport/TSocket.cs
-  lib/csharp/src/Transport/TStreamTransport.cs
-  lib/csharp/src/Transport/TTransport.cs
-  lib/csharp/src/Transport/TTransportException.cs
-  lib/csharp/src/Transport/TTransportFactory.cs
-  lib/csharp/ThriftMSBuildTask/Properties/AssemblyInfo.cs
-  lib/csharp/ThriftMSBuildTask/ThriftBuild.cs
-  lib/csharp/ThriftMSBuildTask/ThriftMSBuildTask.csproj
-  lib/rb/lib/thrift.rb
-  lib/st/README
-  lib/st/thrift.st
-  test/OptionalRequiredTest.cpp
-  test/OptionalRequiredTest.thrift
-  test/ThriftTest.thrift
-
---------------------------------------------------
-For the aclocal/ax_boost_base.m4 and contrib/fb303/aclocal/ax_boost_base.m4 components:
-
-#   Copyright (c) 2007 Thomas Porschberg <thomas@randspringer.de>
-#
-#   Copying and distribution of this file, with or without
-#   modification, are permitted in any medium without royalty provided
-#   the copyright notice and this notice are preserved.
-
---------------------------------------------------
-For the compiler/cpp/src/md5.[ch] components:
-
-/*
-  Copyright (C) 1999, 2000, 2002 Aladdin Enterprises.  All rights reserved.
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-
-  L. Peter Deutsch
-  ghost@aladdin.com
-
- */
-
----------------------------------------------------
-For the lib/rb/setup.rb: Copyright (c) 2000-2005 Minero Aoki,
-lib/ocaml/OCamlMakefile and lib/ocaml/README-OCamlMakefile components:
-     Copyright (C) 1999 - 2007  Markus Mottl
-
-Licensed under the terms of the GNU Lesser General Public License 2.1
-(see doc/lgpl-2.1.txt for the full terms of this license)
diff --git a/lib/licenses/thrift-server-0.3.3.txt b/lib/licenses/thrift-server-0.3.3.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/thrift-server-0.3.3.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/lib/licenses/jamm-0.2.5.txt b/lib/licenses/thrift-server-0.3.5.txt
similarity index 100%
rename from lib/licenses/jamm-0.2.5.txt
rename to lib/licenses/thrift-server-0.3.5.txt
diff --git a/lib/log4j-1.2.16.jar b/lib/log4j-1.2.16.jar
deleted file mode 100644
index 3f9d847..0000000
--- a/lib/log4j-1.2.16.jar
+++ /dev/null
Binary files differ
diff --git a/lib/logback-classic-1.1.2.jar b/lib/logback-classic-1.1.2.jar
new file mode 100644
index 0000000..9230b2a
--- /dev/null
+++ b/lib/logback-classic-1.1.2.jar
Binary files differ
diff --git a/lib/logback-core-1.1.2.jar b/lib/logback-core-1.1.2.jar
new file mode 100644
index 0000000..391da64
--- /dev/null
+++ b/lib/logback-core-1.1.2.jar
Binary files differ
diff --git a/lib/netty-3.6.6.Final.jar b/lib/netty-3.6.6.Final.jar
deleted file mode 100644
index 35cb073..0000000
--- a/lib/netty-3.6.6.Final.jar
+++ /dev/null
Binary files differ
diff --git a/lib/netty-all-4.0.23.Final.jar b/lib/netty-all-4.0.23.Final.jar
new file mode 100644
index 0000000..0555a16
--- /dev/null
+++ b/lib/netty-all-4.0.23.Final.jar
Binary files differ
diff --git a/lib/servlet-api-2.5-20081211.jar b/lib/servlet-api-2.5-20081211.jar
deleted file mode 100644
index b0537c4..0000000
--- a/lib/servlet-api-2.5-20081211.jar
+++ /dev/null
Binary files differ
diff --git a/lib/six-1.7.3-py2.py3-none-any.zip b/lib/six-1.7.3-py2.py3-none-any.zip
new file mode 100644
index 0000000..e077898
--- /dev/null
+++ b/lib/six-1.7.3-py2.py3-none-any.zip
Binary files differ
diff --git a/lib/slf4j-log4j12-1.7.2.jar b/lib/slf4j-log4j12-1.7.2.jar
deleted file mode 100644
index 37a85d7..0000000
--- a/lib/slf4j-log4j12-1.7.2.jar
+++ /dev/null
Binary files differ
diff --git a/lib/snappy-java-1.0.5.jar b/lib/snappy-java-1.0.5.2.jar
similarity index 91%
rename from lib/snappy-java-1.0.5.jar
rename to lib/snappy-java-1.0.5.2.jar
index 6dc413d..e0efe83 100644
--- a/lib/snappy-java-1.0.5.jar
+++ b/lib/snappy-java-1.0.5.2.jar
Binary files differ
diff --git a/lib/snaptree-0.1.jar b/lib/snaptree-0.1.jar
deleted file mode 100644
index a5d2c85..0000000
--- a/lib/snaptree-0.1.jar
+++ /dev/null
Binary files differ
diff --git a/lib/stream-2.5.2.jar b/lib/stream-2.5.2.jar
new file mode 100644
index 0000000..a099116
--- /dev/null
+++ b/lib/stream-2.5.2.jar
Binary files differ
diff --git a/lib/stringtemplate-4.0.2.jar b/lib/stringtemplate-4.0.2.jar
new file mode 100644
index 0000000..87d7faf
--- /dev/null
+++ b/lib/stringtemplate-4.0.2.jar
Binary files differ
diff --git a/lib/thrift-python-internal-only-0.9.1.zip b/lib/thrift-python-internal-only-0.9.1.zip
deleted file mode 100644
index 7949ea6..0000000
--- a/lib/thrift-python-internal-only-0.9.1.zip
+++ /dev/null
Binary files differ
diff --git a/pylib/cqlshlib/async_insert.py b/pylib/cqlshlib/async_insert.py
new file mode 100644
index 0000000..d325716
--- /dev/null
+++ b/pylib/cqlshlib/async_insert.py
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from threading import Event, Condition
+from . import meter
+import sys
+
+class _CountDownLatch(object):
+    def __init__(self, counter=1):
+        self._count = counter
+        self._lock = Condition()
+
+    def count_down(self):
+        with self._lock:
+            self._count -= 1
+            if self._count <= 0:
+                self._lock.notifyAll()
+
+    def await(self):
+        with self._lock:
+            while self._count > 0:
+                # use a timeout so that the main thread wakes up occasionally
+                # so it can see keyboard interrupts (CASSANDRA-7815)
+                self._lock.wait(0.5)
+
+
+class _ChainedWriter(object):
+
+    CONCURRENCY = 100
+
+    def __init__(self, session, enumerated_reader, statement_func):
+        self._sentinel = object()
+        self._session = session
+        self._cancellation_event = Event()
+        self._first_error = None
+        self._task_counter = _CountDownLatch(self.CONCURRENCY)
+        self._enumerated_reader = enumerated_reader
+        self._statement_func = statement_func
+        self._meter = meter.Meter()
+
+    def insert(self):
+        if not self._enumerated_reader:
+            return 0, None
+
+        for i in xrange(self.CONCURRENCY):
+            self._execute_next(self._sentinel, 0)
+
+        try:
+            self._task_counter.await()
+        except KeyboardInterrupt:
+            self._cancellation_event.set()
+            sys.stdout.write('Aborting due to keyboard interrupt\n')
+            self._task_counter.await()
+        self._meter.done()
+        return self._meter.num_finished(), self._first_error
+
+
+    def _abort(self, error, failed_record):
+        if not self._first_error:
+            self._first_error = error, failed_record
+        self._task_counter.count_down()
+        self._cancellation_event.set()
+
+    def _handle_error(self, error, failed_record):
+        self._abort(error, failed_record)
+
+    def _execute_next(self, result, last_completed_record):
+        if self._cancellation_event.is_set():
+            self._task_counter.count_down()
+            return
+
+        if result is not self._sentinel:
+            self._meter.mark_written()
+
+        try:
+            (current_record, row) = next(self._enumerated_reader)
+        except StopIteration:
+            self._task_counter.count_down()
+            return
+        except Exception as exc:
+            self._abort(exc, last_completed_record)
+            return
+
+        if self._cancellation_event.is_set():
+            self._task_counter.count_down()
+            return
+
+        try:
+            statement = self._statement_func(row)
+            future = self._session.execute_async(statement)
+            future.add_callbacks(callback=self._execute_next,
+                                 callback_args=(current_record,),
+                                 errback=self._handle_error,
+                                 errback_args=(current_record,))
+        except Exception as exc:
+            self._abort(exc, current_record)
+            return
+
+
+def insert_concurrent(session, enumerated_reader, statement_func):
+    return _ChainedWriter(session, enumerated_reader, statement_func).insert()
+
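For orientation, a minimal sketch (not part of the patch) of how the new insert_concurrent() helper might be driven; the session, the rows iterable and the target table ks.tbl are assumptions:

from cqlshlib.async_insert import insert_concurrent

def bulk_insert(session, rows):
    # insert_concurrent expects an iterator of (record_number, row) pairs
    # and a function that turns one row into an executable statement.
    enumerated_reader = enumerate(rows, start=1)

    def statement_func(row):
        # hypothetical two-column target table
        return "INSERT INTO ks.tbl (k, v) VALUES ('%s', '%s')" % (row[0], row[1])

    num_written, first_error = insert_concurrent(session, enumerated_reader, statement_func)
    if first_error is not None:
        error, failed_record = first_error
        print 'Import failed at record %d: %s' % (failed_record, error)
    return num_written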
diff --git a/pylib/cqlshlib/cql3handling.py b/pylib/cqlshlib/cql3handling.py
index 0b7863c..8d2fec5 100644
--- a/pylib/cqlshlib/cql3handling.py
+++ b/pylib/cqlshlib/cql3handling.py
@@ -14,27 +14,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
-from warnings import warn
 from .cqlhandling import CqlParsingRuleSet, Hint
-from cql.cqltypes import (cql_types, lookup_casstype, CompositeType, UTF8Type,
-                          ColumnToCollectionType, CounterColumnType, DateType)
-from . import helptopics
+from cassandra.metadata import maybe_escape_name
+from cassandra.metadata import escape_name
 
-simple_cql_types = set(cql_types)
+
+simple_cql_types = set(('ascii', 'bigint', 'blob', 'boolean', 'counter', 'decimal', 'double', 'float', 'inet', 'int',
+                        'text', 'timestamp', 'timeuuid', 'uuid', 'varchar', 'varint'))
 simple_cql_types.difference_update(('set', 'map', 'list'))
 
+from . import helptopics
 cqldocs = helptopics.CQL3HelpTopics()
 
-try:
-    import json
-except ImportError:
-    import simplejson as json
-
-# temporarily have this here until a newer cassandra-dbapi2 is bundled with C*
-class TimestampType(DateType):
-    pass
-
 class UnexpectedTableStructure(UserWarning):
     def __init__(self, msg):
         self.msg = msg
@@ -62,14 +53,12 @@
 
     columnfamily_layout_options = (
         ('bloom_filter_fp_chance', None),
-        ('caching', None),
         ('comment', None),
         ('dclocal_read_repair_chance', 'local_read_repair_chance'),
         ('gc_grace_seconds', None),
-        ('index_interval', None),
+        ('min_index_interval', None),
+        ('max_index_interval', None),
         ('read_repair_chance', None),
-        ('replicate_on_write', None),
-        ('populate_io_cache_on_flush', None),
         ('default_time_to_live', None),
         ('speculative_retry', None),
         ('memtable_flush_period_in_ms', None),
@@ -82,6 +71,8 @@
             ('class', 'max_threshold', 'tombstone_compaction_interval', 'tombstone_threshold', 'enabled', 'unchecked_tombstone_compaction')),
         ('compression', 'compression_parameters',
             ('sstable_compression', 'chunk_length_kb', 'crc_check_chance')),
+        ('caching', None,
+            ('rows_per_partition', 'keys')),
     )
 
     obsolete_cf_options = ()
@@ -98,6 +89,10 @@
         'SERIAL'
     )
 
+    maybe_escape_name = staticmethod(maybe_escape_name)
+
+    escape_name = staticmethod(escape_name)
+
     @classmethod
     def escape_value(cls, value):
         if value is None:
@@ -111,33 +106,14 @@
         return "'%s'" % value.replace("'", "''")
 
     @staticmethod
-    def escape_name(name):
-        return '"%s"' % name.replace('"', '""')
-
-    valid_cql3_word_re = re.compile(r'^[a-z][0-9a-z_]*$')
-
-    @classmethod
-    def is_valid_cql3_name(cls, s):
-        if s is None:
-            return False
-        if s.lower() in cls.keywords - cls.unreserved_keywords:
-            return False
-        return cls.valid_cql3_word_re.match(s) is not None
-
-    @classmethod
-    def maybe_escape_name(cls, name):
-        if cls.is_valid_cql3_name(name):
-            return name
-        return cls.escape_name(name)
-
-    @staticmethod
     def dequote_name(name):
         name = name.strip()
         if name == '':
             return name
         if name[0] == '"' and name[-1] == '"':
-            name = name[1:-1].replace('""', '"')
-        return name
+            return name[1:-1].replace('""', '"')
+        else:
+            return name.lower()
 
     @staticmethod
     def dequote_value(cqlword):
@@ -151,13 +127,12 @@
 CqlRuleSet = Cql3ParsingRuleSet()
 
 # convenience for remainder of module
-shorthands = ('completer_for', 'explain_completion',
-              'dequote_value', 'dequote_name',
-              'escape_value', 'escape_name',
-              'maybe_escape_name')
-
-for shorthand in shorthands:
-    globals()[shorthand] = getattr(CqlRuleSet, shorthand)
+completer_for = CqlRuleSet.completer_for
+explain_completion = CqlRuleSet.explain_completion
+dequote_value = CqlRuleSet.dequote_value
+dequote_name = CqlRuleSet.dequote_name
+escape_value = CqlRuleSet.escape_value
+maybe_escape_name = CqlRuleSet.maybe_escape_name
 
 
 # BEGIN SYNTAX/COMPLETION RULE DEFINITIONS
@@ -203,7 +178,9 @@
          | <uuid>
          | <boolean>
          | <blobLiteral>
+         | <collectionLiteral>
          | <functionName> <functionArguments>
+         | "NULL"
          ;
 
 <functionArguments> ::= "(" ( <term> ( "," <term> )* )? ")"
@@ -212,9 +189,6 @@
 <tokenDefinition> ::= token="TOKEN" "(" <term> ( "," <term> )* ")"
                     | <term>
                     ;
-<value> ::= <term>
-          | <collectionLiteral>
-          ;
 <cident> ::= <quotedName>
            | <identifier>
            | <unreservedKeyword>
@@ -253,11 +227,16 @@
 <schemaChangeStatement> ::= <createKeyspaceStatement>
                           | <createColumnFamilyStatement>
                           | <createIndexStatement>
+                          | <createUserTypeStatement>
+                          | <createTriggerStatement>
                           | <dropKeyspaceStatement>
                           | <dropColumnFamilyStatement>
                           | <dropIndexStatement>
+                          | <dropUserTypeStatement>
+                          | <dropTriggerStatement>
                           | <alterTableStatement>
                           | <alterKeyspaceStatement>
+                          | <alterUserTypeStatement>
                           ;
 
 <authenticationStatement> ::= <createUserStatement>
@@ -274,15 +253,19 @@
 # timestamp is included here, since it's also a keyword
 <simpleStorageType> ::= typename=( <identifier> | <stringLiteral> | <K_TIMESTAMP> ) ;
 
-<storageType> ::= <simpleStorageType> | <collectionType> ;
+<userType> ::= utname=<cfOrKsName> ;
 
-<collectionType> ::= "map" "<" <simpleStorageType> "," <simpleStorageType> ">"
-                   | "list" "<" <simpleStorageType> ">"
-                   | "set" "<" <simpleStorageType> ">"
+<storageType> ::= <simpleStorageType> | <collectionType> | <userType> ;
+
+<collectionType> ::= "map" "<" <simpleStorageType> "," ( <simpleStorageType> | <userType> ) ">"
+                   | "list" "<" ( <simpleStorageType> | <userType> ) ">"
+                   | "set" "<" ( <simpleStorageType> | <userType> ) ">"
                    ;
 
 <columnFamilyName> ::= ( ksname=<cfOrKsName> dot="." )? cfname=<cfOrKsName> ;
 
+<userTypeName> ::= ( ksname=<cfOrKsName> dot="." )? utname=<cfOrKsName> ;
+
 <keyspaceName> ::= ksname=<cfOrKsName> ;
 
 <nonSystemKeyspaceName> ::= ksname=<cfOrKsName> ;
@@ -408,7 +391,7 @@
     currentkey = dequote_value(ctxt.get_binding('propmapkey')[-1])
     if currentkey == 'class':
         return map(escape_value, CqlRuleSet.replication_strategies)
-    return [Hint('<value>')]
+    return [Hint('<term>')]
 
 def ks_prop_val_mapender_completer(ctxt, cass):
     optname = ctxt.get_binding('propname')[-1]
@@ -440,15 +423,15 @@
         return ["{'sstable_compression': '"]
     if this_opt == 'compaction':
         return ["{'class': '"]
+    if this_opt == 'caching':
+        return ["{'keys': '"]
     if any(this_opt == opt[0] for opt in CqlRuleSet.obsolete_cf_options):
         return ["'<obsolete_option>'"]
     if this_opt in ('read_repair_chance', 'bloom_filter_fp_chance',
                     'dclocal_read_repair_chance'):
         return [Hint('<float_between_0_and_1>')]
-    if this_opt in ('replicate_on_write', 'populate_io_cache_on_flush'):
-        return ["'yes'", "'no'"]
     if this_opt in ('min_compaction_threshold', 'max_compaction_threshold',
-                    'gc_grace_seconds', 'index_interval'):
+                    'gc_grace_seconds', 'min_index_interval', 'max_index_interval'):
         return [Hint('<integer>')]
     return [Hint('<option_value>')]
 
@@ -464,6 +447,8 @@
     pairsseen = dict(zip(keysseen, valsseen))
     if optname == 'compression':
         return map(escape_value, set(subopts).difference(keysseen))
+    if optname == 'caching':
+        return map(escape_value, set(subopts).difference(keysseen))
     if optname == 'compaction':
         opts = set(subopts)
         try:
@@ -497,6 +482,11 @@
         if key == 'sstable_compression':
             return map(escape_value, CqlRuleSet.available_compression_classes)
         return [Hint('<option_value>')]
+    elif opt == 'caching':
+        if key == 'rows_per_partition':
+            return ["'ALL'", "'NONE'", Hint('#rows_per_partition')]
+        elif key == 'keys':
+            return ["'ALL'", "'NONE'"]
     return ()
 
 def cf_prop_val_mapender_completer(ctxt, cass):
@@ -524,17 +514,19 @@
     ksnames = [n for n in cass.get_keyspace_names() if n not in NONALTERBALE_KEYSPACES]
     return map(maybe_escape_name, ksnames)
 
-@completer_for('columnFamilyName', 'ksname')
 def cf_ks_name_completer(ctxt, cass):
     return [maybe_escape_name(ks) + '.' for ks in cass.get_keyspace_names()]
 
-@completer_for('columnFamilyName', 'dot')
+completer_for('columnFamilyName', 'ksname')(cf_ks_name_completer)
+
 def cf_ks_dot_completer(ctxt, cass):
     name = dequote_name(ctxt.get_binding('ksname'))
     if name in cass.get_keyspace_names():
         return ['.']
     return []
 
+completer_for('columnFamilyName', 'dot')(cf_ks_dot_completer)
+
 @completer_for('columnFamilyName', 'cfname')
 def cf_name_completer(ctxt, cass):
     ks = ctxt.get_binding('ksname', None)
@@ -548,6 +540,26 @@
         raise
     return map(maybe_escape_name, cfnames)
 
+completer_for('userTypeName', 'ksname')(cf_ks_name_completer)
+
+completer_for('userTypeName', 'dot')(cf_ks_dot_completer)
+
+def ut_name_completer(ctxt, cass):
+    ks = ctxt.get_binding('ksname', None)
+    if ks is not None:
+        ks = dequote_name(ks)
+    try:
+        utnames = cass.get_usertype_names(ks)
+    except Exception:
+        if ks is None:
+            return ()
+        raise
+    return map(maybe_escape_name, utnames)
+
+
+completer_for('userTypeName', 'utname')(ut_name_completer)
+completer_for('userType', 'utname')(ut_name_completer)
+
 @completer_for('unreservedKeyword', 'nocomplete')
 def unreserved_keyword_completer(ctxt, cass):
     # we never want to provide completions through this production;
@@ -555,12 +567,19 @@
     # names, CF names, property values, etc.
     return ()
 
-def get_cf_layout(ctxt, cass):
+def get_table_meta(ctxt, cass):
     ks = ctxt.get_binding('ksname', None)
     if ks is not None:
         ks = dequote_name(ks)
     cf = dequote_name(ctxt.get_binding('cfname'))
-    return cass.get_columnfamily_layout(ks, cf)
+    return cass.get_table_meta(ks, cf)
+
+def get_ut_layout(ctxt, cass):
+    ks = ctxt.get_binding('ksname', None)
+    if ks is not None:
+        ks = dequote_name(ks)
+    ut = dequote_name(ctxt.get_binding('utname'))
+    return cass.get_usertype_layout(ks, ut)
 
 def working_on_keyspace(ctxt):
     wat = ctxt.get_binding('wat').upper()
@@ -576,20 +595,24 @@
                           ( "WHERE" <whereClause> )?
                           ( "ORDER" "BY" <orderByClause> ( "," <orderByClause> )* )?
                           ( "LIMIT" limit=<wholenumber> )?
+                          ( "ALLOW" "FILTERING" )?
                     ;
 <whereClause> ::= <relation> ( "AND" <relation> )*
                 ;
-<relation> ::= [rel_lhs]=<cident> ( "=" | "<" | ">" | "<=" | ">=" ) <term>
+<relation> ::= [rel_lhs]=<cident> ( "=" | "<" | ">" | "<=" | ">=" | "CONTAINS" ) <term>
              | token="TOKEN" "(" [rel_tokname]=<cident>
                                  ( "," [rel_tokname]=<cident> )*
-                             ")" ("=" | "<" | ">" | "<=" | ">=") <tokenDefinition>
+                             ")" ("=" | "<" | ">" | "<=" | ">=" | "CONTAINS") <tokenDefinition>
              | [rel_lhs]=<cident> "IN" "(" <term> ( "," <term> )* ")"
              ;
 <selectClause> ::= "DISTINCT"? <selector> ("AS" <cident>)? ("," <selector> ("AS" <cident>)?)*
                  | "*"
                  | "COUNT" "(" star=( "*" | "1" ) ")" ("AS" <cident>)?
                  ;
+<udtSubfieldSelection> ::= <identifier> "." <identifier>
+                         ;
 <selector> ::= [colname]=<cident>
+             | <udtSubfieldSelection>
              | "WRITETIME" "(" [colname]=<cident> ")"
              | "TTL" "(" [colname]=<cident> ")"
              | <functionName> <selectionFunctionArguments>
@@ -608,8 +631,8 @@
         keyname = ctxt.get_binding('rel_lhs', ())
         if not keyname:
             return [Hint("Can't ORDER BY here: need to specify partition key in WHERE clause")]
-    layout = get_cf_layout(ctxt, cass)
-    order_by_candidates = layout.clustering_key_columns[:]
+    layout = get_table_meta(ctxt, cass)
+    order_by_candidates = [col.name for col in layout.clustering_key]
     if len(order_by_candidates) > len(prev_order_cols):
         return [maybe_escape_name(order_by_candidates[len(prev_order_cols)])]
     return [Hint('No more orderable columns here.')]
@@ -620,26 +643,26 @@
 
 @completer_for('relation', 'rel_tokname')
 def relation_token_subject_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    return [layout.partition_key_columns[0]]
+    layout = get_table_meta(ctxt, cass)
+    return [key.name for key in layout.partition_key]
 
 @completer_for('relation', 'rel_lhs')
 def select_relation_lhs_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    filterable = set((layout.partition_key_columns[0], layout.clustering_key_columns[0]))
-    already_filtered_on = map(dequote_name, ctxt.get_binding('rel_lhs'))
-    for num in range(1, len(layout.partition_key_columns)):
-        if layout.partition_key_columns[num - 1] in already_filtered_on:
-            filterable.add(layout.partition_key_columns[num])
+    layout = get_table_meta(ctxt, cass)
+    filterable = set((layout.partition_key[0].name, layout.clustering_key[0].name))
+    already_filtered_on = map(dequote_name, ctxt.get_binding('rel_lhs', ()))
+    for num in range(1, len(layout.partition_key)):
+        if layout.partition_key[num - 1].name in already_filtered_on:
+            filterable.add(layout.partition_key[num].name)
         else:
             break
-    for num in range(1, len(layout.clustering_key_columns)):
-        if layout.clustering_key_columns[num - 1] in already_filtered_on:
-            filterable.add(layout.clustering_key_columns[num])
+    for num in range(1, len(layout.clustering_key)):
+        if layout.clustering_key[num - 1].name in already_filtered_on:
+            filterable.add(layout.clustering_key[num].name)
         else:
             break
-    for cd in layout.columns:
-        if cd.index_name is not None:
+    for cd in layout.columns.values():
+        if cd.index:
             filterable.add(cd.name)
     return map(maybe_escape_name, filterable)
 
@@ -653,8 +676,9 @@
 <insertStatement> ::= "INSERT" "INTO" cf=<columnFamilyName>
                                "(" [colname]=<cident> "," [colname]=<cident>
                                    ( "," [colname]=<cident> )* ")"
-                      "VALUES" "(" [newval]=<value> valcomma="," [newval]=<value>
-                                   ( valcomma="," [newval]=<value> )* valcomma=")"
+                      "VALUES" "(" [newval]=<term> valcomma="," [newval]=<term>
+                                   ( valcomma="," [newval]=<term> )* valcomma=")"
+                      ( "IF" "NOT" "EXISTS")?
                       ( "USING" [insertopt]=<usingOption>
                                 ( "AND" [insertopt]=<usingOption> )* )?
                     ;
@@ -663,26 +687,34 @@
                 ;
 '''
 
+def regular_column_names(table_meta):
+    if not table_meta or not table_meta.columns:
+        return []
+    regular_columns = list(set(table_meta.columns.keys())
+                           - set([key.name for key in table_meta.partition_key])
+                           - set([key.name for key in table_meta.clustering_key]))
+    return regular_columns
+
 @completer_for('insertStatement', 'colname')
 def insert_colname_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     colnames = set(map(dequote_name, ctxt.get_binding('colname', ())))
-    keycols = layout.primary_key_columns
+    keycols = layout.primary_key
     for k in keycols:
-        if k not in colnames:
-            return [maybe_escape_name(k)]
-    normalcols = set(layout.regular_columns) - colnames
+        if k.name not in colnames:
+            return [maybe_escape_name(k.name)]
+    normalcols = set(regular_column_names(layout)) - colnames
     return map(maybe_escape_name, normalcols)
 
 @completer_for('insertStatement', 'newval')
 def insert_newval_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     insertcols = map(dequote_name, ctxt.get_binding('colname'))
     valuesdone = ctxt.get_binding('newval', ())
     if len(valuesdone) >= len(insertcols):
         return []
     curcol = insertcols[len(valuesdone)]
-    cqltype = layout.get_column(curcol).cqltype
+    cqltype = layout.columns[curcol].data_type
     coltype = cqltype.typename
     if coltype in ('map', 'set'):
         return ['{']
@@ -695,7 +727,7 @@
 
 @completer_for('insertStatement', 'valcomma')
 def insert_valcomma_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     numcols = len(ctxt.get_binding('colname', ()))
     numvals = len(ctxt.get_binding('newval', ()))
     if numcols > numvals:
@@ -715,13 +747,19 @@
                                   ( "AND" [updateopt]=<usingOption> )* )?
                         "SET" <assignment> ( "," <assignment> )*
                         "WHERE" <whereClause>
+                        ( "IF" <conditions> )?
                     ;
 <assignment> ::= updatecol=<cident>
-                    ( "=" update_rhs=( <value> | <cident> )
+                    ( "=" update_rhs=( <term> | <cident> )
                                 ( counterop=( "+" | "-" ) inc=<wholenumber>
-                                | listadder="+" listcol=<cident> )
+                                | listadder="+" listcol=<cident> )?
                     | indexbracket="[" <term> "]" "=" <term> )
                ;
+<conditions> ::=  <condition> ( "AND" <condition> )*
+               ;
+<condition> ::= <cident> ( "[" <term> "]" )? ( ( "=" | "<" | ">" | "<=" | ">=" | "!=" ) <term>
+                                             | "IN" "(" <term> ( "," <term> )* ")")
+              ;
 '''
 
 @completer_for('updateStatement', 'updateopt')
@@ -733,14 +771,14 @@
 
 @completer_for('assignment', 'updatecol')
 def update_col_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    return map(maybe_escape_name, layout.regular_columns)
+    layout = get_table_meta(ctxt, cass)
+    return map(maybe_escape_name, regular_column_names(layout))
 
 @completer_for('assignment', 'update_rhs')
 def update_countername_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     curcol = dequote_name(ctxt.get_binding('updatecol', ''))
-    cqltype = layout.get_column(curcol).cqltype
+    cqltype = layout.columns[curcol].data_type
     coltype = cqltype.typename
     if coltype == 'counter':
         return [maybe_escape_name(curcol)]
@@ -752,15 +790,15 @@
 
 @completer_for('assignment', 'counterop')
 def update_counterop_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     curcol = dequote_name(ctxt.get_binding('updatecol', ''))
-    return ['+', '-'] if layout.is_counter_col(curcol) else []
+    return ['+', '-'] if layout.columns[curcol].data_type.typename == 'counter' else []
 
 @completer_for('assignment', 'inc')
 def update_counter_inc_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     curcol = dequote_name(ctxt.get_binding('updatecol', ''))
-    if layout.is_counter_col(curcol):
+    if layout.columns[curcol].data_type.typename == 'counter':
         return [Hint('<wholenumber>')]
     return []
 
@@ -781,9 +819,9 @@
 
 @completer_for('assignment', 'indexbracket')
 def update_indexbracket_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
+    layout = get_table_meta(ctxt, cass)
     curcol = dequote_name(ctxt.get_binding('updatecol', ''))
-    coltype = layout.get_column(curcol).cqltype.typename
+    coltype = layout.columns[curcol].data_type.typename
     if coltype in ('map', 'list'):
         return ['[']
     return []
@@ -793,6 +831,7 @@
                         "FROM" cf=<columnFamilyName>
                         ( "USING" [delopt]=<deleteOption> )?
                         "WHERE" <whereClause>
+                        ( "IF" ( "EXISTS" | <conditions> ) )?
                     ;
 <deleteSelector> ::= delcol=<cident> ( memberbracket="[" memberselector=<term> "]" )?
                    ;
@@ -809,8 +848,8 @@
 
 @completer_for('deleteSelector', 'delcol')
 def delete_delcol_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    return map(maybe_escape_name, layout.regular_columns)
+    layout = get_table_meta(ctxt, cass)
+    return map(maybe_escape_name, regular_column_names(layout))
 
 syntax_rules += r'''
 <batchStatement> ::= "BEGIN" ( "UNLOGGED" | "COUNTER" )? "BATCH"
@@ -947,17 +986,24 @@
 
 syntax_rules += r'''
 <createIndexStatement> ::= "CREATE" "CUSTOM"? "INDEX" ("IF" "NOT" "EXISTS")? indexname=<identifier>? "ON"
-                               cf=<columnFamilyName> "(" col=<cident> ")"
+                               cf=<columnFamilyName> ( "(" col=<cident> ")" | "(" "KEYS"  "(" col=<cident> ")" ")")
                                ( "USING" <stringLiteral> ( "WITH" "OPTIONS" "=" <mapLiteral> )? )?
                          ;
+
+<createUserTypeStatement> ::= "CREATE" "TYPE" ( ks=<nonSystemKeyspaceName> dot="." )? typename=<cfOrKsName> "(" newcol=<cident> <storageType>
+                                ( "," [newcolname]=<cident> <storageType> )*
+                            ")"
+                         ;
 '''
 
 explain_completion('createIndexStatement', 'indexname', '<new_index_name>')
+explain_completion('createUserTypeStatement', 'typename', '<new_type_name>')
+explain_completion('createUserTypeStatement', 'newcol', '<new_field_name>')
 
 @completer_for('createIndexStatement', 'col')
 def create_index_col_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    colnames = [cd.name for cd in layout.columns if cd.index_name is None]
+    layout = get_table_meta(ctxt, cass)
+    colnames = [cd.name for cd in layout.columns.values() if not cd.index]
     return map(maybe_escape_name, colnames)
 
 syntax_rules += r'''
@@ -967,13 +1013,43 @@
 <dropColumnFamilyStatement> ::= "DROP" ( "COLUMNFAMILY" | "TABLE" ) ("IF" "EXISTS")? cf=<columnFamilyName>
                               ;
 
-<dropIndexStatement> ::= "DROP" "INDEX" ("IF" "EXISTS")? indexname=<identifier>
+<indexName> ::= ( ksname=<idxOrKsName> dot="." )? idxname=<idxOrKsName> ;
+
+<idxOrKsName> ::= <identifier>
+               | <quotedName>
+               | <unreservedKeyword>;
+
+<dropIndexStatement> ::= "DROP" "INDEX" ("IF" "EXISTS")? idx=<indexName>
                        ;
+
+<dropUserTypeStatement> ::= "DROP" "TYPE" ut=<userTypeName>
+                              ;
+
 '''
 
-@completer_for('dropIndexStatement', 'indexname')
-def drop_index_completer(ctxt, cass):
-    return map(maybe_escape_name, cass.get_index_names())
+@completer_for('indexName', 'ksname')
+def idx_ks_name_completer(ctxt, cass):
+    return [maybe_escape_name(ks) + '.' for ks in cass.get_keyspace_names()]
+
+@completer_for('indexName', 'dot')
+def idx_ks_dot_completer(ctxt, cass):
+    name = dequote_name(ctxt.get_binding('ksname'))
+    if name in cass.get_keyspace_names():
+        return ['.']
+    return []
+
+@completer_for('indexName', 'idxname')
+def idx_ks_idx_name_completer(ctxt, cass):
+    ks = ctxt.get_binding('ksname', None)
+    if ks is not None:
+        ks = dequote_name(ks)
+    try:
+        idxnames = cass.get_index_names(ks)
+    except Exception:
+        if ks is None:
+            return ()
+        raise
+    return map(maybe_escape_name, idxnames)
 
 syntax_rules += r'''
 <alterTableStatement> ::= "ALTER" wat=( "COLUMNFAMILY" | "TABLE" ) cf=<columnFamilyName>
@@ -986,15 +1062,32 @@
                       | "RENAME" existcol=<cident> "TO" newcol=<cident>
                          ( "AND" existcol=<cident> "TO" newcol=<cident> )*
                       ;
+
+<alterUserTypeStatement> ::= "ALTER" "TYPE" ut=<userTypeName>
+                               <alterTypeInstructions>
+                             ;
+<alterTypeInstructions> ::= "ALTER" existcol=<cident> "TYPE" <storageType>
+                           | "ADD" newcol=<cident> <storageType>
+                           | "RENAME" existcol=<cident> "TO" newcol=<cident>
+                              ( "AND" existcol=<cident> "TO" newcol=<cident> )*
+                           ;
 '''
 
 @completer_for('alterInstructions', 'existcol')
 def alter_table_col_completer(ctxt, cass):
-    layout = get_cf_layout(ctxt, cass)
-    cols = [md.name for md in layout.columns]
+    layout = get_table_meta(ctxt, cass)
+    cols = [str(md) for md in layout.columns]
     return map(maybe_escape_name, cols)
 
+@completer_for('alterTypeInstructions', 'existcol')
+def alter_type_field_completer(ctxt, cass):
+    layout = get_ut_layout(ctxt, cass)
+    fields = [field[0] for field in layout]
+    return map(maybe_escape_name, fields)
+
 explain_completion('alterInstructions', 'newcol', '<new_column_name>')
+explain_completion('alterTypeInstructions', 'newcol', '<new_field_name>')
+
 
 syntax_rules += r'''
 <alterKeyspaceStatement> ::= "ALTER" wat=( "KEYSPACE" | "SCHEMA" ) ks=<alterableKeyspaceName>
@@ -1066,130 +1159,31 @@
     if ctxt.matched[0][0] == 'K_CREATE':
         return [Hint('<username>')]
 
-    cursor = cass.conn.cursor()
-    cursor.execute("LIST USERS")
-    return [maybe_quote(row[0].replace("'", "''")) for row in cursor.fetchall()]
+    session = cass.session
+    return [maybe_quote(row.values()[0].replace("'", "''")) for row in session.execute("LIST USERS")]
+
+syntax_rules += r'''
+<createTriggerStatement> ::= "CREATE" "TRIGGER" ( "IF" "NOT" "EXISTS" )? <cident>
+                               "ON" cf=<columnFamilyName> "USING" class=<stringLiteral>
+                           ;
+
+<dropTriggerStatement> ::= "DROP" "TRIGGER" ( "IF" "EXISTS" )? triggername=<cident>
+                             "ON" cf=<columnFamilyName>
+                         ;
+'''
+explain_completion('createTriggerStatement', 'class', '\'fully qualified class name\'')
+
+def get_trigger_names(ctxt, cass):
+    ks = ctxt.get_binding('ksname', None)
+    if ks is not None:
+        ks = dequote_name(ks)
+    return cass.get_trigger_names(ks)
+
+@completer_for('dropTriggerStatement', 'triggername')
+def drop_trigger_completer(ctxt, cass):
+    names = get_trigger_names(ctxt, cass)
+    return map(maybe_escape_name, names)
 
 # END SYNTAX/COMPLETION RULE DEFINITIONS
 
 CqlRuleSet.append_rules(syntax_rules)
-
-class CqlColumnDef:
-    index_name = None
-    index_type = None
-    component_type = 'regular'
-    component_index = None
-    index_options = {}
-
-    def __init__(self, name, cqltype):
-        self.name = name
-        self.cqltype = cqltype
-        assert name is not None
-
-    @classmethod
-    def from_layout(cls, layout):
-        c = cls(layout[u'column_name'], lookup_casstype(layout[u'validator']))
-        c.component_type = layout[u'type']
-        idx = layout[u'component_index'] # can be None
-        if idx:
-            c.component_index = int(idx)
-        c.index_name = layout[u'index_name']
-        c.index_type = layout[u'index_type']
-        if c.index_type == 'CUSTOM':
-            c.index_options = json.loads(layout[u'index_options'])
-        return c
-
-    def is_static(self):
-        return self.component_type == 'static'
-
-    def __str__(self):
-        indexstr = ' (index %s)' % self.index_name if self.index_name is not None else ''
-        return '<CqlColumnDef %r %r%s>' % (self.name, self.cqltype, indexstr)
-    __repr__ = __str__
-
-class CqlTableDef:
-    """Names of all columns which are grouped into the partition key"""
-    partition_key_columns = ()
-
-    """Names of all columns which are part of the primary key, but not grouped
-       into the partition key"""
-    clustering_key_columns = ()
-
-    """Names of all columns which are part of the primary key, whether or not
-       they are grouped into the partition key"""
-    primary_key_columns = ()
-
-    """Names of all columns which aren't part of the primary key"""
-    regular_columns = ()
-
-    """CqlColumnDef objects for all columns. Use .get_column() to access one
-       by name."""
-    columns = ()
-
-    def __init__(self, name):
-        self.name = name
-
-    @classmethod
-    def from_layout(cls, layout, coldefs):
-        """
-        This constructor accepts a dictionary of column-value pairs from a row
-        of system.schema_columnfamilies, and a sequence of similar dictionaries
-        from corresponding rows in system.schema_columns.
-        """
-        cf = cls(name=layout[u'columnfamily_name'])
-        cf.keyspace = layout[u'keyspace_name']
-        for attr, val in layout.items():
-            setattr(cf, attr.encode('ascii'), val)
-        cf.comparator = lookup_casstype(cf.comparator)
-        for attr in ('compaction_strategy_options', 'compression_parameters'):
-            setattr(cf, attr, json.loads(getattr(cf, attr)))
-
-        # deal with columns, filter out empty column names (see CASSANDRA-6139)
-        columns = filter(lambda c: c.name, map(CqlColumnDef.from_layout, coldefs))
-
-        partition_key_cols = filter(lambda c: c.component_type == u'partition_key', columns)
-        partition_key_cols.sort(key=lambda c: c.component_index)
-        cf.partition_key_columns = map(lambda c: c.name, partition_key_cols)
-
-        clustering_key_cols = filter(lambda c: c.component_type == u'clustering_key', columns)
-        clustering_key_cols.sort(key=lambda c: c.component_index)
-        cf.clustering_key_columns = map(lambda c: c.name, clustering_key_cols)
-
-        cf.primary_key_columns = cf.partition_key_columns + cf.clustering_key_columns
-
-        regular_cols = list(set(columns) - set(partition_key_cols) - set(clustering_key_cols))
-        regular_cols.sort(key=lambda c: c.name)
-        cf.regular_columns = map(lambda c: c.name, regular_cols)
-
-        cf.columns = partition_key_cols + clustering_key_cols + regular_cols
-        return cf
-
-    # not perfect, but good enough; please read CFDefinition constructor comments
-    # returns False if we are dealing with a CQL3 table, True otherwise.
-    # 'compact' here means 'needs WITH COMPACT STORAGE option for CREATE TABLE in CQL3'.
-    def is_compact_storage(self):
-        if not issubclass(self.comparator, CompositeType):
-            return True
-        for subtype in self.comparator.subtypes:
-            if issubclass(subtype, ColumnToCollectionType):
-                return False
-        if len(self.clustering_key_columns) == len(self.comparator.subtypes) - 1:
-            if self.comparator.subtypes[-1] is UTF8Type:
-                return False
-        return True
-
-    def is_counter_col(self, colname):
-        try:
-            return bool(self.get_column(colname).cqltype is CounterColumnType)
-        except KeyError:
-            return False
-
-    def get_column(self, colname):
-        col_info = [cm for cm in self.columns if cm.name == colname]
-        if not col_info:
-            raise KeyError("column %r not found" % (colname,))
-        return col_info[0]
-
-    def __str__(self):
-        return '<%s %s.%s>' % (self.__class__.__name__, self.keyspace, self.name)
-    __repr__ = __str__
diff --git a/pylib/cqlshlib/cqlhandling.py b/pylib/cqlshlib/cqlhandling.py
index 0d54630..6e61ac1 100644
--- a/pylib/cqlshlib/cqlhandling.py
+++ b/pylib/cqlshlib/cqlhandling.py
@@ -17,10 +17,8 @@
 # code for dealing with CQL's syntax, rules, interpretation
 # i.e., stuff that's not necessarily cqlsh-specific
 
-import re
 import traceback
 from . import pylexotron, util
-from cql import cqltypes
 
 Hint = pylexotron.Hint
 
diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py
index 6b3ff93..845ffac 100644
--- a/pylib/cqlshlib/formatting.py
+++ b/pylib/cqlshlib/formatting.py
@@ -16,12 +16,12 @@
 
 import re
 import time
-import binascii
+import calendar
 import math
 from collections import defaultdict
 from . import wcwidth
 from .displaying import colorme, FormattedValue, DEFAULT_VALUE_COLORS
-from cql import cqltypes
+from cassandra.cqltypes import EMPTY
 
 unicode_controlchars_re = re.compile(r'[\x00-\x31\x7f-\xa0]')
 controlchars_re = re.compile(r'[\x00-\x31\x7f-\xff]')
@@ -96,11 +96,11 @@
 # making format_value a generic function
 _formatters = {}
 
-def format_value(cqltype, val, **kwargs):
-    if val == '' and not cqltype.empty_binary_ok:
-        return format_value_default(val, **kwargs)
-    formatter = _formatters.get(cqltype.typename, format_value_default)
-    return formatter(val, subtypes=cqltype.subtypes, **kwargs)
+def format_value(type, val, **kwargs):
+    if val == EMPTY:
+        return format_value_default('', **kwargs)
+    formatter = _formatters.get(type.__name__, format_value_default)
+    return formatter(val, **kwargs)
 
 def formatter_for(typname):
     def registrator(f):
@@ -108,10 +108,12 @@
         return f
     return registrator
 
-@formatter_for('blob')
+@formatter_for('bytearray')
 def format_value_blob(val, colormap, **_):
-    bval = '0x' + ''.join('%02x' % ord(c) for c in val)
+    bval = '0x' + ''.join('%02x' % c for c in val)
     return colorme(bval, colormap, 'blob')
+formatter_for('buffer')(format_value_blob)
+
 
 def format_python_formatted_type(val, colormap, color, quote=False):
     bval = str(val)
@@ -119,21 +121,20 @@
         bval = "'%s'" % bval
     return colorme(bval, colormap, color)
 
-@formatter_for('decimal')
+@formatter_for('Decimal')
 def format_value_decimal(val, colormap, **_):
     return format_python_formatted_type(val, colormap, 'decimal')
 
-@formatter_for('uuid')
+@formatter_for('UUID')
 def format_value_uuid(val, colormap, **_):
     return format_python_formatted_type(val, colormap, 'uuid')
 
-formatter_for('timeuuid')(format_value_uuid)
 
 @formatter_for('inet')
 def formatter_value_inet(val, colormap, quote=False, **_):
     return format_python_formatted_type(val, colormap, 'inet', quote=quote)
 
-@formatter_for('boolean')
+@formatter_for('bool')
 def format_value_boolean(val, colormap, **_):
     return format_python_formatted_type(val, colormap, 'boolean')
 
@@ -147,25 +148,24 @@
     return colorme(bval, colormap, 'float')
 
 formatter_for('float')(format_floating_point_type)
-formatter_for('double')(format_floating_point_type)
 
 def format_integer_type(val, colormap, **_):
     # base-10 only for now; support others?
     bval = str(val)
     return colorme(bval, colormap, 'int')
 
-formatter_for('bigint')(format_integer_type)
+formatter_for('long')(format_integer_type)
 formatter_for('int')(format_integer_type)
-formatter_for('varint')(format_integer_type)
-formatter_for('counter')(format_integer_type)
 
-@formatter_for('timestamp')
+@formatter_for('date')
 def format_value_timestamp(val, colormap, time_format, quote=False, **_):
-    bval = strftime(time_format, val)
+    bval = strftime(time_format, calendar.timegm(val.utctimetuple()))
     if quote:
         bval = "'%s'" % bval
     return colorme(bval, colormap, 'timestamp')
 
+formatter_for('datetime')(format_value_timestamp)
+
 def strftime(time_format, seconds):
     local = time.localtime(seconds)
     formatted = time.strftime(time_format, local)
@@ -183,7 +183,7 @@
     hours, minutes = divmod(abs(offset) / 60, 60)
     return formatted[:-5] + sign + '{0:0=2}{1:0=2}'.format(hours, minutes)
 
-@formatter_for('text')
+@formatter_for('str')
 def format_value_text(val, encoding, colormap, quote=False, **_):
     escapedval = val.replace(u'\\', u'\\\\')
     if quote:
@@ -196,12 +196,11 @@
     return color_text(bval, colormap, displaywidth)
 
 # name alias
-formatter_for('varchar')(format_value_text)
-formatter_for('ascii')(format_value_text)
+formatter_for('unicode')(format_value_text)
 
-def format_simple_collection(subtype, val, lbracket, rbracket, encoding,
+def format_simple_collection(val, lbracket, rbracket, encoding,
                              colormap, time_format, float_precision, nullval):
-    subs = [format_value(subtype, sval, encoding=encoding, colormap=colormap,
+    subs = [format_value(type(sval), sval, encoding=encoding, colormap=colormap,
                          time_format=time_format, float_precision=float_precision,
                          nullval=nullval, quote=True)
             for sval in val]
@@ -213,24 +212,31 @@
     return FormattedValue(bval, coloredval, displaywidth)
 
 @formatter_for('list')
-def format_value_list(val, encoding, colormap, time_format, float_precision, subtypes, nullval, **_):
-    return format_simple_collection(subtypes[0], val, '[', ']', encoding, colormap,
+def format_value_list(val, encoding, colormap, time_format, float_precision, nullval, **_):
+    return format_simple_collection(val, '[', ']', encoding, colormap,
+                                    time_format, float_precision, nullval)
+
+@formatter_for('tuple')
+def format_value_tuple(val, encoding, colormap, time_format, float_precision, nullval, **_):
+    return format_simple_collection(val, '(', ')', encoding, colormap,
                                     time_format, float_precision, nullval)
 
 @formatter_for('set')
-def format_value_set(val, encoding, colormap, time_format, float_precision, subtypes, nullval, **_):
-    return format_simple_collection(subtypes[0], sorted(val), '{', '}', encoding, colormap,
+def format_value_set(val, encoding, colormap, time_format, float_precision, nullval, **_):
+    return format_simple_collection(sorted(val), '{', '}', encoding, colormap,
                                     time_format, float_precision, nullval)
+formatter_for('frozenset')(format_value_set)
+formatter_for('sortedset')(format_value_set)
 
-@formatter_for('map')
-def format_value_map(val, encoding, colormap, time_format, float_precision, subtypes, nullval, **_):
-    def subformat(v, subtype):
-        return format_value(subtype, v, encoding=encoding, colormap=colormap,
+
+@formatter_for('dict')
+def format_value_map(val, encoding, colormap, time_format, float_precision, nullval, **_):
+    def subformat(v):
+        return format_value(type(v), v, encoding=encoding, colormap=colormap,
                             time_format=time_format, float_precision=float_precision,
                             nullval=nullval, quote=True)
 
-    subkeytype, subvaltype = subtypes
-    subs = [(subformat(k, subkeytype), subformat(v, subvaltype)) for (k, v) in sorted(val.items())]
+    subs = [(subformat(k), subformat(v)) for (k, v) in sorted(val.items())]
     bval = '{' + ', '.join(k.strval + ': ' + v.strval for (k, v) in subs) + '}'
     lb, comma, colon, rb = [colormap['collection'] + s + colormap['reset']
                             for s in ('{', ', ', ': ', '}')]
@@ -239,3 +245,26 @@
                + rb
     displaywidth = 4 * len(subs) + sum(k.displaywidth + v.displaywidth for (k, v) in subs)
     return FormattedValue(bval, coloredval, displaywidth)
+formatter_for('OrderedDict')(format_value_map)
+
+
+def format_value_utype(val, encoding, colormap, time_format, float_precision, nullval, **_):
+    def format_field_value(v):
+        if v is None:
+            return colorme(nullval, colormap, 'error')
+        return format_value(type(v), v, encoding=encoding, colormap=colormap,
+                            time_format=time_format, float_precision=float_precision,
+                            nullval=nullval, quote=True)
+
+    def format_field_name(name):
+        return format_value_text(name, encoding=encoding, colormap=colormap, quote=False)
+
+    subs = [(format_field_name(k), format_field_value(v)) for (k, v) in val._asdict().items()]
+    bval = '{' + ', '.join(k.strval + ': ' + v.strval for (k, v) in subs) + '}'
+    lb, comma, colon, rb = [colormap['collection'] + s + colormap['reset']
+                            for s in ('{', ', ', ': ', '}')]
+    coloredval = lb \
+                 + comma.join(k.coloredval + colon + v.coloredval for (k, v) in subs) \
+                 + rb
+    displaywidth = 4 * len(subs) + sum(k.displaywidth + v.displaywidth for (k, v) in subs)
+    return FormattedValue(bval, coloredval, displaywidth)
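Note: format_value now dispatches on the Python type of the already-deserialized value (via type(val).__name__) rather than on the CQL type, so handlers are registered by Python type name. A small sketch of that registration pattern; the 'complex' type is an arbitrary illustration, not something cqlsh needs:

from cqlshlib.formatting import formatter_for
from cqlshlib.displaying import colorme

@formatter_for('complex')  # keyed by the Python type's __name__
def format_value_complex(val, colormap, **_):
    # reuse the float colour for this example; unregistered types still fall
    # back to format_value_default inside format_value()
    return colorme(str(val), colormap, 'float')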
diff --git a/pylib/cqlshlib/helptopics.py b/pylib/cqlshlib/helptopics.py
index 710aa74..34584ff 100644
--- a/pylib/cqlshlib/helptopics.py
+++ b/pylib/cqlshlib/helptopics.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cql.cqltypes import cql_types
+from .cql3handling import simple_cql_types
 
 class CQLHelpTopics(object):
     def get_help_topics(self):
@@ -25,7 +25,7 @@
 
     def help_types(self):
         print "\n        CQL types recognized by this version of cqlsh:\n"
-        for t in cql_types:
+        for t in simple_cql_types:
             print '          ' + t
         print """
         For information on the various recognizable input formats for these
@@ -388,7 +388,7 @@
 
         Counter columns can be incremented or decremented by an arbitrary
         numeric value though the assignment of an expression that adds or
-        substracts the value.
+        subtracts the value.
         """
 
     def help_update_where(self):
diff --git a/pylib/cqlshlib/meter.py b/pylib/cqlshlib/meter.py
new file mode 100644
index 0000000..e1a6bfc
--- /dev/null
+++ b/pylib/cqlshlib/meter.py
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from time import time
+import sys
+from threading import RLock
+
+
+class Meter(object):
+
+    def __init__(self):
+        self._num_finished = 0
+        self._last_checkpoint_time = None
+        self._current_rate = 0.0
+        self._lock = RLock()
+
+    def mark_written(self):
+        with self._lock:
+            if not self._last_checkpoint_time:
+                self._last_checkpoint_time = time()
+            self._num_finished += 1
+
+            if self._num_finished % 10000 == 0:
+                previous_checkpoint_time = self._last_checkpoint_time
+                self._last_checkpoint_time = time()
+                new_rate = 10000.0 / (self._last_checkpoint_time - previous_checkpoint_time)
+                if self._current_rate == 0.0:
+                    self._current_rate = new_rate
+                else:
+                    self._current_rate = (self._current_rate + new_rate) / 2.0
+
+            if self._num_finished % 1000 != 0:
+                return
+            output = 'Processed %s rows; Write: %.2f rows/s\r' % \
+                     (self._num_finished, self._current_rate)
+            sys.stdout.write(output)
+            sys.stdout.flush()
+
+    def num_finished(self):
+        with self._lock:
+            return self._num_finished
+
+    def done(self):
+        print ""
+
+
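The new Meter is self-contained; a minimal usage sketch (the rows iterable and the write callable are assumptions, not part of the patch):

from cqlshlib.meter import Meter

def write_all(rows, write):
    meter = Meter()
    for row in rows:
        write(row)            # perform the actual insert
        meter.mark_written()  # refreshes the rolling rows/s estimate every 10000 rows
    meter.done()              # ends the in-place progress line
    return meter.num_finished()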
diff --git a/pylib/cqlshlib/pylexotron.py b/pylib/cqlshlib/pylexotron.py
index ad283df..b4ac36f 100644
--- a/pylib/cqlshlib/pylexotron.py
+++ b/pylib/cqlshlib/pylexotron.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from functools import partial
 import re
 from .saferscanner import SaferScanner
 
@@ -24,6 +23,9 @@
         bad_char = len(rulestr) - len(unmatched)
         linenum = rulestr[:bad_char].count('\n') + 1
         charnum = len(rulestr[:bad_char].rsplit('\n', 1)[-1]) + 1
+        snippet_start = max(0, min(len(rulestr), bad_char - 10))
+        snippet_end = max(0, min(len(rulestr), bad_char + 10))
+        msg += " (Error at: '...%s...')" % (rulestr[snippet_start:snippet_end],)
         raise cls(linenum, charnum, msg)
 
     def __init__(self, linenum, charnum, msg='Lexing error'):
diff --git a/pylib/cqlshlib/ssl.py b/pylib/cqlshlib/sslhandling.py
similarity index 82%
rename from pylib/cqlshlib/ssl.py
rename to pylib/cqlshlib/sslhandling.py
index 3400b40..70dd759 100644
--- a/pylib/cqlshlib/ssl.py
+++ b/pylib/cqlshlib/sslhandling.py
@@ -17,15 +17,14 @@
 import os
 import sys
 import ConfigParser
-from thrift.transport import TSSLSocket, TTransport
+import ssl
 
-def ssl_transport_factory(host, port, env, config_file):
+def ssl_settings(host, config_file, env=os.environ):
     """
-    SSL Thrift transport factory function.
+    Function which generates SSL settings for cassandra.Cluster
 
     Params:
     * host .........: hostname of Cassandra node.
-    * port .........: port number to connect to.
     * env ..........: environment variables. SSL factory will use, if passed,
                       SSL_CERTFILE and SSL_VALIDATE variables.
     * config_file ..: path to cqlsh config file (usually ~/.cqlshrc).
@@ -65,6 +64,15 @@
     if not ssl_certfile is None:
         ssl_certfile = os.path.expanduser(ssl_certfile)
 
-    tsocket = TSSLSocket.TSSLSocket(host, port, ca_certs=ssl_certfile,
-                                    validate=ssl_validate)
-    return TTransport.TFramedTransport(tsocket)
+    userkey = get_option('ssl', 'userkey')
+    if userkey:
+        userkey = os.path.expanduser(userkey)
+    usercert = get_option('ssl', 'usercert')
+    if usercert:
+        usercert = os.path.expanduser(usercert)
+
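+    # These keys correspond to ssl.wrap_socket() arguments; the driver passes the
+    # returned dict through as its ssl_options.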
+    return dict(ca_certs=ssl_certfile,
+                cert_reqs=ssl.CERT_REQUIRED if ssl_validate else ssl.CERT_NONE,
+                ssl_version=ssl.PROTOCOL_TLSv1,
+                keyfile=userkey, certfile=usercert)
+
diff --git a/pylib/cqlshlib/test/basecase.py b/pylib/cqlshlib/test/basecase.py
index efc2555..5600f1a 100644
--- a/pylib/cqlshlib/test/basecase.py
+++ b/pylib/cqlshlib/test/basecase.py
@@ -44,10 +44,10 @@
 
 sys.path.append(rundir)
 import cqlsh
-cql = cqlsh.cql
+cql = cqlsh.cassandra.cluster.Cluster
 
-TEST_HOST = os.environ.get('CQL_TEST_HOST', 'localhost')
-TEST_PORT = int(os.environ.get('CQL_TEST_PORT', 9160))
+TEST_HOST = os.environ.get('CQL_TEST_HOST', '127.0.0.1')
+TEST_PORT = int(os.environ.get('CQL_TEST_PORT', 9042))
 
 class BaseTestCase(unittest.TestCase):
     def assertNicelyFormattedTableHeader(self, line, msg=None):
diff --git a/pylib/cqlshlib/test/cassconnect.py b/pylib/cqlshlib/test/cassconnect.py
index 63d8c10..21dddcd 100644
--- a/pylib/cqlshlib/test/cassconnect.py
+++ b/pylib/cqlshlib/test/cassconnect.py
@@ -24,15 +24,15 @@
 
 test_keyspace_init = os.path.join(rundir, 'test_keyspace_init.cql')
 
-def get_cassandra_connection(cql_version=None):
+def get_cassandra_connection(cql_version=cqlsh.DEFAULT_CQLVER):
     if cql_version is None:
-        cql_version = '3.1.0'
-    conn = cql.connect(TEST_HOST, TEST_PORT, cql_version=cql_version)
+        cql_version = cqlsh.DEFAULT_CQLVER
+    conn = cql((TEST_HOST,), TEST_PORT, cql_version=cql_version)
     # until the cql lib does this for us
     conn.cql_version = cql_version
     return conn
 
-def get_cassandra_cursor(cql_version=None):
+def get_cassandra_cursor(cql_version=cqlsh.DEFAULT_CQLVER):
     return get_cassandra_connection(cql_version=cql_version).cursor()
 
 TEST_KEYSPACES_CREATED = []
@@ -46,17 +46,17 @@
 
 def create_test_keyspace(cursor):
     ksname = make_test_ks_name()
-    qksname = quote_name(cursor, ksname)
+    qksname = quote_name(ksname)
     cursor.execute('''
         CREATE KEYSPACE %s WITH replication =
             {'class': 'SimpleStrategy', 'replication_factor': 1};
-    ''' % quote_name(cursor, ksname))
+    ''' % quote_name(ksname))
     cursor.execute('USE %s;' % qksname)
     TEST_KEYSPACES_CREATED.append(ksname)
     return ksname
 
-def split_cql_commands(source, cqlver='3.1.0'):
-    ruleset = cql_rule_set(cqlver)
+def split_cql_commands(source):
+    ruleset = cql_rule_set()
     statements, in_batch = ruleset.cql_split_statements(source)
     if in_batch:
         raise ValueError("CQL source ends unexpectedly")
@@ -64,7 +64,7 @@
     return [ruleset.cql_extract_orig(toks, source) for toks in statements if toks]
 
 def execute_cql_commands(cursor, source, logprefix='INIT: '):
-    for cql in split_cql_commands(source, cqlver=cursor._connection.cql_version):
+    for cql in split_cql_commands(source):
         cqlshlog.debug(logprefix + cql)
         cursor.execute(cql)
 
@@ -73,17 +73,17 @@
         return execute_cql_commands(cursor, f.read())
 
 def create_test_db():
-    with cassandra_cursor(ks=None, cql_version='3.1.0') as c:
+    with cassandra_cursor(ks=None) as c:
         k = create_test_keyspace(c)
         execute_cql_file(c, test_keyspace_init)
     return k
 
 def remove_test_db():
     with cassandra_cursor(ks=None) as c:
-        c.execute('DROP KEYSPACE %s' % quote_name(c, TEST_KEYSPACES_CREATED.pop(-1)))
+        c.execute('DROP KEYSPACE %s' % quote_name(TEST_KEYSPACES_CREATED.pop(-1)))
 
 @contextlib.contextmanager
-def cassandra_connection(cql_version=None):
+def cassandra_connection(cql_version=cqlsh.DEFAULT_CQLVER):
     """
     Make a Cassandra CQL connection with the given CQL version and get a cursor
     for it, and optionally connect to a given keyspace.
@@ -115,22 +115,18 @@
         ks = get_test_keyspace()
     conn = get_cassandra_connection(cql_version=cql_version)
     try:
-        c = conn.cursor()
-        if ks is not None:
-            c.execute('USE %s;' % quote_name(c, ks))
+        c = conn.connect(ks)
+        # if ks is not None:
+        #     c.execute('USE %s;' % quote_name(c, ks))
         yield c
     finally:
-        conn.close()
+        conn.shutdown()
 
-def cql_rule_set(cqlver):
+def cql_rule_set():
     return cqlsh.cql3handling.CqlRuleSet
 
-def quote_name(cqlver, name):
-    if isinstance(cqlver, cql.cursor.Cursor):
-        cqlver = cqlver._connection
-    if isinstance(cqlver, cql.connection.Connection):
-        cqlver = cqlver.cql_version
-    return cql_rule_set(cqlver).maybe_escape_name(name)
+def quote_name(name):
+    return cql_rule_set().maybe_escape_name(name)
 
 class DEFAULTVAL: pass
 
diff --git a/pylib/cqlshlib/test/run_cqlsh.py b/pylib/cqlshlib/test/run_cqlsh.py
index 929849c..6ae295c 100644
--- a/pylib/cqlshlib/test/run_cqlsh.py
+++ b/pylib/cqlshlib/test/run_cqlsh.py
@@ -246,6 +246,7 @@
         # readline trying to be friendly- remove these artifacts
         output = output.replace(' \r', '')
         output = output.replace('\r', '')
+        output = output.replace(' \b', '')
         if self.tty:
             echo, output = output.split('\n', 1)
             assert echo == cmd, "unexpected echo %r instead of %r" % (echo, cmd)
diff --git a/pylib/cqlshlib/test/test_cqlsh_completion.py b/pylib/cqlshlib/test/test_cqlsh_completion.py
index 63296fa..820414d 100644
--- a/pylib/cqlshlib/test/test_cqlsh_completion.py
+++ b/pylib/cqlshlib/test/test_cqlsh_completion.py
@@ -33,11 +33,11 @@
 # isn't coming
 COMPLETION_RESPONSE_TIME = 0.5
 
-completion_separation_re = re.compile(r'\s\s+')
+completion_separation_re = re.compile(r'\s+')
 
 class CqlshCompletionCase(BaseTestCase):
     def setUp(self):
-        self.cqlsh_runner = testrun_cqlsh(cqlver=self.cqlver, env={'COLUMNS': '100000'})
+        self.cqlsh_runner = testrun_cqlsh(cqlver=cqlsh.DEFAULT_CQLVER, env={'COLUMNS': '100000'})
         self.cqlsh = self.cqlsh_runner.__enter__()
 
     def tearDown(self):
@@ -56,6 +56,7 @@
         self.cqlsh.send(inputstring)
         self.cqlsh.send(TAB)
         completed = self.cqlsh.read_up_to_timeout(COMPLETION_RESPONSE_TIME)
+        completed = completed.replace(' \b', '')
         self.assertEqual(completed[:len(inputstring)], inputstring)
         completed = completed[len(inputstring):]
         completed = completed.replace(BEL, '')
@@ -91,13 +92,13 @@
         return self.module.CqlRuleSet.replication_strategies
 
 class TestCqlshCompletion(CqlshCompletionCase):
-    cqlver = '3.1.0'
+    cqlver = '3.1.6'
     module = cqlsh.cql3handling
 
     def test_complete_on_empty_string(self):
         self.trycompletions('', choices=('?', 'ALTER', 'BEGIN', 'CAPTURE', 'CONSISTENCY',
                                          'COPY', 'CREATE', 'DEBUG', 'DELETE', 'DESC', 'DESCRIBE',
-                                         'DROP', 'GRANT', 'HELP', 'INSERT', 'LIST', 'REVOKE',
+                                         'DROP', 'GRANT', 'HELP', 'INSERT', 'LIST', 'PAGING', 'REVOKE',
                                          'SELECT', 'SHOW', 'SOURCE', 'TRACING', 'EXPAND', 'TRUNCATE',
                                          'UPDATE', 'USE', 'exit', 'quit'))
 
@@ -152,10 +153,10 @@
                                "{'class': 'SimpleStrategy', 'repl", "ication_factor'")
         self.trycompletions("create keyspace foo with replication ="
                                "{'class': 'SimpleStrategy', 'replication_factor': ", '',
-                            choices=('<value>',))
+                            choices=('<term>',))
         self.trycompletions("create keyspace foo with replication ="
                                "{'class': 'SimpleStrategy', 'replication_factor': 1", '',
-                            choices=('<value>',))
+                            choices=('<term>',))
         self.trycompletions("create keyspace foo with replication ="
                                "{'class': 'SimpleStrategy', 'replication_factor': 1 ", '}')
         self.trycompletions("create keyspace foo with replication ="
@@ -176,7 +177,7 @@
     def test_complete_in_string_literals(self):
         # would be great if we could get a space after this sort of completion,
         # but readline really wants to make things difficult for us
-        self.trycompletions('insert into system."NodeId', 'Info"')
+        self.trycompletions('insert into system."Index', 'Info"')
         self.trycompletions('USE "', choices=('system', self.cqlsh.keyspace),
                             other_choices_ok=True)
         self.trycompletions("create keyspace blah with replication = {'class': 'Sim",
diff --git a/pylib/cqlshlib/test/test_cqlsh_output.py b/pylib/cqlshlib/test/test_cqlsh_output.py
index e680cab..baf72c1 100644
--- a/pylib/cqlshlib/test/test_cqlsh_output.py
+++ b/pylib/cqlshlib/test/test_cqlsh_output.py
@@ -21,7 +21,7 @@
 
 import re
 from itertools import izip
-from .basecase import (BaseTestCase, cqlshlog, dedent, at_a_time,
+from .basecase import (BaseTestCase, cqlshlog, dedent, at_a_time, cqlsh,
                        TEST_HOST, TEST_PORT)
 from .cassconnect import (get_test_keyspace, testrun_cqlsh, testcall_cqlsh,
                           cassandra_cursor, split_cql_commands, quote_name)
@@ -67,7 +67,7 @@
                                  % (tags, coloredtext.colored_version(), coloredtext.colortags()))
 
     def assertCqlverQueriesGiveColoredOutput(self, queries_and_expected_outputs,
-                                             cqlver=(), **kwargs):
+                                             cqlver=(cqlsh.DEFAULT_CQLVER,), **kwargs):
         if not isinstance(cqlver, (tuple, list)):
             cqlver = (cqlver,)
         for ver in cqlver:
@@ -195,7 +195,7 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
         q = 'select COUNT(*) FROM twenty_rows_composite_table limit 1000000;'
         self.assertQueriesGiveColoredOutput((
@@ -211,13 +211,13 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_static_cf_output(self):
         self.assertCqlverQueriesGiveColoredOutput((
             ("select a, b from twenty_rows_table where a in ('1', '13', '2');", """
              a  | b
-             MM   MM
+             RR   MM
             ----+----
 
               1 |  1
@@ -231,12 +231,12 @@
             (3 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
         self.assertQueriesGiveColoredOutput((
             ('select * from dynamic_columns;', """
              somekey | column1 | value
-             MMMMMMM   MMMMMMM   MMMMM
+             RRRRRRR   CCCCCCC   MMMMM
             ---------+---------+-------------------------
 
                    1 |     1.2 |           one point two
@@ -254,23 +254,34 @@
             (5 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_empty_cf_output(self):
+        # we print the header after CASSANDRA-6910
         self.assertCqlverQueriesGiveColoredOutput((
             ('select * from empty_table;', """
+             lonelykey | lonelycol
+             RRRRRRRRR   MMMMMMMMM
+            -----------+-----------
+
+
             (0 rows)
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
         q = 'select * from has_all_types where num = 999;'
 
         # same query should show up as empty in cql 3
         self.assertQueriesGiveColoredOutput((
             (q, """
+             num | asciicol | bigintcol | blobcol | booleancol | decimalcol | doublecol | floatcol | intcol | textcol | timestampcol | uuidcol | varcharcol | varintcol
+             RRR   MMMMMMMM   MMMMMMMMM   MMMMMMM   MMMMMMMMMM   MMMMMMMMMM   MMMMMMMMM   MMMMMMMM   MMMMMM   MMMMMMM   MMMMMMMMMMMM   MMMMMMM   MMMMMMMMMM   MMMMMMMMM
+            -----+----------+-----------+---------+------------+------------+-----------+----------+--------+---------+--------------+---------+------------+-----------
+
+
             (0 rows)
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_columnless_key_output(self):
         q = "select a from twenty_rows_table where a in ('1', '2', '-9192');"
@@ -278,7 +289,7 @@
         self.assertQueriesGiveColoredOutput((
             (q, """
              a
-             M
+             R
             ---
 
              1
@@ -290,7 +301,7 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_numeric_output(self):
         self.assertCqlverQueriesGiveColoredOutput((
@@ -339,7 +350,7 @@
             (5 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_timestamp_output(self):
         self.assertQueriesGiveColoredOutput((
@@ -376,7 +387,7 @@
         self.assertCqlverQueriesGiveColoredOutput((
             ('select num, booleancol from has_all_types where num in (0, 1, 2, 3);', """
              num | booleancol
-             MMM   MMMMMMMMMM
+             RRR   MMMMMMMMMM
             -----+------------
 
                0 |       True
@@ -392,14 +403,14 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_null_output(self):
         # column with metainfo but no values
         self.assertCqlverQueriesGiveColoredOutput((
             ("select k, c, notthere from undefined_values_table where k in ('k1', 'k2');", """
              k  | c  | notthere
-             M    M    MMMMMMMM
+             R    M    MMMMMMMM
             ----+----+----------
 
              k1 | c1 |     null
@@ -411,13 +422,13 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
         # all-columns, including a metainfo column has no values (cql3)
         self.assertQueriesGiveColoredOutput((
             ("select * from undefined_values_table where k in ('k1', 'k2');", """
              k  | c  | notthere
-             M    M    MMMMMMMM
+             R    M    MMMMMMMM
             ----+----+----------
 
              k1 | c1 |     null
@@ -429,13 +440,13 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_string_output_ascii(self):
         self.assertCqlverQueriesGiveColoredOutput((
             ("select * from ascii_with_special_chars where k in (0, 1, 2, 3);", r"""
              k | val
-             M   MMM
+             R   MMM
             ---+-----------------------------------------------
 
              0 |                                    newline:\n
@@ -451,7 +462,7 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_string_output_utf8(self):
         # many of these won't line up visually here, to keep the source code
@@ -463,7 +474,7 @@
         self.assertCqlverQueriesGiveColoredOutput((
             ("select * from utf8_with_special_chars where k in (0, 1, 2, 3, 4, 5, 6);", u"""
              k | val
-             M   MMM
+             R   MMM
             ---+-------------------------------
 
              0 |                 Normal string
@@ -485,13 +496,13 @@
             (7 rows)
             nnnnnnnn
             """.encode('utf-8')),
-        ), cqlver=3, env={'LANG': 'en_US.UTF-8'})
+        ), cqlver=cqlsh.DEFAULT_CQLVER, env={'LANG': 'en_US.UTF-8'})
 
     def test_blob_output(self):
         self.assertCqlverQueriesGiveColoredOutput((
             ("select num, blobcol from has_all_types where num in (0, 1, 2, 3);", r"""
              num | blobcol
-             MMM   MMMMMMM
+             RRR   MMMMMMM
             -----+----------------------
 
                0 | 0x000102030405fffefd
@@ -507,10 +518,10 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=3)
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
 
     def test_prompt(self):
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=3) as c:
+        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
             self.assertEqual(c.output_header.splitlines()[-1], 'cqlsh> ')
 
             c.send('\n')
@@ -533,15 +544,15 @@
             self.assertEqual(outputlines[2], 'cqlsh:system> ')
             midline = ColoredText(outputlines[1])
             self.assertEqual(midline.plain(),
-                             "Bad Request: Keyspace 'nonexistentkeyspace' does not exist")
+                             'code=2200 [Invalid query] message="Keyspace \'nonexistentkeyspace\' does not exist"')
             self.assertColorFromTags(midline,
                              "RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR")
 
     def test_describe_keyspace_output(self):
-        fullcqlver = '3.1.0'
+        fullcqlver = cqlsh.DEFAULT_CQLVER
         with testrun_cqlsh(tty=True, cqlver=fullcqlver) as c:
             ks = get_test_keyspace()
-            qks = quote_name(fullcqlver, ks)
+            qks = quote_name(ks)
             for cmd in ('describe keyspace', 'desc keyspace'):
                 for givename in ('system', '', qks):
                     for semicolon in ('', ';'):
@@ -553,7 +564,7 @@
             # new keyspace name
             new_ks_name = 'COPY_OF_' + ks
             copy_desc = desc.replace(ks, new_ks_name)
-            statements = split_cql_commands(copy_desc, cqlver=fullcqlver)
+            statements = split_cql_commands(copy_desc)
             do_drop = True
 
             with cassandra_cursor(cql_version=fullcqlver) as curs:
@@ -564,13 +575,12 @@
                 finally:
                     curs.execute('use system')
                     if do_drop:
-                        curs.execute('drop keyspace %s' % quote_name(fullcqlver, new_ks_name))
+                        curs.execute('drop keyspace %s' % quote_name(new_ks_name))
 
     def check_describe_keyspace_output(self, output, qksname, fullcqlver):
         expected_bits = [r'(?im)^CREATE KEYSPACE %s WITH\b' % re.escape(qksname),
-                         r'(?im)^USE \S+;$',
                          r';\s*$',
-                         r'\breplication = {\n  \'class\':']
+                         r'\breplication = {\'class\':']
         for expr in expected_bits:
             self.assertRegexpMatches(output, expr)
 
@@ -581,41 +591,38 @@
         # note columns are now comparator-ordered instead of original-order.
         table_desc3 = dedent("""
 
-            CREATE TABLE has_all_types (
-              num int,
-              asciicol ascii,
-              bigintcol bigint,
-              blobcol blob,
-              booleancol boolean,
-              decimalcol decimal,
-              doublecol double,
-              floatcol float,
-              intcol int,
-              textcol text,
-              timestampcol timestamp,
-              uuidcol uuid,
-              varcharcol text,
-              varintcol varint,
-              PRIMARY KEY ((num))
-            ) WITH
-              bloom_filter_fp_chance=0.010000 AND
-              caching='KEYS_ONLY' AND
-              comment='' AND
-              dclocal_read_repair_chance=0.100000 AND
-              gc_grace_seconds=864000 AND
-              index_interval=128 AND
-              read_repair_chance=0.000000 AND
-              replicate_on_write='true' AND
-              populate_io_cache_on_flush='false' AND
-              default_time_to_live=0 AND
-              speculative_retry='99.0PERCENTILE' AND
-              memtable_flush_period_in_ms=0 AND
-              compaction={'class': 'SizeTieredCompactionStrategy'} AND
-              compression={'sstable_compression': 'LZ4Compressor'};
+            CREATE TABLE %s.has_all_types (
+                num int PRIMARY KEY,
+                asciicol ascii,
+                bigintcol bigint,
+                blobcol blob,
+                booleancol boolean,
+                decimalcol decimal,
+                doublecol double,
+                floatcol float,
+                intcol int,
+                textcol text,
+                timestampcol timestamp,
+                uuidcol uuid,
+                varcharcol text,
+                varintcol varint
+            ) WITH bloom_filter_fp_chance = 0.01
+                AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
+                AND comment = ''
+                AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
+                AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+                AND dclocal_read_repair_chance = 0.1
+                AND default_time_to_live = 0
+                AND gc_grace_seconds = 864000
+                AND max_index_interval = 2048
+                AND memtable_flush_period_in_ms = 0
+                AND min_index_interval = 128
+                AND read_repair_chance = 0.0
+                AND speculative_retry = '99.0PERCENTILE';
 
-        """)
+        """ % quote_name(get_test_keyspace()))
 
-        with testrun_cqlsh(tty=True, cqlver='3.0.0') as c:
+        with testrun_cqlsh(tty=True, cqlver=cqlsh.DEFAULT_CQLVER) as c:
             for cmdword in ('describe table', 'desc columnfamily'):
                 for semicolon in (';', ''):
                     output = c.cmd_and_response('%s has_all_types%s' % (cmdword, semicolon))
@@ -633,7 +640,7 @@
 
         ks = get_test_keyspace()
 
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=3) as c:
+        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
 
             # when not in a keyspace
             for cmdword in ('DESCRIBE COLUMNFAMILIES', 'desc tables'):
@@ -652,10 +659,10 @@
                             self.assertIn('ascii_with_special_chars', cfnames)
 
                     self.assertIn('system', ksnames)
-                    self.assertIn(quote_name('3.0.0', ks), ksnames)
+                    self.assertIn(quote_name(ks), ksnames)
 
             # when in a keyspace
-            c.send('USE %s;\n' % quote_name('3.0.0', ks))
+            c.send('USE %s;\n' % quote_name(ks))
             c.read_to_next_prompt()
 
             for cmdword in ('DESCRIBE COLUMNFAMILIES', 'desc tables'):
@@ -664,7 +671,7 @@
                     self.assertNoHasColors(output)
                     self.assertEqual(output[0], '\n')
                     self.assertEqual(output[-1], '\n')
-                    self.assertNotIn('Keyspace %s' % quote_name('3.0.0', ks), output)
+                    self.assertNotIn('Keyspace %s' % quote_name(ks), output)
                     self.assertIn('undefined_values_table', output)
 
     def test_describe_cluster_output(self):
@@ -673,7 +680,6 @@
             \n
             Cluster: [ ] (?P<clustername> .* ) \n
             Partitioner: [ ] (?P<partitionername> .* ) \n
-            Snitch: [ ] (?P<snitchname> .* ) \n
             \n
         '''
 
@@ -685,7 +691,7 @@
             \n
         '''
 
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=3) as c:
+        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
 
             # not in a keyspace
             for semicolon in ('', ';'):
@@ -693,7 +699,7 @@
                 self.assertNoHasColors(output)
                 self.assertRegexpMatches(output, output_re + '$')
 
-            c.send('USE %s;\n' % quote_name('3.0.0', get_test_keyspace()))
+            c.send('USE %s;\n' % quote_name(get_test_keyspace()))
             c.read_to_next_prompt()
 
             for semicolon in ('', ';'):
@@ -707,7 +713,7 @@
                 output = c.cmd_and_response('desc full schema' + semicolon)
                 self.assertNoHasColors(output)
                 self.assertRegexpMatches(output, '^\nCREATE KEYSPACE')
-                self.assertIn("\nCREATE KEYSPACE system WITH replication = {\n  'class': 'LocalStrategy'\n};\n",
+                self.assertIn("\nCREATE KEYSPACE system WITH replication = {'class': 'LocalStrategy'}  AND durable_writes = true;\n",
                               output)
                 self.assertRegexpMatches(output, ';\s*$')
 
@@ -715,7 +721,7 @@
         with testrun_cqlsh(tty=True) as c:
             output = c.cmd_and_response('show version;')
             self.assertRegexpMatches(output,
-                    '^\[cqlsh \S+ \| Cassandra \S+ \| CQL spec \S+ \| Thrift protocol \S+\]$')
+                    '^\[cqlsh \S+ \| Cassandra \S+ \| CQL spec \S+ \| Native protocol \S+\]$')
 
             output = c.cmd_and_response('show host;')
             self.assertHasColors(output)
@@ -769,3 +775,67 @@
 
     def test_empty_line(self):
         pass
+
+    def test_user_types_output(self):
+        self.assertCqlverQueriesGiveColoredOutput((
+            ("select addresses from users;", r"""
+             addresses
+             MMMMMMMMM
+            --------------------------------------------------------------------------------------------------------------------------------------------
+
+                                          {{city: 'Chelyabinsk', address: '3rd street', zip: null}, {city: 'Chigirinsk', address: null, zip: '676722'}}
+                                          BBYYYYBBYYYYYYYYYYYYYBBYYYYYYYBBYYYYYYYYYYYYBBYYYBBRRRRBBBBYYYYBBYYYYYYYYYYYYBBYYYYYYYBBRRRRBBYYYBBYYYYYYYYBB
+             {{city: 'Austin', address: '902 East 5th St. #202', zip: '78702'}, {city: 'Sunnyvale', address: '292 Gibraltar Drive #107', zip: '94089'}}
+             BBYYYYBBYYYYYYYYBBYYYYYYYBBYYYYYYYYYYYYYYYYYYYYYYYBBYYYBBYYYYYYYBBBBYYYYBBYYYYYYYYYYYBBYYYYYYYBBYYYYYYYYYYYYYYYYYYYYYYYYYYBBYYYBBYYYYYYYBB
+
+
+            (2 rows)
+            nnnnnnnn
+            """),
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        self.assertCqlverQueriesGiveColoredOutput((
+            ("select phone_numbers from users;", r"""
+             phone_numbers
+             MMMMMMMMMMMMM
+            -------------------------------------------------------------------------------------
+
+                                  {{country: null, number: '03'}, {country: '+7', number: null}}
+                                  BBYYYYYYYBBRRRRBBYYYYYYBBYYYYBBBBYYYYYYYBBYYYYBBYYYYYYBBRRRRBB
+             {{country: '+1', number: '512-537-7809'}, {country: '+44', number: '208 622 3021'}}
+             BBYYYYYYYBBYYYYBBYYYYYYBBYYYYYYYYYYYYYYBBBBYYYYYYYBBYYYYYBBYYYYYYBBYYYYYYYYYYYYYYBB
+
+
+            (2 rows)
+            nnnnnnnn
+            """),
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
+
+    def test_user_types_with_collections(self):
+        self.assertCqlverQueriesGiveColoredOutput((
+            ("select info from songs;", r"""
+             info
+             MMMM
+            -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+             {founded: 188694000, members: {'Adrian Smith', 'Bruce Dickinson', 'Dave Murray', 'Janick Gers', 'Nicko McBrain', 'Steve Harris'}, description: 'Pure evil metal'}
+             BYYYYYYYBBGGGGGGGGGBBYYYYYYYBBBYYYYYYYYYYYYYYBBYYYYYYYYYYYYYYYYYBBYYYYYYYYYYYYYBBYYYYYYYYYYYYYBBYYYYYYYYYYYYYYYBBYYYYYYYYYYYYYYBBBYYYYYYYYYYYBBYYYYYYYYYYYYYYYYYB
+
+
+            (1 rows)
+            nnnnnnnn
+            """),
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        self.assertCqlverQueriesGiveColoredOutput((
+            ("select tags from songs;", r"""
+             tags
+             MMMM
+            -------------------------------------------------
+
+             {tags: {'genre': 'metal', 'origin': 'england'}}
+             BYYYYBBBYYYYYYYBBYYYYYYYBBYYYYYYYYBBYYYYYYYYYBB
+
+
+            (1 rows)
+            nnnnnnnn
+            """),
+        ), cqlver=cqlsh.DEFAULT_CQLVER)
diff --git a/pylib/cqlshlib/test/test_keyspace_init.cql b/pylib/cqlshlib/test/test_keyspace_init.cql
index f63811f..cd5ac75 100644
--- a/pylib/cqlshlib/test/test_keyspace_init.cql
+++ b/pylib/cqlshlib/test/test_keyspace_init.cql
@@ -52,6 +52,7 @@
         blobAsVarint(0x));
 
 
+
 CREATE TABLE empty_table (
     lonelykey float primary key,
     lonelycol text
@@ -180,3 +181,77 @@
 INSERT INTO twenty_rows_composite_table (a, b, c) VALUES ('A', '18', '18');
 INSERT INTO twenty_rows_composite_table (a, b, c) VALUES ('A', '19', '19');
 INSERT INTO twenty_rows_composite_table (a, b, c) VALUES ('A', '20', '20');
+
+CREATE TYPE address (
+    city text,
+    address text,
+    zip text
+);
+
+CREATE TYPE phone_number (
+    country text,
+    number text
+);
+
+CREATE TABLE users (
+    login text PRIMARY KEY,
+    name text,
+    addresses set<frozen<address>>,
+    phone_numbers set<frozen<phone_number>>
+);
+
+insert into users (login, name, addresses, phone_numbers)
+values ('jbellis',
+        'jonathan ellis',
+        {{city: 'Austin', address: '902 East 5th St. #202', zip: '78702'},
+         {city: 'Sunnyvale', address: '292 Gibraltar Drive #107', zip: '94089'}},
+        {{country: '+44', number: '208 622 3021'},
+         {country: '+1', number: '512-537-7809'}});
+
+insert into users (login, name, addresses, phone_numbers)
+values ('vpupkin',
+        'vasya pupkin',
+        {{city: 'Chelyabinsk', address: '3rd street', zip: null},
+         {city: 'Chigirinsk', address: null, zip: '676722'}},
+        {{country: '+7', number: null},
+         {country: null, number: '03'}});
+
+CREATE TYPE band_info_type (
+  founded varint,
+  members set<text>,
+  description text
+);
+
+CREATE TYPE tags (
+  tags map<text, text>
+);
+
+CREATE TABLE songs (
+    title text PRIMARY KEY,
+    band text,
+    info frozen<band_info_type>,
+    tags frozen<tags>
+);
+
+insert into songs (title, band, info, tags)
+values (
+    'The trooper',
+    'Iron Maiden',
+    {
+        founded:188694000,
+        members: {
+            'Bruce Dickinson',
+            'Dave Murray',
+            'Adrian Smith',
+            'Janick Gers',
+            'Steve Harris',
+            'Nicko McBrain'
+        },
+        description: 'Pure evil metal'
+     },
+    {
+        tags: {
+            'genre':'metal',
+            'origin':'england'
+        }
+    });
diff --git a/pylib/cqlshlib/tfactory.py b/pylib/cqlshlib/tfactory.py
deleted file mode 100644
index cc02e88..0000000
--- a/pylib/cqlshlib/tfactory.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from thrift.transport import TSocket, TTransport
-
-def regular_transport_factory(host, port, env, config_file):
-    """
-    Basic unencrypted Thrift transport factory function.
-    Returns instantiated Thrift transport for use with cql.Connection.
-
-    Params:
-    * host .........: hostname of Cassandra node.
-    * port .........: port number to connect to.
-    * env ..........: environment variables (os.environ) - not used by this implementation.
-    * config_file ..: path to cqlsh config file - not used by this implementation.
-    """
-    tsocket = TSocket.TSocket(host, port)
-    return TTransport.TFramedTransport(tsocket)
diff --git a/pylib/cqlshlib/tracing.py b/pylib/cqlshlib/tracing.py
index 6dd4b14..40d22f0 100644
--- a/pylib/cqlshlib/tracing.py
+++ b/pylib/cqlshlib/tracing.py
@@ -14,75 +14,70 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
-from cql.cqltypes import UTF8Type, InetAddressType, Int32Type
 from cqlshlib.displaying import MAGENTA
+from datetime import datetime
+import time
+from cassandra.query import QueryTrace, TraceUnavailable
 
-TRACING_KS = 'system_traces'
-SESSIONS_CF = 'sessions'
-EVENTS_CF = 'events'
-MAX_WAIT = 10.0
 
-def print_trace_session(shell, cursor, session_id):
-    rows = fetch_trace_session(cursor, session_id)
-    if not rows:
+def print_trace_session(shell, session, session_id):
+    """
+    Lookup a trace by session and trace session ID, then print it.
+    """
+    trace = QueryTrace(session_id, session)
+    try:
+        trace.populate()
+    except TraceUnavailable:
         shell.printerr("Session %s wasn't found." % session_id)
+    else:
+        print_trace(shell, trace)
+
+
+def print_trace(shell, trace):
+    """
+    Print an already populated cassandra.query.QueryTrace instance.
+    """
+    rows = make_trace_rows(trace)
+    if not rows:
+        shell.printerr("No rows for session %s found." % (trace.trace_id,))
         return
     names = ['activity', 'timestamp', 'source', 'source_elapsed']
-    types = [UTF8Type, UTF8Type, InetAddressType, Int32Type]
 
-    formatted_names = [shell.myformat_colname(name, UTF8Type) for name in names]
-    formatted_values = [map(shell.myformat_value, row, types) for row in rows]
+    formatted_names = map(shell.myformat_colname, names)
+    formatted_values = [map(shell.myformat_value, row) for row in rows]
 
     shell.writeresult('')
     shell.writeresult('Tracing session: ', color=MAGENTA, newline=False)
-    shell.writeresult(session_id)
+    shell.writeresult(trace.trace_id)
     shell.writeresult('')
     shell.print_formatted_result(formatted_names, formatted_values)
     shell.writeresult('')
 
-def fetch_trace_session(cursor, session_id):
-    start = time.time()
-    while True:
-        time_spent = time.time() - start
-        if time_spent >= MAX_WAIT:
-            return []
-        cursor.execute("SELECT request, coordinator, started_at, duration "
-                       "FROM %s.%s "
-                       "WHERE session_id = %s" % (TRACING_KS, SESSIONS_CF, session_id),
-                       consistency_level='ONE')
-        session = cursor.fetchone()
 
-        if not session or session[3] is None: #session[3] is a duration
-            time.sleep(0.5)
-            continue
+def make_trace_rows(trace):
+    if not trace.events:
+        return []
 
-        (request, coordinator, started_at, duration) = session
-        cursor.execute("SELECT activity, event_id, source, source_elapsed "
-                       "FROM %s.%s "
-                       "WHERE session_id = %s" % (TRACING_KS, EVENTS_CF, session_id),
-                       consistency_level='ONE')
-        events = cursor.fetchall()
+    rows = [[trace.request_type, str(datetime_from_utc_to_local(trace.started_at)), trace.coordinator, 0]]
 
-        rows = []
-        # append header row (from sessions table).
-        rows.append([request, format_timestamp(started_at), coordinator, 0])
-        # append main rows (from events table).
-        for activity, event_id, source, source_elapsed in events:
-            rows.append([activity, format_timeuuid(event_id), source, source_elapsed])
-        # append footer row (from sessions table).
-        finished_at = format_timestamp(started_at + (duration / 1000000.))
+    # append main rows (from events table).
+    for event in trace.events:
+        rows.append(["%s [%s]" % (event.description, event.thread_name),
+                     str(datetime_from_utc_to_local(event.datetime)),
+                     event.source,
+                     event.source_elapsed.microseconds if event.source_elapsed else "--"])
+    # append footer row (from sessions table).
+    if trace.duration:
+        finished_at = (datetime_from_utc_to_local(trace.started_at) + trace.duration)
+        duration_us = trace.duration.microseconds
+    else:
+        finished_at = duration_us = "--"
 
-        rows.append(['Request complete', finished_at, coordinator, duration])
+    rows.append(['Request complete', str(finished_at), trace.coordinator, duration_us])
 
-        return rows
+    return rows
 
-def format_timestamp(value):
-    return format_time(int(value * 1000))
 
-def format_timeuuid(value):
-    return format_time((value.get_time() - 0x01b21dd213814000) / 10000)
-
-def format_time(millis):
-    s, ms = divmod(millis, 1000)
-    return time.strftime('%H:%M:%S', time.localtime(s)) + ',' + str(ms).rjust(3, '0')
+def datetime_from_utc_to_local(utc_datetime):
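+    # Convert a naive UTC datetime to local time using the host's current UTC offset.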
+    now_timestamp = time.time()
+    offset = datetime.fromtimestamp(now_timestamp) - datetime.utcfromtimestamp(now_timestamp)
+    return utc_datetime + offset
diff --git a/src/java/org/apache/cassandra/auth/Auth.java b/src/java/org/apache/cassandra/auth/Auth.java
index 94d4b3d..4f18111 100644
--- a/src/java/org/apache/cassandra/auth/Auth.java
+++ b/src/java/org/apache/cassandra/auth/Auth.java
@@ -29,12 +29,15 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.statements.CFStatement;
+import org.apache.cassandra.cql3.statements.CreateTableStatement;
 import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.exceptions.RequestExecutionException;
@@ -175,7 +178,7 @@
             return;
 
         setupAuthKeyspace();
-        setupUsersTable();
+        setupTable(USERS_CF, USERS_CF_SCHEMA);
 
         DatabaseDescriptor.getAuthenticator().setup();
         DatabaseDescriptor.getAuthorizer().setup();
@@ -223,7 +226,7 @@
             try
             {
                 KSMetaData ksm = KSMetaData.newKeyspace(AUTH_KS, SimpleStrategy.class.getName(), ImmutableMap.of("replication_factor", "1"), true);
-                MigrationManager.announceNewKeyspace(ksm, 0);
+                MigrationManager.announceNewKeyspace(ksm, 0, false);
             }
             catch (Exception e)
             {
@@ -232,15 +235,26 @@
         }
     }
 
-    private static void setupUsersTable()
+    /**
+     * Set up the table defined by the given CREATE TABLE statement under the system_auth keyspace, if it does not already exist.
+     *
+     * @param name name of the table
+     * @param cql CREATE TABLE statement
+     */
+    public static void setupTable(String name, String cql)
     {
-        if (Schema.instance.getCFMetaData(AUTH_KS, USERS_CF) == null)
+        if (Schema.instance.getCFMetaData(AUTH_KS, name) == null)
         {
             try
             {
-                QueryProcessor.process(USERS_CF_SCHEMA, ConsistencyLevel.ANY);
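+                // Parse the schema, bind it to the system_auth keyspace, and announce it with the
+                // deterministic legacy cfId so every node derives the same table id independently.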
+                CFStatement parsed = (CFStatement)QueryProcessor.parseStatement(cql);
+                parsed.prepareKeyspace(AUTH_KS);
+                CreateTableStatement statement = (CreateTableStatement) parsed.prepare().statement;
+                CFMetaData cfm = statement.getCFMetaData().copy(CFMetaData.generateLegacyCfId(AUTH_KS, name));
+                assert cfm.cfName.equals(name);
+                MigrationManager.announceNewColumnFamily(cfm);
             }
-            catch (RequestExecutionException e)
+            catch (Exception e)
             {
                 throw new AssertionError(e);
             }
@@ -290,9 +304,9 @@
         try
         {
             ResultMessage.Rows rows = selectUserStatement.execute(QueryState.forInternalCalls(),
-                                                                  new QueryOptions(consistencyForUser(username),
-                                                                                   Lists.newArrayList(ByteBufferUtil.bytes(username))));
-            return new UntypedResultSet(rows.result);
+                                                                  QueryOptions.forInternalCalls(consistencyForUser(username),
+                                                                                                Lists.newArrayList(ByteBufferUtil.bytes(username))));
+            return UntypedResultSet.create(rows.result);
         }
         catch (RequestValidationException e)
         {
@@ -319,6 +333,10 @@
             DatabaseDescriptor.getAuthorizer().revokeAll(DataResource.columnFamily(ksName, cfName));
         }
 
+        public void onDropUserType(String ksName, String userType)
+        {
+        }
+
         public void onCreateKeyspace(String ksName)
         {
         }
@@ -327,6 +345,10 @@
         {
         }
 
+        public void onCreateUserType(String ksName, String userType)
+        {
+        }
+
         public void onUpdateKeyspace(String ksName)
         {
         }
@@ -334,5 +356,9 @@
         public void onUpdateColumnFamily(String ksName, String cfName)
         {
         }
+
+        public void onUpdateUserType(String ksName, String userType)
+        {
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java b/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java
index 61ad9a4..20060c0 100644
--- a/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java
+++ b/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java
@@ -25,7 +25,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.QueryOptions;
@@ -33,7 +32,6 @@
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -73,10 +71,10 @@
         try
         {
             ResultMessage.Rows rows = authorizeStatement.execute(QueryState.forInternalCalls(),
-                                                                 new QueryOptions(ConsistencyLevel.LOCAL_ONE,
-                                                                                  Lists.newArrayList(ByteBufferUtil.bytes(user.getName()),
-                                                                                                     ByteBufferUtil.bytes(resource.getName()))));
-            result = new UntypedResultSet(rows.result);
+                                                                 QueryOptions.forInternalCalls(ConsistencyLevel.LOCAL_ONE,
+                                                                                               Lists.newArrayList(ByteBufferUtil.bytes(user.getName()),
+                                                                                                                  ByteBufferUtil.bytes(resource.getName()))));
+            result = UntypedResultSet.create(rows.result);
         }
         catch (RequestValidationException e)
         {
@@ -187,7 +185,7 @@
         {
             process(String.format("DELETE FROM %s.%s WHERE username = '%s'", Auth.AUTH_KS, PERMISSIONS_CF, escape(droppedUser)));
         }
-        catch (Throwable e)
+        catch (RequestExecutionException e)
         {
             logger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", droppedUser, e);
         }
@@ -206,7 +204,7 @@
                                          PERMISSIONS_CF,
                                          escape(droppedResource.getName())));
         }
-        catch (Throwable e)
+        catch (RequestExecutionException e)
         {
             logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", droppedResource, e);
             return;
@@ -222,7 +220,7 @@
                                       escape(row.getString(USERNAME)),
                                       escape(droppedResource.getName())));
             }
-            catch (Throwable e)
+            catch (RequestExecutionException e)
             {
                 logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", droppedResource, e);
             }
@@ -240,17 +238,7 @@
 
     public void setup()
     {
-        if (Schema.instance.getCFMetaData(Auth.AUTH_KS, PERMISSIONS_CF) == null)
-        {
-            try
-            {
-                process(PERMISSIONS_CF_SCHEMA);
-            }
-            catch (RequestExecutionException e)
-            {
-                throw new AssertionError(e);
-            }
-        }
+        Auth.setupTable(PERMISSIONS_CF, PERMISSIONS_CF_SCHEMA);
 
         try
         {
diff --git a/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java b/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java
index e4c00b7..1218ee2 100644
--- a/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java
+++ b/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java
@@ -30,14 +30,12 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.transport.messages.ResultMessage;
@@ -106,9 +104,9 @@
         try
         {
             ResultMessage.Rows rows = authenticateStatement.execute(QueryState.forInternalCalls(),
-                                                                    new QueryOptions(consistencyForUser(username),
-                                                                                     Lists.newArrayList(ByteBufferUtil.bytes(username))));
-            result = new UntypedResultSet(rows.result);
+                                                                    QueryOptions.forInternalCalls(consistencyForUser(username),
+                                                                                                  Lists.newArrayList(ByteBufferUtil.bytes(username))));
+            result = UntypedResultSet.create(rows.result);
         }
         catch (RequestValidationException e)
         {
@@ -166,7 +164,7 @@
 
     public void setup()
     {
-        setupCredentialsTable();
+        Auth.setupTable(CREDENTIALS_CF, CREDENTIALS_CF_SCHEMA);
 
         // the delay is here to give the node some time to see its peers - to reduce
         // "skipped default user setup: some nodes are were not ready" log spam.
@@ -200,21 +198,6 @@
         return new PlainTextSaslAuthenticator();
     }
 
-    private void setupCredentialsTable()
-    {
-        if (Schema.instance.getCFMetaData(Auth.AUTH_KS, CREDENTIALS_CF) == null)
-        {
-            try
-            {
-                process(CREDENTIALS_CF_SCHEMA, ConsistencyLevel.ANY);
-            }
-            catch (RequestExecutionException e)
-            {
-                throw new AssertionError(e);
-            }
-        }
-    }
-
     // if there are no users yet - add default superuser.
     private void setupDefaultUser()
     {
diff --git a/src/java/org/apache/cassandra/cache/AutoSavingCache.java b/src/java/org/apache/cassandra/cache/AutoSavingCache.java
index 64234e2..d8fd5e0 100644
--- a/src/java/org/apache/cassandra/cache/AutoSavingCache.java
+++ b/src/java/org/apache/cassandra/cache/AutoSavingCache.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.cache;
 
 import java.io.*;
-import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.Future;
 import java.util.concurrent.ScheduledFuture;
@@ -30,7 +29,7 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnFamilyType;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -38,12 +37,12 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.LengthAvailableInputStream;
 import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
 public class AutoSavingCache<K extends CacheKey, V> extends InstrumentingCache<K, V>
@@ -66,9 +65,16 @@
         this.cacheLoader = cacheloader;
     }
 
-    public File getCachePath(String ksName, String cfName, String version)
+    @Deprecated
+    public File getCachePath(String ksName, String cfName, UUID cfId, String version)
     {
-        return DatabaseDescriptor.getSerializedCachePath(ksName, cfName, cacheType, version);
+        return DatabaseDescriptor.getSerializedCachePath(ksName, cfName, cfId, cacheType, version);
+    }
+
+    public File getCachePath(UUID cfId, String version)
+    {
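+        // look up the keyspace/table names for this cfId and build the versioned cache file path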
+        Pair<String, String> names = Schema.instance.getCF(cfId);
+        return DatabaseDescriptor.getSerializedCachePath(names.left, names.right, cfId, cacheType, version);
     }
 
     public Writer getWriter(int keysToSave)
@@ -105,7 +111,10 @@
         long start = System.nanoTime();
 
         // modern format, allows both key and value (so key cache load can be purely sequential)
-        File path = getCachePath(cfs.keyspace.getName(), cfs.name, CURRENT_VERSION);
+        File path = getCachePath(cfs.metadata.cfId, CURRENT_VERSION);
+        // if the cfId-based path does not exist, fall back to the legacy path without cfId
+        // (the cache may have been saved under the old naming scheme)
+        if (!path.exists())
+            path = getCachePath(cfs.keyspace.getName(), cfs.name, null, CURRENT_VERSION);
         if (path.exists())
         {
             DataInputStream in = null;
@@ -127,7 +136,8 @@
                 for (Future<Pair<K, V>> future : futures)
                 {
                     Pair<K, V> entry = future.get();
-                    put(entry.left, entry.right);
+                    if (entry != null)
+                        put(entry.left, entry.right);
                 }
             }
             catch (Exception e)
@@ -168,10 +178,12 @@
                 type = OperationType.KEY_CACHE_SAVE;
             else if (cacheType == CacheService.CacheType.ROW_CACHE)
                 type = OperationType.ROW_CACHE_SAVE;
+            else if (cacheType == CacheService.CacheType.COUNTER_CACHE)
+                type = OperationType.COUNTER_CACHE_SAVE;
             else
                 type = OperationType.UNKNOWN;
 
-            info = new CompactionInfo(new CFMetaData(Keyspace.SYSTEM_KS, cacheType.toString(), ColumnFamilyType.Standard, BytesType.instance, null),
+            info = new CompactionInfo(CFMetaData.denseCFMetaData(Keyspace.SYSTEM_KS, cacheType.toString(), BytesType.instance),
                                       type,
                                       0,
                                       keys.size(),
@@ -202,18 +214,21 @@
 
             long start = System.nanoTime();
 
-            HashMap<Pair<String, String>, SequentialWriter> writers = new HashMap<Pair<String, String>, SequentialWriter>();
+            HashMap<UUID, SequentialWriter> writers = new HashMap<>();
 
             try
             {
                 for (K key : keys)
                 {
-                    Pair<String, String> path = key.getPathInfo();
-                    SequentialWriter writer = writers.get(path);
+                    UUID cfId = key.getCFId();
+                    if (!Schema.instance.hasCF(key.getCFId()))
+                        continue; // the table has been dropped.
+
+                    SequentialWriter writer = writers.get(cfId);
                     if (writer == null)
                     {
-                        writer = tempCacheFile(path);
-                        writers.put(path, writer);
+                        writer = tempCacheFile(cfId);
+                        writers.put(cfId, writer);
                     }
 
                     try
@@ -234,13 +249,13 @@
                     FileUtils.closeQuietly(writer);
             }
 
-            for (Map.Entry<Pair<String, String>, SequentialWriter> info : writers.entrySet())
+            for (Map.Entry<UUID, SequentialWriter> entry : writers.entrySet())
             {
-                Pair<String, String> path = info.getKey();
-                SequentialWriter writer = info.getValue();
+                UUID cfId = entry.getKey();
+                SequentialWriter writer = entry.getValue();
 
                 File tmpFile = new File(writer.getPath());
-                File cacheFile = getCachePath(path.left, path.right, CURRENT_VERSION);
+                File cacheFile = getCachePath(cfId, CURRENT_VERSION);
 
                 cacheFile.delete(); // ignore error if it didn't exist
                 if (!tmpFile.renameTo(cacheFile))
@@ -250,11 +265,11 @@
             logger.info("Saved {} ({} items) in {} ms", cacheType, keys.size(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
         }
 
-        private SequentialWriter tempCacheFile(Pair<String, String> pathInfo)
+        private SequentialWriter tempCacheFile(UUID cfId)
         {
-            File path = getCachePath(pathInfo.left, pathInfo.right, CURRENT_VERSION);
+            File path = getCachePath(cfId, CURRENT_VERSION);
             File tmpFile = FileUtils.createTempFile(path.getName(), null, path.getParentFile());
-            return SequentialWriter.open(tmpFile, true);
+            return SequentialWriter.open(tmpFile);
         }
 
         private void deleteOldCacheFiles()
@@ -278,13 +293,15 @@
                 }
             }
             else
+            {
                 logger.warn("Could not list files in {}", savedCachesDir);
+            }
         }
     }
 
     public interface CacheSerializer<K extends CacheKey, V>
     {
-        void serialize(K key, DataOutput out) throws IOException;
+        void serialize(K key, DataOutputPlus out) throws IOException;
 
         Future<Pair<K, V>> deserialize(DataInputStream in, ColumnFamilyStore cfs) throws IOException;
     }
diff --git a/src/java/org/apache/cassandra/cache/CacheKey.java b/src/java/org/apache/cassandra/cache/CacheKey.java
index aa9f5f6..44fead0 100644
--- a/src/java/org/apache/cassandra/cache/CacheKey.java
+++ b/src/java/org/apache/cassandra/cache/CacheKey.java
@@ -17,12 +17,12 @@
  */
 package org.apache.cassandra.cache;
 
-import org.apache.cassandra.utils.Pair;
+import java.util.UUID;
 
 public interface CacheKey extends IMeasurableMemory
 {
     /**
-     * @return The keyspace and ColumnFamily names to which this key belongs
+     * @return The id of the ColumnFamily this cache key belongs to.
      */
-    public Pair<String, String> getPathInfo();
+    public UUID getCFId();
 }
diff --git a/src/java/org/apache/cassandra/cache/CachingOptions.java b/src/java/org/apache/cassandra/cache/CachingOptions.java
new file mode 100644
index 0000000..6eeaa37
--- /dev/null
+++ b/src/java/org/apache/cassandra/cache/CachingOptions.java
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cache;
+
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+import static org.apache.cassandra.utils.FBUtilities.fromJsonMap;
+
+/*
+CQL: { 'keys' : 'ALL|NONE', 'rows_per_partition': '200|NONE|ALL' }
+ */
+public class CachingOptions
+{
+    public static final CachingOptions KEYS_ONLY = new CachingOptions(new KeyCache(KeyCache.Type.ALL), new RowCache(RowCache.Type.NONE));
+    public static final CachingOptions ALL = new CachingOptions(new KeyCache(KeyCache.Type.ALL), new RowCache(RowCache.Type.ALL));
+    public static final CachingOptions ROWS_ONLY = new CachingOptions(new KeyCache(KeyCache.Type.NONE), new RowCache(RowCache.Type.ALL));
+    public static final CachingOptions NONE = new CachingOptions(new KeyCache(KeyCache.Type.NONE), new RowCache(RowCache.Type.NONE));
+
+    public final KeyCache keyCache;
+    public final RowCache rowCache;
+    private static final Set<String> legacyOptions = new HashSet<>(Arrays.asList("ALL", "NONE", "KEYS_ONLY", "ROWS_ONLY"));
+
+    public CachingOptions(KeyCache kc, RowCache rc)
+    {
+        this.keyCache = kc;
+        this.rowCache = rc;
+    }
+
+    public static CachingOptions fromString(String cache) throws ConfigurationException
+    {
+        if (legacyOptions.contains(cache.toUpperCase()))
+            return fromLegacyOption(cache.toUpperCase());
+        return fromMap(fromJsonMap(cache));
+    }
+
+    public static CachingOptions fromMap(Map<String, String> cacheConfig) throws ConfigurationException
+    {
+        validateCacheConfig(cacheConfig);
+        if (!cacheConfig.containsKey("keys") && !cacheConfig.containsKey("rows_per_partition"))
+            return CachingOptions.NONE;
+        if (!cacheConfig.containsKey("keys"))
+            return new CachingOptions(new KeyCache(KeyCache.Type.NONE), RowCache.fromString(cacheConfig.get("rows_per_partition")));
+        if (!cacheConfig.containsKey("rows_per_partition"))
+            return CachingOptions.KEYS_ONLY;
+
+        return new CachingOptions(KeyCache.fromString(cacheConfig.get("keys")), RowCache.fromString(cacheConfig.get("rows_per_partition")));
+    }
+
+    private static void validateCacheConfig(Map<String, String> cacheConfig) throws ConfigurationException
+    {
+        for (Map.Entry<String, String> entry : cacheConfig.entrySet())
+        {
+            String value = entry.getValue().toUpperCase();
+            if (entry.getKey().equals("keys"))
+            {
+                if (!(value.equals("ALL") || value.equals("NONE")))
+                {
+                    throw new ConfigurationException("'keys' can only have values 'ALL' or 'NONE'");
+                }
+            }
+            else if (entry.getKey().equals("rows_per_partition"))
+            {
+                if (!(value.equals("ALL") || value.equals("NONE") || StringUtils.isNumeric(value)))
+                {
+                    throw new ConfigurationException("'rows_per_partition' can only have values 'ALL', 'NONE' or be numeric.");
+                }
+            }
+            else
+                throw new ConfigurationException("Only supported CachingOptions parameters are 'keys' and 'rows_per_partition'");
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("{\"keys\":\"%s\", \"rows_per_partition\":\"%s\"}", keyCache.toString(), rowCache.toString());
+    }
+
+    private static CachingOptions fromLegacyOption(String cache)
+    {
+        if (cache.equals("ALL"))
+            return ALL;
+        if (cache.equals("KEYS_ONLY"))
+            return KEYS_ONLY;
+        if (cache.equals("ROWS_ONLY"))
+            return ROWS_ONLY;
+        return NONE;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        CachingOptions o2 = (CachingOptions) o;
+
+        if (!keyCache.equals(o2.keyCache)) return false;
+        if (!rowCache.equals(o2.rowCache)) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int result = keyCache.hashCode();
+        result = 31 * result + rowCache.hashCode();
+        return result;
+    }
+
+    public static boolean isLegacy(String CachingOptions)
+    {
+        return legacyOptions.contains(CachingOptions.toUpperCase());
+    }
+
+    public static CachingOptions fromThrift(String caching, String cellsPerRow) throws ConfigurationException
+    {
+
+        RowCache rc = new RowCache(RowCache.Type.NONE);
+        KeyCache kc = new KeyCache(KeyCache.Type.ALL);
+        // a caching string coming from thrift is always a legacy value ("ALL", "KEYS_ONLY", etc.); fromString handles those
+        if (caching != null)
+        {
+            CachingOptions givenOptions = CachingOptions.fromString(caching);
+            rc = givenOptions.rowCache;
+            kc = givenOptions.keyCache;
+        }
+        // if we get cells_per_row from thrift, it is either "ALL" or "<number of cells to cache>".
+        if (cellsPerRow != null && rc.isEnabled())
+            rc = RowCache.fromString(cellsPerRow);
+        return new CachingOptions(kc, rc);
+    }
+
+    public String toThriftCaching()
+    {
+        if (rowCache.isEnabled() && keyCache.isEnabled())
+            return "ALL";
+        if (rowCache.isEnabled())
+            return "ROWS_ONLY";
+        if (keyCache.isEnabled())
+            return "KEYS_ONLY";
+        return "NONE";
+    }
+
+    public String toThriftCellsPerRow()
+    {
+        if (rowCache.cacheFullPartitions())
+            return "ALL";
+        return String.valueOf(rowCache.rowsToCache);
+    }
+
+
+    public static class KeyCache
+    {
+        public final Type type;
+        public KeyCache(Type type)
+        {
+            this.type = type;
+        }
+
+        public enum Type
+        {
+            ALL, NONE
+        }
+        public static KeyCache fromString(String keyCache)
+        {
+            return new KeyCache(Type.valueOf(keyCache.toUpperCase()));
+        }
+
+        public boolean isEnabled()
+        {
+            return type.equals(Type.ALL);
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            KeyCache keyCache = (KeyCache) o;
+
+            if (type != keyCache.type) return false;
+
+            return true;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return type.hashCode();
+        }
+        @Override
+        public String toString()
+        {
+            return type.toString();
+        }
+    }
+
+    public static class RowCache
+    {
+        public final Type type;
+        public final int rowsToCache;
+
+        public RowCache(Type type)
+        {
+            this(type, type.equals(Type.ALL) ? Integer.MAX_VALUE : 0);
+        }
+        public RowCache(Type type, int rowsToCache)
+        {
+            this.type = type;
+            this.rowsToCache = rowsToCache;
+        }
+
+        public enum Type
+        {
+            ALL, NONE, HEAD
+        }
+
+        public static RowCache fromString(String rowCache)
+        {
+            if (rowCache == null || rowCache.equalsIgnoreCase("none"))
+                return new RowCache(Type.NONE, 0);
+            else if (rowCache.equalsIgnoreCase("all"))
+                return new RowCache(Type.ALL, Integer.MAX_VALUE);
+            return new RowCache(Type.HEAD, Integer.parseInt(rowCache));
+        }
+        public boolean isEnabled()
+        {
+            return type.equals(Type.ALL) || type.equals(Type.HEAD);
+        }
+        public boolean cacheFullPartitions()
+        {
+            return type.equals(Type.ALL);
+        }
+        @Override
+        public String toString()
+        {
+            if (type.equals(Type.ALL)) return "ALL";
+            if (type.equals(Type.NONE)) return "NONE";
+            return String.valueOf(rowsToCache);
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            RowCache rowCache = (RowCache) o;
+
+            if (rowsToCache != rowCache.rowsToCache) return false;
+            if (type != rowCache.type) return false;
+
+            return true;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            int result = type.hashCode();
+            result = 31 * result + rowsToCache;
+            return result;
+        }
+    }
+}
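A usage sketch for the parser above; it accepts both the legacy thrift-era strings and the CQL-style map (shown here in the JSON form that toString() emits). Class name and values are illustrative.

import org.apache.cassandra.cache.CachingOptions;
import org.apache.cassandra.exceptions.ConfigurationException;

public class CachingOptionsExample
{
    public static void main(String[] args) throws ConfigurationException
    {
        // legacy value, e.g. from an old schema or a thrift client
        CachingOptions legacy = CachingOptions.fromString("KEYS_ONLY");
        assert legacy.keyCache.isEnabled() && !legacy.rowCache.isEnabled();

        // CQL-style map, as in: WITH caching = { 'keys' : 'ALL', 'rows_per_partition' : '200' }
        CachingOptions cql = CachingOptions.fromString("{\"keys\":\"ALL\", \"rows_per_partition\":\"200\"}");
        assert cql.rowCache.isEnabled() && cql.rowCache.rowsToCache == 200;
    }
}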
diff --git a/src/java/org/apache/cassandra/cache/ConcurrentLinkedHashCache.java b/src/java/org/apache/cassandra/cache/ConcurrentLinkedHashCache.java
index f1e0466..8182447 100644
--- a/src/java/org/apache/cassandra/cache/ConcurrentLinkedHashCache.java
+++ b/src/java/org/apache/cassandra/cache/ConcurrentLinkedHashCache.java
@@ -54,7 +54,7 @@
         {
             public int weightOf(K key, V value)
             {
-                long size = key.memorySize() + value.memorySize();
+                long size = key.unsharedHeapSize() + value.unsharedHeapSize();
                 assert size <= Integer.MAX_VALUE : "Serialized size cannot be more than 2GB/Integer.MAX_VALUE";
                 return (int) size;
             }
diff --git a/src/java/org/apache/cassandra/cache/CounterCacheKey.java b/src/java/org/apache/cassandra/cache/CounterCacheKey.java
new file mode 100644
index 0000000..60247c5
--- /dev/null
+++ b/src/java/org/apache/cassandra/cache/CounterCacheKey.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cache;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.UUID;
+
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.utils.*;
+
+public class CounterCacheKey implements CacheKey
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new CounterCacheKey(null, ByteBufferUtil.EMPTY_BYTE_BUFFER, CellNames.simpleDense(ByteBuffer.allocate(1))))
+                                           + ObjectSizes.measure(new UUID(0, 0));
+
+    public final UUID cfId;
+    public final byte[] partitionKey;
+    public final byte[] cellName;
+
+    private CounterCacheKey(UUID cfId, ByteBuffer partitionKey, CellName cellName)
+    {
+        this.cfId = cfId;
+        this.partitionKey = ByteBufferUtil.getArray(partitionKey);
+        this.cellName = ByteBufferUtil.getArray(cellName.toByteBuffer());
+    }
+
+    public static CounterCacheKey create(UUID cfId, ByteBuffer partitionKey, CellName cellName)
+    {
+        return new CounterCacheKey(cfId, partitionKey, cellName);
+    }
+
+    public UUID getCFId()
+    {
+        return cfId;
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE
+               + ObjectSizes.sizeOfArray(partitionKey)
+               + ObjectSizes.sizeOfArray(cellName);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("CounterCacheKey(%s, %s, %s)",
+                             cfId,
+                             ByteBufferUtil.bytesToHex(ByteBuffer.wrap(partitionKey)),
+                             ByteBufferUtil.bytesToHex(ByteBuffer.wrap(cellName)));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Arrays.deepHashCode(new Object[]{cfId, partitionKey, cellName});
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof CounterCacheKey))
+            return false;
+
+        CounterCacheKey cck = (CounterCacheKey) o;
+
+        return cfId.equals(cck.cfId)
+            && Arrays.equals(partitionKey, cck.partitionKey)
+            && Arrays.equals(cellName, cck.cellName);
+    }
+}
diff --git a/src/java/org/apache/cassandra/cache/IMeasurableMemory.java b/src/java/org/apache/cassandra/cache/IMeasurableMemory.java
index 16ca7c2..149bff6 100644
--- a/src/java/org/apache/cassandra/cache/IMeasurableMemory.java
+++ b/src/java/org/apache/cassandra/cache/IMeasurableMemory.java
@@ -23,5 +23,10 @@
 
 public interface IMeasurableMemory
 {
-    public long memorySize();
+    /**
+     * @return the amount of on-heap memory retained by the object, i.e. memory that could be reclaimed if the
+     * object itself were reclaimed. Implementations should try to exclude globally cached data where possible, and
+     * avoid counting portions of arrays that the object references but that are used only by other objects (e.g. slabbed byte-buffers).
+     */
+    public long unsharedHeapSize();
 }
diff --git a/src/java/org/apache/cassandra/cache/KeyCacheKey.java b/src/java/org/apache/cassandra/cache/KeyCacheKey.java
index 3b2077c..cef37ce 100644
--- a/src/java/org/apache/cassandra/cache/KeyCacheKey.java
+++ b/src/java/org/apache/cassandra/cache/KeyCacheKey.java
@@ -19,30 +19,34 @@
 
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.UUID;
 
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.Pair;
 
 public class KeyCacheKey implements CacheKey
 {
+    public final UUID cfId;
     public final Descriptor desc;
 
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new KeyCacheKey(null, null, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+
     // keeping an array instead of a ByteBuffer lowers the overhead of the key cache working set,
     // without extra copies on lookup since client-provided key ByteBuffers will be array-backed already
     public final byte[] key;
 
-    public KeyCacheKey(Descriptor desc, ByteBuffer key)
+    public KeyCacheKey(UUID cfId, Descriptor desc, ByteBuffer key)
     {
+        this.cfId = cfId;
         this.desc = desc;
         this.key = ByteBufferUtil.getArray(key);
         assert this.key != null;
     }
 
-    public Pair<String, String> getPathInfo()
+    public UUID getCFId()
     {
-        return Pair.create(desc.ksname, desc.cfname);
+        return cfId;
     }
 
     public String toString()
@@ -50,13 +54,9 @@
         return String.format("KeyCacheKey(%s, %s)", desc, ByteBufferUtil.bytesToHex(ByteBuffer.wrap(key)));
     }
 
-    public long memorySize()
+    public long unsharedHeapSize()
     {
-        return ObjectSizes.getFieldSize(// desc
-                                        ObjectSizes.getReferenceSize() +
-                                        // key
-                                        ObjectSizes.getReferenceSize())
-               + ObjectSizes.getArraySize(key);
+        return EMPTY_SIZE + ObjectSizes.sizeOfArray(key);
     }
 
     @Override
@@ -67,15 +67,15 @@
 
         KeyCacheKey that = (KeyCacheKey) o;
 
-        if (desc != null ? !desc.equals(that.desc) : that.desc != null) return false;
-        return Arrays.equals(key, that.key);
+        return cfId.equals(that.cfId) && desc.equals(that.desc) && Arrays.equals(key, that.key);
     }
 
     @Override
     public int hashCode()
     {
-        int result = desc != null ? desc.hashCode() : 0;
-        result = 31 * result + (key != null ? Arrays.hashCode(key) : 0);
+        int result = cfId.hashCode();
+        result = 31 * result + desc.hashCode();
+        result = 31 * result + Arrays.hashCode(key);
         return result;
     }
 }
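An illustrative (non-Cassandra) class showing the EMPTY_SIZE pattern that KeyCacheKey, RowCacheKey and CounterCacheKey adopt above: the fixed shallow footprint is measured once on a prototype built with empty buffers, and unsharedHeapSize() then only adds the per-instance variable parts.

import java.nio.ByteBuffer;

import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.ObjectSizes;

final class ExampleCacheKey
{
    // measured once; covers the object header, references and other fixed fields
    private static final long EMPTY_SIZE = ObjectSizes.measure(new ExampleCacheKey(ByteBufferUtil.EMPTY_BYTE_BUFFER));

    private final byte[] key;

    ExampleCacheKey(ByteBuffer key)
    {
        this.key = ByteBufferUtil.getArray(key);
    }

    long unsharedHeapSize()
    {
        // fixed footprint plus the only variable-length part, the on-heap key bytes
        return EMPTY_SIZE + ObjectSizes.sizeOfArray(key);
    }
}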
diff --git a/src/java/org/apache/cassandra/cache/RefCountedMemory.java b/src/java/org/apache/cassandra/cache/RefCountedMemory.java
index 43c87b9..87dd6c4 100644
--- a/src/java/org/apache/cassandra/cache/RefCountedMemory.java
+++ b/src/java/org/apache/cassandra/cache/RefCountedMemory.java
@@ -17,13 +17,14 @@
  */
 package org.apache.cassandra.cache;
 
-import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
 import org.apache.cassandra.io.util.Memory;
 
 public class RefCountedMemory extends Memory implements AutoCloseable
 {
-    private final AtomicInteger references = new AtomicInteger(1);
+    private volatile int references = 1;
+    private static final AtomicIntegerFieldUpdater<RefCountedMemory> UPDATER = AtomicIntegerFieldUpdater.newUpdater(RefCountedMemory.class, "references");
 
     public RefCountedMemory(long size)
     {
@@ -38,10 +39,10 @@
     {
         while (true)
         {
-            int n = references.get();
+            int n = UPDATER.get(this);
             if (n <= 0)
                 return false;
-            if (references.compareAndSet(n, n + 1))
+            if (UPDATER.compareAndSet(this, n, n + 1))
                 return true;
         }
     }
@@ -49,8 +50,20 @@
     /** decrement reference count.  if count reaches zero, the object is freed. */
     public void unreference()
     {
-        if (references.decrementAndGet() == 0)
-            free();
+        if (UPDATER.decrementAndGet(this) == 0)
+            super.free();
+    }
+
+    public RefCountedMemory copy(long newSize)
+    {
+        RefCountedMemory copy = new RefCountedMemory(newSize);
+        copy.put(0, this, 0, Math.min(size(), newSize));
+        return copy;
+    }
+
+    public void free()
+    {
+        throw new AssertionError();
     }
 
     public void close()
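A standalone sketch of the field-updater idiom RefCountedMemory switches to above: a volatile int plus one shared AtomicIntegerFieldUpdater keeps CAS semantics for reference/unreference while avoiding a per-instance AtomicInteger allocation. Names here are illustrative.

import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;

class RefCountedSketch
{
    private volatile int refs = 1;
    private static final AtomicIntegerFieldUpdater<RefCountedSketch> REFS =
        AtomicIntegerFieldUpdater.newUpdater(RefCountedSketch.class, "refs");

    boolean reference()
    {
        while (true)
        {
            int n = REFS.get(this);
            if (n <= 0)
                return false; // already released; the caller must not use the resource
            if (REFS.compareAndSet(this, n, n + 1))
                return true;
        }
    }

    void unreference()
    {
        if (REFS.decrementAndGet(this) == 0)
            release();
    }

    void release() { /* free the underlying resource exactly once */ }
}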
diff --git a/src/java/org/apache/cassandra/cache/RowCacheKey.java b/src/java/org/apache/cassandra/cache/RowCacheKey.java
index 33e2065..af2d4d4 100644
--- a/src/java/org/apache/cassandra/cache/RowCacheKey.java
+++ b/src/java/org/apache/cassandra/cache/RowCacheKey.java
@@ -21,21 +21,21 @@
 import java.util.Arrays;
 import java.util.UUID;
 
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.Pair;
 
 public class RowCacheKey implements CacheKey, Comparable<RowCacheKey>
 {
     public final UUID cfId;
     public final byte[] key;
 
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new RowCacheKey(null, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+
     public RowCacheKey(UUID cfId, DecoratedKey key)
     {
-        this(cfId, key.key);
+        this(cfId, key.getKey());
     }
 
     public RowCacheKey(UUID cfId, ByteBuffer key)
@@ -45,18 +45,14 @@
         assert this.key != null;
     }
 
-    public Pair<String, String> getPathInfo()
+    public UUID getCFId()
     {
-        return Schema.instance.getCF(cfId);
+        return cfId;
     }
 
-    public long memorySize()
+    public long unsharedHeapSize()
     {
-        return ObjectSizes.getFieldSize(// cfId
-                                        ObjectSizes.getReferenceSize() +
-                                        // key
-                                        ObjectSizes.getReferenceSize())
-               + ObjectSizes.getArraySize(key);
+        return EMPTY_SIZE + ObjectSizes.sizeOfArray(key);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/cache/RowCacheSentinel.java b/src/java/org/apache/cassandra/cache/RowCacheSentinel.java
index 9a014dc..83c49d4 100644
--- a/src/java/org/apache/cassandra/cache/RowCacheSentinel.java
+++ b/src/java/org/apache/cassandra/cache/RowCacheSentinel.java
@@ -19,9 +19,6 @@
 
 import java.util.concurrent.atomic.AtomicLong;
 
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.utils.ObjectSizes;
-
 import com.google.common.base.Objects;
 
 /**
@@ -57,10 +54,4 @@
     {
         return Objects.hashCode(sentinelId);
     }
-
-    public long memorySize()
-    {
-        // Only long reference.
-        return ObjectSizes.getFieldSize(TypeSizes.NATIVE.sizeof(sentinelId));
-    }
 }
diff --git a/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java b/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java
index 84c948e..a058872 100644
--- a/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java
+++ b/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java
@@ -18,12 +18,12 @@
 package org.apache.cassandra.cache;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 
 public class SerializingCacheProvider
@@ -36,7 +36,7 @@
     // Package protected for tests
     static class RowCacheSerializer implements ISerializer<IRowCacheEntry>
     {
-        public void serialize(IRowCacheEntry entry, DataOutput out) throws IOException
+        public void serialize(IRowCacheEntry entry, DataOutputPlus out) throws IOException
         {
             assert entry != null; // unlike CFS we don't support nulls, since there is no need for that in the cache
             boolean isSentinel = entry instanceof RowCacheSentinel;
diff --git a/src/java/org/apache/cassandra/cli/Cli.g b/src/java/org/apache/cassandra/cli/Cli.g
index 6dee785..07cdc1f 100644
--- a/src/java/org/apache/cassandra/cli/Cli.g
+++ b/src/java/org/apache/cassandra/cli/Cli.g
@@ -98,7 +98,7 @@
 {
     public void reportError(RecognitionException e) 
     {
-        StringBuilder errorMessage = new StringBuilder("Syntax error at position " + e.charPositionInLine + ": ");
+        StringBuilder errorMessage = new StringBuilder("Syntax error at position ").append(e.charPositionInLine).append(": ");
 
         if (e instanceof NoViableAltException)
         {
@@ -106,7 +106,7 @@
             String error = this.input.substring(index, index);
             String statement = this.input.substring(0, this.input.size() - 1);
 
-            errorMessage.append("unexpected \"" + error + "\" for `" + statement + "`.");
+            errorMessage.append("unexpected \"").append(error).append("\" for `").append(statement).append("`.");
         }
         else
         {
diff --git a/src/java/org/apache/cassandra/cli/CliClient.java b/src/java/org/apache/cassandra/cli/CliClient.java
index 6d8aa2e..c6b3bcd 100644
--- a/src/java/org/apache/cassandra/cli/CliClient.java
+++ b/src/java/org/apache/cassandra/cli/CliClient.java
@@ -140,6 +140,7 @@
         INDEX_INTERVAL,
         MEMTABLE_FLUSH_PERIOD_IN_MS,
         CACHING,
+        CELLS_PER_ROW_TO_CACHE,
         DEFAULT_TIME_TO_LIVE,
         SPECULATIVE_RETRY,
         POPULATE_IO_CACHE_ON_FLUSH,
@@ -628,9 +629,8 @@
             }
             catch (RequestValidationException ce)
             {
-                StringBuilder errorMessage = new StringBuilder("Unknown comparator '" + compareWith + "'. ");
-                errorMessage.append("Available functions: ");
-                throw new RuntimeException(errorMessage.append(Function.getFunctionNames()).toString(), e);
+                String message = String.format("Unknown comparator '%s'. Available functions: %s", compareWith, Function.getFunctionNames());
+                throw new RuntimeException(message, e);
             }
         }
 
@@ -1349,6 +1349,9 @@
             case CACHING:
                 cfDef.setCaching(CliUtils.unescapeSQLString(mValue));
                 break;
+            case CELLS_PER_ROW_TO_CACHE:
+                cfDef.setCells_per_row_to_cache(CliUtils.unescapeSQLString(mValue));
+                break;
             case DEFAULT_TIME_TO_LIVE:
                 cfDef.setDefault_time_to_live(Integer.parseInt(mValue));
                 break;
@@ -1768,7 +1771,7 @@
             String prefix = "";
             for (Map.Entry<String, String> opt : ksDef.strategy_options.entrySet())
             {
-                opts.append(prefix + CliUtils.escapeSQLString(opt.getKey()) + " : " + CliUtils.escapeSQLString(opt.getValue()));
+                opts.append(prefix).append(CliUtils.escapeSQLString(opt.getKey())).append(" : ").append(CliUtils.escapeSQLString(opt.getValue()));
                 prefix = ", ";
             }
             opts.append("}");
@@ -1780,7 +1783,7 @@
         output.append(";").append(NEWLINE);
         output.append(NEWLINE);
 
-        output.append("use " + CliUtils.maybeEscapeName(ksDef.name) + ";");
+        output.append("use ").append(CliUtils.maybeEscapeName(ksDef.name)).append(";");
         output.append(NEWLINE);
         output.append(NEWLINE);
 
@@ -1813,13 +1816,12 @@
 
         writeAttr(output, false, "read_repair_chance", cfDef.read_repair_chance);
         writeAttr(output, false, "dclocal_read_repair_chance", cfDef.dclocal_read_repair_chance);
-        writeAttr(output, false, "populate_io_cache_on_flush", cfDef.populate_io_cache_on_flush);
         writeAttr(output, false, "gc_grace", cfDef.gc_grace_seconds);
         writeAttr(output, false, "min_compaction_threshold", cfDef.min_compaction_threshold);
         writeAttr(output, false, "max_compaction_threshold", cfDef.max_compaction_threshold);
-        writeAttr(output, false, "replicate_on_write", cfDef.replicate_on_write);
         writeAttr(output, false, "compaction_strategy", cfDef.compaction_strategy);
         writeAttr(output, false, "caching", cfDef.caching);
+        writeAttr(output, false, "cells_per_row_to_cache", cfDef.cells_per_row_to_cache);
         writeAttr(output, false, "default_time_to_live", cfDef.default_time_to_live);
         writeAttr(output, false, "speculative_retry", cfDef.speculative_retry);
 
@@ -1849,7 +1851,6 @@
 
             writeAttrRaw(output, false, "compaction_strategy_options", cOptions.toString());
         }
-
         if (!StringUtils.isEmpty(cfDef.comment))
             writeAttr(output, false, "comment", cfDef.comment);
 
@@ -2186,8 +2187,6 @@
         sessionState.out.printf("      Compaction min/max thresholds: %s/%s%n", cf_def.min_compaction_threshold, cf_def.max_compaction_threshold);
         sessionState.out.printf("      Read repair chance: %s%n", cf_def.read_repair_chance);
         sessionState.out.printf("      DC Local Read repair chance: %s%n", cf_def.dclocal_read_repair_chance);
-        sessionState.out.printf("      Populate IO Cache on flush: %b%n", cf_def.populate_io_cache_on_flush);
-        sessionState.out.printf("      Replicate on write: %s%n", cf_def.replicate_on_write);
         sessionState.out.printf("      Caching: %s%n", cf_def.caching);
         sessionState.out.printf("      Default time to live: %s%n", cf_def.default_time_to_live);
         sessionState.out.printf("      Bloom Filter FP chance: %s%n", cf_def.isSetBloom_filter_fp_chance() ? cf_def.bloom_filter_fp_chance : "default");
@@ -2830,9 +2829,8 @@
         }
         catch (IllegalArgumentException e)
         {
-            StringBuilder errorMessage = new StringBuilder("Function '" + functionName + "' not found. ");
-            errorMessage.append("Available functions: ");
-            throw new RuntimeException(errorMessage.append(Function.getFunctionNames()).toString(), e);
+            String message = String.format("Function '%s' not found. Available functions: %s", functionName, Function.getFunctionNames());
+            throw new RuntimeException(message, e);
         }
 
         return function.getValidator();
diff --git a/src/java/org/apache/cassandra/client/RingCache.java b/src/java/org/apache/cassandra/client/RingCache.java
index cc9b1b2..c3dbda5 100644
--- a/src/java/org/apache/cassandra/client/RingCache.java
+++ b/src/java/org/apache/cassandra/client/RingCache.java
@@ -81,25 +81,20 @@
                     try
                     {
                         rangeMap.put(r, InetAddress.getByName(host));
-                    }
-                    catch (UnknownHostException e)
+                    } catch (UnknownHostException e)
                     {
                         throw new AssertionError(e); // host strings are IPs
                     }
                 }
             }
         }
-        catch (InvalidRequestException e)
-        {
-            throw new RuntimeException(e);
-        }
-        catch (IOException e)
+        catch (InvalidRequestException | IOException e)
         {
             throw new RuntimeException(e);
         }
         catch (TException e)
         {
-            logger.debug("Error contacting seed list" + ConfigHelper.getOutputInitialAddress(conf) + " " + e.getMessage());
+            logger.debug("Error contacting seed list {} {}", ConfigHelper.getOutputInitialAddress(conf), e.getMessage());
         }
     }
 
diff --git a/src/java/org/apache/cassandra/concurrent/AbstractTracingAwareExecutorService.java b/src/java/org/apache/cassandra/concurrent/AbstractTracingAwareExecutorService.java
new file mode 100644
index 0000000..fb753b0
--- /dev/null
+++ b/src/java/org/apache/cassandra/concurrent/AbstractTracingAwareExecutorService.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.concurrent;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.tracing.TraceState;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+import static org.apache.cassandra.tracing.Tracing.isTracing;
+
+public abstract class AbstractTracingAwareExecutorService implements TracingAwareExecutorService
+{
+    private static final Logger logger = LoggerFactory.getLogger(AbstractTracingAwareExecutorService.class);
+
+    protected abstract void addTask(FutureTask<?> futureTask);
+    protected abstract void onCompletion();
+
+    /** Task Submission / Creation / Objects **/
+
+    public <T> FutureTask<T> submit(Callable<T> task)
+    {
+        return submit(newTaskFor(task));
+    }
+
+    public FutureTask<?> submit(Runnable task)
+    {
+        return submit(newTaskFor(task, null));
+    }
+
+    public <T> FutureTask<T> submit(Runnable task, T result)
+    {
+        return submit(newTaskFor(task, result));
+    }
+
+    public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks, long timeout, TimeUnit unit) throws InterruptedException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public <T> T invokeAny(Collection<? extends Callable<T>> tasks) throws InterruptedException, ExecutionException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public <T> T invokeAny(Collection<? extends Callable<T>> tasks, long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    protected <T> FutureTask<T> newTaskFor(Runnable runnable, T result)
+    {
+        return newTaskFor(runnable, result, Tracing.instance.get());
+    }
+
+    protected <T> FutureTask<T> newTaskFor(Runnable runnable, T result, TraceState traceState)
+    {
+        if (traceState != null)
+        {
+            if (runnable instanceof TraceSessionFutureTask)
+                return (TraceSessionFutureTask<T>) runnable;
+            return new TraceSessionFutureTask<T>(runnable, result, traceState);
+        }
+        if (runnable instanceof FutureTask)
+            return (FutureTask<T>) runnable;
+        return new FutureTask<>(runnable, result);
+    }
+
+    protected <T> FutureTask<T> newTaskFor(Callable<T> callable)
+    {
+        if (isTracing())
+        {
+            if (callable instanceof TraceSessionFutureTask)
+                return (TraceSessionFutureTask<T>) callable;
+            return new TraceSessionFutureTask<T>(callable, Tracing.instance.get());
+        }
+        if (callable instanceof FutureTask)
+            return (FutureTask<T>) callable;
+        return new FutureTask<>(callable);
+    }
+
+    private class TraceSessionFutureTask<T> extends FutureTask<T>
+    {
+        private final TraceState state;
+
+        public TraceSessionFutureTask(Callable<T> callable, TraceState state)
+        {
+            super(callable);
+            this.state = state;
+        }
+
+        public TraceSessionFutureTask(Runnable runnable, T result, TraceState state)
+        {
+            super(runnable, result);
+            this.state = state;
+        }
+
+        public void run()
+        {
+            TraceState oldState = Tracing.instance.get();
+            Tracing.instance.set(state);
+            try
+            {
+                super.run();
+            }
+            finally
+            {
+                Tracing.instance.set(oldState);
+            }
+        }
+    }
+
+    class FutureTask<T> extends SimpleCondition implements Future<T>, Runnable
+    {
+        private boolean failure;
+        private Object result = this;
+        private final Callable<T> callable;
+
+        public FutureTask(Callable<T> callable)
+        {
+            this.callable = callable;
+        }
+        public FutureTask(Runnable runnable, T result)
+        {
+            this(Executors.callable(runnable, result));
+        }
+
+        public void run()
+        {
+            try
+            {
+                result = callable.call();
+            }
+            catch (Throwable t)
+            {
+                JVMStabilityInspector.inspectThrowable(t);
+                logger.warn("Uncaught exception on thread {}: {}", Thread.currentThread(), t);
+                result = t;
+                failure = true;
+            }
+            finally
+            {
+                signalAll();
+                onCompletion();
+            }
+        }
+
+        public boolean cancel(boolean mayInterruptIfRunning)
+        {
+            return false;
+        }
+
+        public boolean isCancelled()
+        {
+            return false;
+        }
+
+        public boolean isDone()
+        {
+            return isSignaled();
+        }
+
+        public T get() throws InterruptedException, ExecutionException
+        {
+            await();
+            Object result = this.result;
+            if (failure)
+                throw new ExecutionException((Throwable) result);
+            return (T) result;
+        }
+
+        public T get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
+        {
+            await(timeout, unit);
+            Object result = this.result;
+            if (failure)
+                throw new ExecutionException((Throwable) result);
+            return (T) result;
+        }
+    }
+
+    private <T> FutureTask<T> submit(FutureTask<T> task)
+    {
+        addTask(task);
+        return task;
+    }
+
+    public void execute(Runnable command)
+    {
+        addTask(newTaskFor(command, null));
+    }
+
+    public void execute(Runnable command, TraceState state)
+    {
+        addTask(newTaskFor(command, null, state));
+    }
+}
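A generic sketch (hypothetical names, not a Cassandra API) of the capture-and-restore pattern TraceSessionFutureTask applies to Tracing's thread-local state: the submitting thread's state is captured when the task is created, installed on the worker thread for the duration of run(), then restored.

final class ContextPropagatingRunnable implements Runnable
{
    static final ThreadLocal<String> CONTEXT = new ThreadLocal<>();

    private final Runnable delegate;
    private final String captured = CONTEXT.get(); // captured on the submitting thread

    ContextPropagatingRunnable(Runnable delegate)
    {
        this.delegate = delegate;
    }

    public void run()
    {
        String previous = CONTEXT.get();
        CONTEXT.set(captured);
        try
        {
            delegate.run();
        }
        finally
        {
            CONTEXT.set(previous); // never leak the captured state to unrelated tasks on this worker
        }
    }
}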
diff --git a/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java b/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java
index a41df54..4fc1d6c 100644
--- a/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java
@@ -19,6 +19,8 @@
 
 import java.util.concurrent.*;
 
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
 /**
  * Like DebuggableThreadPoolExecutor, DebuggableScheduledThreadPoolExecutor always
  * logs exceptions from the tasks it is given, even if Future.get is never called elsewhere.
@@ -74,9 +76,10 @@
             {
                 runnable.run();
             }
-            catch (Throwable e)
+            catch (Throwable t)
             {
-                DebuggableThreadPoolExecutor.handleOrLog(e);
+                JVMStabilityInspector.inspectThrowable(t);
+                DebuggableThreadPoolExecutor.handleOrLog(t);
             }
         }
     }
diff --git a/src/java/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutor.java b/src/java/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutor.java
index 8e4dc7a..ea04af3 100644
--- a/src/java/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutor.java
@@ -138,6 +138,11 @@
                       : new TraceSessionWrapper<Object>(command, state));
     }
 
+    public void maybeExecuteImmediately(Runnable command)
+    {
+        execute(command);
+    }
+
     // execute does not call newTaskFor
     @Override
     public void execute(Runnable command)
diff --git a/src/java/org/apache/cassandra/concurrent/JMXEnabledSharedExecutorPool.java b/src/java/org/apache/cassandra/concurrent/JMXEnabledSharedExecutorPool.java
new file mode 100644
index 0000000..d70e524
--- /dev/null
+++ b/src/java/org/apache/cassandra/concurrent/JMXEnabledSharedExecutorPool.java
@@ -0,0 +1,114 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.cassandra.concurrent;
+
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import javax.management.MBeanServer;
+import javax.management.ObjectName;
+
+import org.apache.cassandra.metrics.SEPMetrics;
+
+public class JMXEnabledSharedExecutorPool extends SharedExecutorPool
+{
+
+    public static final JMXEnabledSharedExecutorPool SHARED = new JMXEnabledSharedExecutorPool("SharedPool");
+
+    public JMXEnabledSharedExecutorPool(String poolName)
+    {
+        super(poolName);
+    }
+
+    public interface JMXEnabledSEPExecutorMBean extends JMXEnabledThreadPoolExecutorMBean
+    {
+    }
+
+    public class JMXEnabledSEPExecutor extends SEPExecutor implements JMXEnabledSEPExecutorMBean
+    {
+
+        private final SEPMetrics metrics;
+        private final String mbeanName;
+
+        public JMXEnabledSEPExecutor(int poolSize, int maxQueuedLength, String name, String jmxPath)
+        {
+            super(JMXEnabledSharedExecutorPool.this, poolSize, maxQueuedLength);
+            metrics = new SEPMetrics(this, jmxPath, name);
+
+            MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
+            mbeanName = "org.apache.cassandra." + jmxPath + ":type=" + name;
+
+            try
+            {
+                mbs.registerMBean(this, new ObjectName(mbeanName));
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+
+        private void unregisterMBean()
+        {
+            try
+            {
+                ManagementFactory.getPlatformMBeanServer().unregisterMBean(new ObjectName(mbeanName));
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+
+            // release metrics
+            metrics.release();
+        }
+
+        @Override
+        public synchronized void shutdown()
+        {
+            // synchronized, because there is no way to access super.mainLock, which would be
+            // the preferred way to make this threadsafe
+            if (!isShutdown())
+            {
+                unregisterMBean();
+            }
+            super.shutdown();
+        }
+
+        public int getCoreThreads()
+        {
+            return 0;
+        }
+
+        public void setCoreThreads(int number)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void setMaximumThreads(int number)
+        {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    public TracingAwareExecutorService newExecutor(int maxConcurrency, int maxQueuedTasks, String name, String jmxPath)
+    {
+        JMXEnabledSEPExecutor executor = new JMXEnabledSEPExecutor(maxConcurrency, maxQueuedTasks, name, jmxPath);
+        executors.add(executor);
+        return executor;
+    }
+}
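A hedged usage sketch: a stage obtains a bounded, JMX-visible executor from the shared pool via newExecutor() instead of building its own thread pool. The concurrency, queue bound, stage name and JMX path below are illustrative values, not the ones Cassandra configures.

import org.apache.cassandra.concurrent.JMXEnabledSharedExecutorPool;
import org.apache.cassandra.concurrent.TracingAwareExecutorService;

public class SharedPoolUsageExample
{
    public static void main(String[] args)
    {
        TracingAwareExecutorService exampleStage =
            JMXEnabledSharedExecutorPool.SHARED.newExecutor(32,   // max concurrent workers
                                                            128,  // max queued tasks before producers block
                                                            "ExampleStage",
                                                            "internal");
        exampleStage.execute(new Runnable()
        {
            public void run()
            {
                // stage work goes here
            }
        });
    }
}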
diff --git a/src/java/org/apache/cassandra/concurrent/SEPExecutor.java b/src/java/org/apache/cassandra/concurrent/SEPExecutor.java
new file mode 100644
index 0000000..b6f5e97
--- /dev/null
+++ b/src/java/org/apache/cassandra/concurrent/SEPExecutor.java
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.concurrent;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+
+import static org.apache.cassandra.concurrent.SEPWorker.Work;
+
+public class SEPExecutor extends AbstractTracingAwareExecutorService
+{
+    private final SharedExecutorPool pool;
+
+    private final int maxWorkers;
+    private final int maxTasksQueued;
+
+    // stores both a set of work permits and task permits:
+    //  bottom 32 bits are number of queued tasks, in the range [0..maxTasksQueued]   (initially 0)
+    //  top 32 bits are number of work permits available in the range [0..maxWorkers]   (initially maxWorkers)
+    private final AtomicLong permits = new AtomicLong();
+
+    // producers wait on this when there is no room on the queue
+    private final WaitQueue hasRoom = new WaitQueue();
+    private final AtomicLong totalBlocked = new AtomicLong();
+    private final AtomicInteger currentlyBlocked = new AtomicInteger();
+    private final AtomicLong completedTasks = new AtomicLong();
+
+    volatile boolean shuttingDown = false;
+    final SimpleCondition shutdown = new SimpleCondition();
+
+    // TODO: see if other queue implementations might improve throughput
+    protected final ConcurrentLinkedQueue<FutureTask<?>> tasks = new ConcurrentLinkedQueue<>();
+
+    SEPExecutor(SharedExecutorPool pool, int maxWorkers, int maxTasksQueued)
+    {
+        this.pool = pool;
+        this.maxWorkers = maxWorkers;
+        this.maxTasksQueued = maxTasksQueued;
+        this.permits.set(combine(0, maxWorkers));
+    }
+
+    protected void onCompletion()
+    {
+        completedTasks.incrementAndGet();
+    }
+
+    // schedules another worker for this pool if there is work outstanding and there are no spinning threads that
+    // will self-assign to it in the immediate future
+    boolean maybeSchedule()
+    {
+        if (pool.spinningCount.get() > 0 || !takeWorkPermit(true))
+            return false;
+
+        pool.schedule(new Work(this));
+        return true;
+    }
+
+    protected void addTask(FutureTask<?> task)
+    {
+        // we add to the queue first, so that when a worker takes a task permit it can be certain there is a task available
+        // this permits us to schedule threads non-spuriously; it also means work is serviced fairly
+        tasks.add(task);
+        int taskPermits;
+        while (true)
+        {
+            long current = permits.get();
+            taskPermits = taskPermits(current);
+            // because there is no difference in practical terms between the work permit being added or not (the work is already in existence)
+            // we always add our permit, but block after the fact if we breached the queue limit
+            if (permits.compareAndSet(current, updateTaskPermits(current, taskPermits + 1)))
+                break;
+        }
+
+        if (taskPermits == 0)
+        {
+            // we only need to schedule a thread if there are no tasks already waiting to be processed, as
+            // the original enqueue will have started a thread to service its work which will have itself
+            // spawned helper workers that would have either exhausted the available tasks or are still being spawned.
+            // to avoid incurring any unnecessary signalling penalties we also do not take any work to hand to the new
+            // worker, we simply start a worker in a spinning state
+            pool.maybeStartSpinningWorker();
+        }
+        else if (taskPermits >= maxTasksQueued)
+        {
+            // register to receive a signal once a task is processed bringing the queue below its threshold
+            WaitQueue.Signal s = hasRoom.register();
+
+            // we will only be signalled once the queue drops below full, so this creates equivalent external behaviour
+            // however the advantage is that we never wake-up spuriously;
+            // we choose to always sleep, even if in the intervening time the queue has dropped below limit,
+            // so long as we _will_ eventually receive a signal
+            if (taskPermits(permits.get()) > maxTasksQueued)
+            {
+                // if we're blocking, we might as well directly schedule a worker if we aren't already at max
+                if (takeWorkPermit(true))
+                    pool.schedule(new Work(this));
+                totalBlocked.incrementAndGet();
+                currentlyBlocked.incrementAndGet();
+                s.awaitUninterruptibly();
+                currentlyBlocked.decrementAndGet();
+            }
+            else // don't propagate our signal when we cancel, just cancel
+                s.cancel();
+        }
+    }
+
+    // takes permission to perform a task, if any are available; once taken it is guaranteed
+    // that a proceeding call to tasks.poll() will return some work
+    boolean takeTaskPermit()
+    {
+        while (true)
+        {
+            long current = permits.get();
+            int taskPermits = taskPermits(current);
+            if (taskPermits == 0)
+                return false;
+            if (permits.compareAndSet(current, updateTaskPermits(current, taskPermits - 1)))
+            {
+                if (taskPermits == maxTasksQueued && hasRoom.hasWaiters())
+                    hasRoom.signalAll();
+                return true;
+            }
+        }
+    }
+
+    // takes a worker permit and (optionally) a task permit simultaneously; if one of the two is unavailable, returns false
+    boolean takeWorkPermit(boolean takeTaskPermit)
+    {
+        int taskDelta = takeTaskPermit ? 1 : 0;
+        while (true)
+        {
+            long current = permits.get();
+            int workPermits = workPermits(current);
+            int taskPermits = taskPermits(current);
+            if (workPermits == 0 || taskPermits == 0)
+                return false;
+            if (permits.compareAndSet(current, combine(taskPermits - taskDelta, workPermits - 1)))
+            {
+                if (takeTaskPermit && taskPermits == maxTasksQueued && hasRoom.hasWaiters())
+                    hasRoom.signalAll();
+                return true;
+            }
+        }
+    }
+
+    // gives up a work permit
+    void returnWorkPermit()
+    {
+        while (true)
+        {
+            long current = permits.get();
+            int workPermits = workPermits(current);
+            if (permits.compareAndSet(current, updateWorkPermits(current, workPermits + 1)))
+                return;
+        }
+    }
+
+    public void maybeExecuteImmediately(Runnable command)
+    {
+        FutureTask<?> ft = newTaskFor(command, null);
+        if (!takeWorkPermit(false))
+        {
+            addTask(ft);
+        }
+        else
+        {
+            try
+            {
+                ft.run();
+            }
+            finally
+            {
+                returnWorkPermit();
+                // we have to maintain our invariant of always scheduling after any work is performed
+                // in this case in particular we are not processing the rest of the queue anyway, and so
+                // the work permit may go wasted if we don't immediately attempt to spawn another worker
+                maybeSchedule();
+            }
+        }
+    }
+
+    public synchronized void shutdown()
+    {
+        shuttingDown = true;
+        pool.executors.remove(this);
+        if (getActiveCount() == 0)
+            shutdown.signalAll();
+    }
+
+    public synchronized List<Runnable> shutdownNow()
+    {
+        shutdown();
+        List<Runnable> aborted = new ArrayList<>();
+        while (takeTaskPermit())
+            aborted.add(tasks.poll());
+        return aborted;
+    }
+
+    public boolean isShutdown()
+    {
+        return shuttingDown;
+    }
+
+    public boolean isTerminated()
+    {
+        return shuttingDown && shutdown.isSignaled();
+    }
+
+    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException
+    {
+        shutdown.await(timeout, unit);
+        return isTerminated();
+    }
+
+    public long getPendingTasks()
+    {
+        return taskPermits(permits.get());
+    }
+
+    public long getCompletedTasks()
+    {
+        return completedTasks.get();
+    }
+
+    public int getActiveCount()
+    {
+        return maxWorkers - workPermits(permits.get());
+    }
+
+    public int getTotalBlockedTasks()
+    {
+        return (int) totalBlocked.get();
+    }
+
+    public int getMaximumThreads()
+    {
+        return maxWorkers;
+    }
+
+    public int getCurrentlyBlockedTasks()
+    {
+        return currentlyBlocked.get();
+    }
+
+    private static int taskPermits(long both)
+    {
+        return (int) both;
+    }
+
+    private static int workPermits(long both)
+    {
+        return (int) (both >>> 32);
+    }
+
+    private static long updateTaskPermits(long prev, int taskPermits)
+    {
+        return (prev & (-1L << 32)) | taskPermits;
+    }
+
+    private static long updateWorkPermits(long prev, int workPermits)
+    {
+        return (((long) workPermits) << 32) | (prev & (-1L >>> 32));
+    }
+
+    private static long combine(int taskPermits, int workPermits)
+    {
+        return (((long) workPermits) << 32) | taskPermits;
+    }
+}
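A small worked example of the permit packing described above, mirroring combine()/taskPermits()/workPermits(): task permits occupy the low 32 bits and work permits the high 32 bits, so both counters can be read and updated with a single CAS on one long. Values are arbitrary.

public class PermitPackingExample
{
    public static void main(String[] args)
    {
        long packed = (((long) 8) << 32) | 5;      // 8 work permits, 5 queued tasks

        int taskPermits = (int) packed;            // low 32 bits  -> 5
        int workPermits = (int) (packed >>> 32);   // high 32 bits -> 8

        // taking one task permit: keep the high word, replace the low word
        long afterTake = (packed & (-1L << 32)) | (taskPermits - 1);

        System.out.println(workPermits + " work permits, " + (int) afterTake + " tasks still queued");
    }
}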
diff --git a/src/java/org/apache/cassandra/concurrent/SEPWorker.java b/src/java/org/apache/cassandra/concurrent/SEPWorker.java
new file mode 100644
index 0000000..3b3e7ad
--- /dev/null
+++ b/src/java/org/apache/cassandra/concurrent/SEPWorker.java
@@ -0,0 +1,388 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.concurrent;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.locks.LockSupport;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+final class SEPWorker extends AtomicReference<SEPWorker.Work> implements Runnable
+{
+    private static final Logger logger = LoggerFactory.getLogger(SEPWorker.class);
+
+    final Long workerId;
+    final Thread thread;
+    final SharedExecutorPool pool;
+
+    // prevStopCheck stores the value of pool.stopCheck after we last incremented it; if it hasn't changed,
+    // we know nobody else was spinning in the interval, so we increment our soleSpinnerSpinTime accordingly,
+    // and otherwise we set it to zero; this is then used to terminate the final spinning thread, as the coordinated
+    // strategy can only work when there are multiple threads spinning (as more sleep time must elapse than real time)
+    long prevStopCheck = 0;
+    long soleSpinnerSpinTime = 0;
+
+    SEPWorker(Long workerId, Work initialState, SharedExecutorPool pool)
+    {
+        this.pool = pool;
+        this.workerId = workerId;
+        thread = new Thread(this, pool.poolName + "-Worker-" + workerId);
+        thread.setDaemon(true);
+        set(initialState);
+        thread.start();
+    }
+
+    public void run()
+    {
+        /**
+         * we maintain two important invariants:
+         * 1)   after exiting spinning phase, we ensure at least one more task on _each_ queue will be processed
+         *      promptly after we begin, assuming any are outstanding on any pools. this is to permit producers to
+         *      avoid signalling if there are _any_ spinning threads. we achieve this by simply calling maybeSchedule()
+         *      on each queue if on decrementing the spin counter we hit zero.
+         * 2)   before processing a task on a given queue, we attempt to assign another worker to the _same queue only_;
+         *      this allows a producer to skip signalling work if the task queue is currently non-empty, and in conjunction
+         *      with invariant (1) ensures that if any thread was spinning when a task was added to any executor, that
+         *      task will be processed immediately if work permits are available
+         */
+
+        SEPExecutor assigned = null;
+        Runnable task = null;
+        try
+        {
+            while (true)
+            {
+                if (isSpinning() && !selfAssign())
+                {
+                    doWaitSpin();
+                    continue;
+                }
+
+                // if stop was signalled, go to sleep (don't try self-assign; being put to sleep is rare, so let's obey it
+                // whenever we receive it - though we don't apply this constraint to producers, who may reschedule us before
+                // we go to sleep)
+                if (stop())
+                    while (isStopped())
+                        LockSupport.park();
+
+                // we can be assigned any state from STOPPED, so loop if we don't actually have any tasks assigned
+                assigned = get().assigned;
+                if (assigned == null)
+                    continue;
+                task = assigned.tasks.poll();
+
+                // if we do have tasks assigned, nobody will change our state so we can simply set it to WORKING
+                // (which is also a state that will never be interrupted externally)
+                set(Work.WORKING);
+                boolean shutdown;
+                while (true)
+                {
+                    // before we process any task, we maybe schedule a new worker _to our executor only_; this
+                    // ensures that even once all spinning threads have found work, if more work is left to be serviced
+                    // and permits are available, it will be dealt with immediately.
+                    assigned.maybeSchedule();
+
+                    // we know there is work waiting, as we have a work permit, so poll() will always succeed
+                    task.run();
+                    task = null;
+
+                    // if we're shutting down, or we fail to take a permit, we don't perform any more work
+                    if ((shutdown = assigned.shuttingDown) || !assigned.takeTaskPermit())
+                        break;
+                    task = assigned.tasks.poll();
+                }
+
+                // return our work permit, and maybe signal shutdown
+                assigned.returnWorkPermit();
+                if (shutdown && assigned.getActiveCount() == 0)
+                    assigned.shutdown.signalAll();
+                assigned = null;
+
+                // try to immediately reassign ourselves some work; if we fail, start spinning
+                if (!selfAssign())
+                    startSpinning();
+            }
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            while (true)
+            {
+                if (get().assigned != null)
+                {
+                    assigned = get().assigned;
+                    set(Work.WORKING);
+                }
+                if (assign(Work.STOPPED, true))
+                    break;
+            }
+            if (assigned != null)
+                assigned.returnWorkPermit();
+            if (task != null)
+                logger.error("Failed to execute task, unexpected exception killed worker: {}", t);
+            else
+                logger.error("Unexpected exception killed worker: {}", t);
+        }
+    }
+
+    // try to assign this worker the provided work
+    // valid states to assign are SPINNING, STOP_SIGNALLED, (ASSIGNED);
+    // restores invariants of the various states (e.g. spinningCount, descheduled collection and thread park status)
+    boolean assign(Work work, boolean self)
+    {
+        Work state = get();
+        while (state.canAssign(self))
+        {
+            if (!compareAndSet(state, work))
+            {
+                state = get();
+                continue;
+            }
+            // if we were spinning, exit the state (decrement the count); this is valid even if we are already spinning,
+            // as the assigning thread will have incremented the spinningCount
+            if (state.isSpinning())
+                stopSpinning();
+
+            // if we're being descheduled, place ourselves in the descheduled collection
+            if (work.isStop())
+                pool.descheduled.put(workerId, this);
+
+            // if we're currently stopped, and the new state is not a stop signal
+            // (which we can immediately convert to stopped), unpark the worker
+            if (state.isStopped() && (!work.isStop() || !stop()))
+                LockSupport.unpark(thread);
+            return true;
+        }
+        return false;
+    }
+
+    // try to assign ourselves an executor with work available
+    private boolean selfAssign()
+    {
+        // if we aren't permitted to assign in this state, fail
+        if (!get().canAssign(true))
+            return false;
+        for (SEPExecutor exec : pool.executors)
+        {
+            if (exec.takeWorkPermit(true))
+            {
+                Work work = new Work(exec);
+                // we successfully started work on this executor, so we must either assign it to ourselves or ...
+                if (assign(work, true))
+                    return true;
+                // ... if we fail, schedule it to another worker
+                pool.schedule(work);
+                // and return success as we must have already been assigned a task
+                assert get().assigned != null;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // we can only call this when our state is WORKING, and no other thread may change our state in this case;
+    // so in this case only we do not need to CAS. We increment the spinningCount and add ourselves to the spinning
+    // collection at the same time
+    private void startSpinning()
+    {
+        assert get() == Work.WORKING;
+        pool.spinningCount.incrementAndGet();
+        set(Work.SPINNING);
+    }
+
+    // exit the spinning state; if there are no remaining spinners, we immediately try and schedule work for all executors
+    // so that any producer is safe to not spin up a worker when they see a spinning thread (invariant (1) above)
+    private void stopSpinning()
+    {
+        if (pool.spinningCount.decrementAndGet() == 0)
+            for (SEPExecutor executor : pool.executors)
+                executor.maybeSchedule();
+        prevStopCheck = soleSpinnerSpinTime = 0;
+    }
+
+    // perform a sleep-spin, incrementing pool.stopCheck accordingly
+    private void doWaitSpin()
+    {
+        // pick a random sleep interval based on the number of threads spinning, so that
+        // we should always have a thread about to wake up, but most threads are sleeping
+        long sleep = 10000L * pool.spinningCount.get();
+        sleep = Math.min(1000000, sleep);
+        sleep *= Math.random();
+        sleep = Math.max(10000, sleep);
+
+        long start = System.nanoTime();
+
+        // place ourselves in the spinning collection; if we clash with another thread just exit
+        Long target = start + sleep;
+        if (pool.spinning.putIfAbsent(target, this) != null)
+            return;
+        LockSupport.parkNanos(sleep);
+
+        // remove ourselves (if we haven't been already) - we should be at or near the front, so should be cheap-ish
+        pool.spinning.remove(target, this);
+
+        // finish timing and grab spinningTime (before we finish timing so it is under rather than overestimated)
+        long end = System.nanoTime();
+        long spin = end - start;
+        long stopCheck = pool.stopCheck.addAndGet(spin);
+        maybeStop(stopCheck, end);
+        if (prevStopCheck + spin == stopCheck)
+            soleSpinnerSpinTime += spin;
+        else
+            soleSpinnerSpinTime = 0;
+        prevStopCheck = stopCheck;
+    }
+
+    private static final long stopCheckInterval = TimeUnit.MILLISECONDS.toNanos(10L);
+
+    // stops a worker if elapsed real time is less than elapsed spin time, as this implies the equivalent of
+    // at least one worker achieved nothing in the interval. we achieve this by maintaining a stopCheck which
+    // is initialised to a negative offset from realtime; as we spin we add to this value, and if we ever exceed
+    // realtime we have spun too much and deschedule; if we get too far behind realtime, we reset to our initial offset
+    private void maybeStop(long stopCheck, long now)
+    {
+        long delta = now - stopCheck;
+        if (delta <= 0)
+        {
+            // if stopCheck has caught up with present, we've been spinning too much, so if we can atomically
+            // set it to the past again, we should stop a worker
+            if (pool.stopCheck.compareAndSet(stopCheck, now - stopCheckInterval))
+            {
+                // try and stop ourselves;
+                // if we've already been assigned work stop another worker
+                if (!assign(Work.STOP_SIGNALLED, true))
+                    pool.schedule(Work.STOP_SIGNALLED);
+            }
+        }
+        else if (soleSpinnerSpinTime > stopCheckInterval && pool.spinningCount.get() == 1)
+        {
+            // permit self-stopping
+            assign(Work.STOP_SIGNALLED, true);
+        }
+        else
+        {
+            // if stop check has gotten too far behind present, update it so new spins can affect it
+            while (delta > stopCheckInterval * 2 && !pool.stopCheck.compareAndSet(stopCheck, now - stopCheckInterval))
+            {
+                stopCheck = pool.stopCheck.get();
+                delta = now - stopCheck;
+            }
+        }
+    }
+
+    private boolean isSpinning()
+    {
+        return get().isSpinning();
+    }
+
+    private boolean stop()
+    {
+        return get().isStop() && compareAndSet(Work.STOP_SIGNALLED, Work.STOPPED);
+    }
+
+    private boolean isStopped()
+    {
+        return get().isStopped();
+    }
+
+    /**
+     * Represents, and communicates changes to, a worker's work state - there are three non-actively-working
+     * states (STOP_SIGNALLED, STOPPED, and SPINNING) and two working states: WORKING, and (ASSIGNED), the last
+     * being represented by a non-static instance with its "assigned" executor set.
+     *
+     * STOPPED:         indicates the worker is descheduled, and whilst it accepts work in this state (causing it to
+     *                  be rescheduled), it will generally not be considered for work until all other worker threads are busy.
+     *                  In this state we should be present in the pool.descheduled collection, and should be parked
+     * -> (ASSIGNED)|SPINNING
+     * STOP_SIGNALLED:  the worker has been asked to deschedule itself, but has not yet done so; only entered from a SPINNING
+     *                  state, and generally communicated to itself, but may be set from any worker. this state may be preempted
+     *                  and replaced with (ASSIGNED) or SPINNING
+     *                  In this state we should be present in the pool.descheduled collection
+     * -> (ASSIGNED)|STOPPED|SPINNING
+     * SPINNING:        indicates the worker has no work to perform, so is performing a friendly wait-based-spinning
+     *                  until it either is (ASSIGNED) some work (by itself or another thread), or sent STOP_SIGNALLED
+     *                  In this state we _may_ be in the pool.spinning collection (but only if we are in the middle of a sleep)
+     * -> (ASSIGNED)|STOP_SIGNALLED|SPINNING
+     * (ASSIGNED):      asks the worker to perform some work against the specified executor, and preassigns a task permit
+     *                  from that executor so that in this state there is always work to perform.
+     *                  In general a worker assigns itself this state, but sometimes it may assign another worker the state
+     *                  either if there is work outstanding and no spinning threads, or if there is a race to self-assign
+     * -> WORKING
+     * WORKING:         indicates the worker is actively processing an executor's task queue; in this state it accepts
+     *                  no state changes/communications, except from itself; it usually exits this mode into SPINNING,
+     *                  but if work is immediately available on another executor it self-triggers (ASSIGNED)
+     * -> SPINNING|(ASSIGNED)
+     */
+
+    static final class Work
+    {
+        static final Work STOP_SIGNALLED = new Work();
+        static final Work STOPPED = new Work();
+        static final Work SPINNING = new Work();
+        static final Work WORKING = new Work();
+
+        final SEPExecutor assigned;
+
+        Work(SEPExecutor executor)
+        {
+            this.assigned = executor;
+        }
+
+        private Work()
+        {
+            this.assigned = null;
+        }
+
+        boolean canAssign(boolean self)
+        {
+            // we can assign work if there isn't new work already assigned and either
+            // 1) we are assigning to ourselves
+            // 2) the worker we are assigning to is not already in the middle of WORKING
+            return assigned == null && (self || !isWorking());
+        }
+
+        boolean isSpinning()
+        {
+            return this == Work.SPINNING;
+        }
+
+        boolean isWorking()
+        {
+            return this == Work.WORKING;
+        }
+
+        boolean isStop()
+        {
+            return this == Work.STOP_SIGNALLED;
+        }
+
+        boolean isStopped()
+        {
+            return this == Work.STOPPED;
+        }
+
+        boolean isAssigned()
+        {
+            return assigned != null;
+        }
+    }
+}
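
The state table above is built on a single idiom: SEPWorker extends AtomicReference of its own Work state, and a state change (by the worker itself or by another thread) is only ever a successful compareAndSet, which is why a WORKING worker cannot be disturbed from outside. A stripped-down sketch of that hand-off (illustration only, hypothetical names; it omits the assigned-executor payload and the spinning/descheduled bookkeeping):

```java
import java.util.concurrent.atomic.AtomicReference;

// Illustration: a worker *is* an AtomicReference of its own state, and other
// threads communicate with it purely by CAS-ing a new state in.
public class WorkerStateDemo extends AtomicReference<WorkerStateDemo.State>
{
    enum State { SPINNING, STOP_SIGNALLED, STOPPED, WORKING }

    WorkerStateDemo() { super(State.SPINNING); }

    // try to move the worker to a new state; fails if it is busy WORKING
    boolean assign(State next)
    {
        State current = get();
        while (current != State.WORKING)
        {
            if (compareAndSet(current, next))
                return true;        // we won the race and own the transition
            current = get();        // someone else changed the state; re-check
        }
        return false;               // a WORKING worker only changes its own state
    }

    public static void main(String[] args)
    {
        WorkerStateDemo worker = new WorkerStateDemo();
        System.out.println(worker.assign(State.STOP_SIGNALLED)); // true: spinning workers accept signals
        worker.set(State.WORKING);
        System.out.println(worker.assign(State.STOPPED));        // false: working workers are left alone
    }
}
```

In the real assign(), canAssign() additionally lets a worker hand new work to itself, and the (ASSIGNED) state is a Work instance carrying the target executor rather than a plain constant.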
diff --git a/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java
new file mode 100644
index 0000000..e03ec57
--- /dev/null
+++ b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.concurrent;
+
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.cassandra.concurrent.SEPWorker.Work;
+
+/**
+ * A pool of worker threads that are shared between all Executors created with it. Each executor is treated as a distinct
+ * unit, with its own concurrency and task queue limits, but the threads that service the tasks on each executor are
+ * free to hop between executors at will.
+ *
+ * To keep producers from incurring unnecessary delays, once an executor is "spun up" (i.e. is processing tasks at a steady
+ * rate), adding tasks to the executor often involves only placing the task on the work queue and updating the
+ * task permits (which imposes our max queue length constraints). Only when it cannot be guaranteed the task will be serviced
+ * promptly does the producer have to signal a thread itself to perform the work.
+ *
+ * We do this by scheduling a spinning worker only when no threads are already spinning (see maybeStartSpinningWorker()).
+ *
+ * The worker threads schedule themselves as far as possible: when they are assigned a task, they will attempt to spawn
+ * a partner worker to service any other work outstanding on the queue (if any); once they have finished the task they
+ * will either take another (if any remaining) and repeat this, or they will attempt to assign themselves to another executor
+ * that does have tasks remaining. If both fail, the worker will enter a non-busy-spinning phase, where it will sleep for a short
+ * random interval (based upon the number of threads in this mode, so that the total amount of non-sleeping time remains
+ * approximately fixed regardless of the number of spinning threads), and upon waking up will again try to assign itself
+ * an executor with outstanding tasks to perform.
+ */
+public class SharedExecutorPool
+{
+
+    // the name assigned to workers in the pool, and the id suffix
+    final String poolName;
+    final AtomicLong workerId = new AtomicLong();
+
+    // the collection of executors serviced by this pool; periodically ordered by traffic volume
+    final List<SEPExecutor> executors = new CopyOnWriteArrayList<>();
+
+    // the number of workers currently in a spinning state
+    final AtomicInteger spinningCount = new AtomicInteger();
+    // see SEPWorker.maybeStop() - used to self coordinate stopping of threads
+    final AtomicLong stopCheck = new AtomicLong();
+    // the collection of threads that are (most likely) in a spinning state - new workers are scheduled from here first
+    // TODO: consider using a queue partially-ordered by scheduled wake-up time
+    // (a full-fledged correctly ordered SkipList is overkill)
+    final ConcurrentSkipListMap<Long, SEPWorker> spinning = new ConcurrentSkipListMap<>();
+    // the collection of threads that have been asked to stop/deschedule - new workers are scheduled from here last
+    final ConcurrentSkipListMap<Long, SEPWorker> descheduled = new ConcurrentSkipListMap<>();
+
+    public SharedExecutorPool(String poolName)
+    {
+        this.poolName = poolName;
+    }
+
+    void schedule(Work work)
+    {
+        // we try to hand off our work to the spinning queue before the descheduled queue, even though we expect it to be empty;
+        // all we're doing here is hoping to find a worker without work to do, but it doesn't matter too much what we find;
+        // we atomically set the task so even if this were a collection of all workers it would be safe, and if they are both
+        // empty we schedule a new thread
+        Map.Entry<Long, SEPWorker> e;
+        while (null != (e = spinning.pollFirstEntry()) || null != (e = descheduled.pollFirstEntry()))
+            if (e.getValue().assign(work, false))
+                return;
+
+        if (!work.isStop())
+            new SEPWorker(workerId.incrementAndGet(), work, this);
+    }
+
+    void maybeStartSpinningWorker()
+    {
+        // in general the workers manage spinningCount directly; however if it is zero, we increment it atomically
+        // ourselves to avoid starting a worker unless we have to
+        int current = spinningCount.get();
+        if (current == 0 && spinningCount.compareAndSet(0, 1))
+            schedule(Work.SPINNING);
+    }
+}
\ No newline at end of file
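
The low-signal behaviour described in the class comment rests on the guard in maybeStartSpinningWorker(): a producer pays to schedule a spinner only when it observes spinningCount at zero, and winning the 0 -> 1 CAS makes exactly one producer responsible for doing so. A standalone sketch of that guard (illustration only, hypothetical class name):

```java
import java.util.concurrent.atomic.AtomicInteger;

// Illustration: start a spinner only on the 0 -> 1 transition of the spinning count.
public class SpinGuardDemo
{
    private static final AtomicInteger spinningCount = new AtomicInteger();

    // returns true when this caller is the one responsible for scheduling a worker
    static boolean maybeStartSpinningWorker()
    {
        int current = spinningCount.get();
        return current == 0 && spinningCount.compareAndSet(0, 1);
    }

    public static void main(String[] args)
    {
        System.out.println(maybeStartSpinningWorker()); // true  - count was 0, we claimed the 0 -> 1 transition
        System.out.println(maybeStartSpinningWorker()); // false - a spinner is already accounted for
    }
}
```

Workers themselves adjust spinningCount directly in startSpinning()/stopSpinning(), so under steady load the count stays non-zero and producers skip this path entirely.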
diff --git a/src/java/org/apache/cassandra/concurrent/Stage.java b/src/java/org/apache/cassandra/concurrent/Stage.java
index f2907e2..6192cab 100644
--- a/src/java/org/apache/cassandra/concurrent/Stage.java
+++ b/src/java/org/apache/cassandra/concurrent/Stage.java
@@ -21,6 +21,7 @@
 {
     READ,
     MUTATION,
+    COUNTER_MUTATION,
     GOSSIP,
     REQUEST_RESPONSE,
     ANTI_ENTROPY,
@@ -28,8 +29,7 @@
     MISC,
     TRACING,
     INTERNAL_RESPONSE,
-    READ_REPAIR,
-    REPLICATE_ON_WRITE;
+    READ_REPAIR;
 
     public String getJmxType()
     {
@@ -43,9 +43,9 @@
             case INTERNAL_RESPONSE:
                 return "internal";
             case MUTATION:
+            case COUNTER_MUTATION:
             case READ:
             case REQUEST_RESPONSE:
-            case REPLICATE_ON_WRITE:
             case READ_REPAIR:
                 return "request";
             default:
diff --git a/src/java/org/apache/cassandra/concurrent/StageManager.java b/src/java/org/apache/cassandra/concurrent/StageManager.java
index 2960f22..303f658 100644
--- a/src/java/org/apache/cassandra/concurrent/StageManager.java
+++ b/src/java/org/apache/cassandra/concurrent/StageManager.java
@@ -43,15 +43,13 @@
 
     public static final long KEEPALIVE = 60; // seconds to keep "extra" threads alive for when idle
 
-    public static final int MAX_REPLICATE_ON_WRITE_TASKS = 1024 * FBUtilities.getAvailableProcessors();
-
     static
     {
-        stages.put(Stage.MUTATION, multiThreadedConfigurableStage(Stage.MUTATION, getConcurrentWriters()));
-        stages.put(Stage.READ, multiThreadedConfigurableStage(Stage.READ, getConcurrentReaders()));
-        stages.put(Stage.REQUEST_RESPONSE, multiThreadedStage(Stage.REQUEST_RESPONSE, FBUtilities.getAvailableProcessors()));
+        stages.put(Stage.MUTATION, multiThreadedLowSignalStage(Stage.MUTATION, getConcurrentWriters()));
+        stages.put(Stage.COUNTER_MUTATION, multiThreadedLowSignalStage(Stage.COUNTER_MUTATION, getConcurrentCounterWriters()));
+        stages.put(Stage.READ, multiThreadedLowSignalStage(Stage.READ, getConcurrentReaders()));
+        stages.put(Stage.REQUEST_RESPONSE, multiThreadedLowSignalStage(Stage.REQUEST_RESPONSE, FBUtilities.getAvailableProcessors()));
         stages.put(Stage.INTERNAL_RESPONSE, multiThreadedStage(Stage.INTERNAL_RESPONSE, FBUtilities.getAvailableProcessors()));
-        stages.put(Stage.REPLICATE_ON_WRITE, multiThreadedConfigurableStage(Stage.REPLICATE_ON_WRITE, getConcurrentReplicators(), MAX_REPLICATE_ON_WRITE_TASKS));
         // the rest are all single-threaded
         stages.put(Stage.GOSSIP, new JMXEnabledThreadPoolExecutor(Stage.GOSSIP));
         stages.put(Stage.ANTI_ENTROPY, new JMXEnabledThreadPoolExecutor(Stage.ANTI_ENTROPY));
@@ -89,24 +87,9 @@
                                                 stage.getJmxType());
     }
 
-    private static JMXConfigurableThreadPoolExecutor multiThreadedConfigurableStage(Stage stage, int numThreads)
+    private static TracingAwareExecutorService multiThreadedLowSignalStage(Stage stage, int numThreads)
     {
-        return new JMXConfigurableThreadPoolExecutor(numThreads,
-                                                     KEEPALIVE,
-                                                     TimeUnit.SECONDS,
-                                                     new LinkedBlockingQueue<Runnable>(),
-                                                     new NamedThreadFactory(stage.getJmxName()),
-                                                     stage.getJmxType());
-    }
-
-    private static JMXConfigurableThreadPoolExecutor multiThreadedConfigurableStage(Stage stage, int numThreads, int maxTasksBeforeBlock)
-    {
-        return new JMXConfigurableThreadPoolExecutor(numThreads,
-                                                     KEEPALIVE,
-                                                     TimeUnit.SECONDS,
-                                                     new LinkedBlockingQueue<Runnable>(maxTasksBeforeBlock),
-                                                     new NamedThreadFactory(stage.getJmxName()),
-                                                     stage.getJmxType());
+        return JMXEnabledSharedExecutorPool.SHARED.newExecutor(numThreads, Integer.MAX_VALUE, stage.getJmxName(), stage.getJmxType());
     }
 
     /**
@@ -146,6 +129,11 @@
             super.execute(command);
         }
 
+        public void maybeExecuteImmediately(Runnable command)
+        {
+            execute(command);
+        }
+
         @Override
         public Future<?> submit(Runnable task)
         {
diff --git a/src/java/org/apache/cassandra/concurrent/TracingAwareExecutorService.java b/src/java/org/apache/cassandra/concurrent/TracingAwareExecutorService.java
index e5dcd7e..f580fea 100644
--- a/src/java/org/apache/cassandra/concurrent/TracingAwareExecutorService.java
+++ b/src/java/org/apache/cassandra/concurrent/TracingAwareExecutorService.java
@@ -30,4 +30,7 @@
     // we need a way to inject a TraceState directly into the Executor context without going through
     // the global Tracing sessions; see CASSANDRA-5668
     public void execute(Runnable command, TraceState state);
+
+    // permits executing in the context of the submitting thread
+    public void maybeExecuteImmediately(Runnable command);
 }
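
maybeExecuteImmediately() permits an implementation to run a task on the submitting thread when that is cheaper than queueing it; the stage wrapper in StageManager above simply falls back to execute(). As a hedged sketch of one shape such an implementation could take (hypothetical class, not the SEP-based implementation introduced by this patch), an executor might run inline only while it can claim spare capacity:

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

// Hypothetical illustration of the maybeExecuteImmediately() contract:
// run on the calling thread if a slot is free, otherwise hand off to the queue.
public class InlineOrQueueDemo
{
    private final ExecutorService delegate = Executors.newFixedThreadPool(4);
    private final Semaphore slots = new Semaphore(4);

    public void maybeExecuteImmediately(Runnable command)
    {
        if (slots.tryAcquire())
        {
            try
            {
                command.run();          // execute in the context of the submitting thread
            }
            finally
            {
                slots.release();
            }
        }
        else
        {
            delegate.execute(command);  // no spare capacity: queue as usual
        }
    }
}
```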
diff --git a/src/java/org/apache/cassandra/config/CFMetaData.java b/src/java/org/apache/cassandra/config/CFMetaData.java
index e726957..b5784ed 100644
--- a/src/java/org/apache/cassandra/config/CFMetaData.java
+++ b/src/java/org/apache/cassandra/config/CFMetaData.java
@@ -20,35 +20,74 @@
 import java.io.DataInput;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
+import com.google.common.base.Strings;
+import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.MapDifference;
 import com.google.common.collect.Maps;
 import org.apache.commons.lang3.ArrayUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.lang3.builder.EqualsBuilder;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cql3.CFDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
+import org.apache.cassandra.cache.CachingOptions;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.statements.CFStatement;
 import org.apache.cassandra.cql3.statements.CreateTableStatement;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.AbstractCell;
+import org.apache.cassandra.db.AtomDeserializer;
+import org.apache.cassandra.db.CFRowAdder;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.ColumnFamilyType;
+import org.apache.cassandra.db.ColumnSerializer;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
 import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.CompoundCType;
+import org.apache.cassandra.db.composites.SimpleCType;
 import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.CounterColumnType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
@@ -57,52 +96,55 @@
 import org.apache.cassandra.io.compress.LZ4Compressor;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.thrift.CfDef;
+import org.apache.cassandra.thrift.CqlResult;
 import org.apache.cassandra.thrift.CqlRow;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
 
-import static org.apache.cassandra.utils.FBUtilities.*;
+import static org.apache.cassandra.utils.FBUtilities.fromJsonList;
+import static org.apache.cassandra.utils.FBUtilities.fromJsonMap;
+import static org.apache.cassandra.utils.FBUtilities.json;
 
+/**
+ * This class can be tricky to modify. Please read http://wiki.apache.org/cassandra/ConfigurationNotes for how to do so safely.
+ */
 public final class CFMetaData
 {
-    //
-    // !! Important !!
-    // This class can be tricky to modify.  Please read http://wiki.apache.org/cassandra/ConfigurationNotes
-    // for how to do so safely.
-    //
-
     private static final Logger logger = LoggerFactory.getLogger(CFMetaData.class);
 
     public final static double DEFAULT_READ_REPAIR_CHANCE = 0.0;
     public final static double DEFAULT_DCLOCAL_READ_REPAIR_CHANCE = 0.1;
-    public final static boolean DEFAULT_REPLICATE_ON_WRITE = true;
     public final static int DEFAULT_GC_GRACE_SECONDS = 864000;
     public final static int DEFAULT_MIN_COMPACTION_THRESHOLD = 4;
     public final static int DEFAULT_MAX_COMPACTION_THRESHOLD = 32;
     public final static Class<? extends AbstractCompactionStrategy> DEFAULT_COMPACTION_STRATEGY_CLASS = SizeTieredCompactionStrategy.class;
-    public final static Caching DEFAULT_CACHING_STRATEGY = Caching.KEYS_ONLY;
+    public final static CachingOptions DEFAULT_CACHING_STRATEGY = CachingOptions.KEYS_ONLY;
     public final static int DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
     public final static SpeculativeRetry DEFAULT_SPECULATIVE_RETRY = new SpeculativeRetry(SpeculativeRetry.RetryType.PERCENTILE, 0.99);
-    public final static int DEFAULT_INDEX_INTERVAL = 128;
-    public final static boolean DEFAULT_POPULATE_IO_CACHE_ON_FLUSH = false;
+    public final static int DEFAULT_MIN_INDEX_INTERVAL = 128;
+    public final static int DEFAULT_MAX_INDEX_INTERVAL = 2048;
 
     // Note that this is the default only for user created tables
     public final static String DEFAULT_COMPRESSOR = LZ4Compressor.class.getCanonicalName();
 
+    // Note that this needs to come *before* any CFMetaData is defined, so before the compile below.
+    private static final Comparator<ColumnDefinition> regularColumnComparator = new Comparator<ColumnDefinition>()
+    {
+        public int compare(ColumnDefinition def1, ColumnDefinition def2)
+        {
+            return ByteBufferUtil.compareUnsigned(def1.name.bytes, def2.name.bytes);
+        }
+    };
+
     public static final CFMetaData IndexCf = compile("CREATE TABLE \"" + SystemKeyspace.INDEX_CF + "\" ("
                                                      + "table_name text,"
                                                      + "index_name text,"
                                                      + "PRIMARY KEY (table_name, index_name)"
                                                      + ") WITH COMPACT STORAGE AND COMMENT='indexes that have been completed'");
 
-    public static final CFMetaData CounterIdCf = compile("CREATE TABLE \"" + SystemKeyspace.COUNTER_ID_CF + "\" ("
-                                                         + "key text,"
-                                                         + "id timeuuid,"
-                                                         + "PRIMARY KEY (key, id)"
-                                                         + ") WITH COMPACT STORAGE AND COMMENT='counter node IDs'");
-
     public static final CFMetaData SchemaKeyspacesCf = compile("CREATE TABLE " + SystemKeyspace.SCHEMA_KEYSPACES_CF + " ("
                                                                + "keyspace_name text PRIMARY KEY,"
                                                                + "durable_writes boolean,"
@@ -113,6 +155,7 @@
     public static final CFMetaData SchemaColumnFamiliesCf = compile("CREATE TABLE " + SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF + " ("
                                                                     + "keyspace_name text,"
                                                                     + "columnfamily_name text,"
+                                                                    + "cf_id uuid," // post-2.1 UUID cfid
                                                                     + "type text,"
                                                                     + "is_dense boolean,"
                                                                     + "comparator text,"
@@ -120,7 +163,6 @@
                                                                     + "comment text,"
                                                                     + "read_repair_chance double,"
                                                                     + "local_read_repair_chance double,"
-                                                                    + "replicate_on_write boolean,"
                                                                     + "gc_grace_seconds int,"
                                                                     + "default_validator text,"
                                                                     + "key_validator text,"
@@ -137,8 +179,9 @@
                                                                     + "column_aliases text,"
                                                                     + "compaction_strategy_options text,"
                                                                     + "speculative_retry text,"
-                                                                    + "populate_io_cache_on_flush boolean,"
                                                                     + "index_interval int,"
+                                                                    + "min_index_interval int,"
+                                                                    + "max_index_interval int,"
                                                                     + "dropped_columns map<text, bigint>,"
                                                                     + "PRIMARY KEY (keyspace_name, columnfamily_name)"
                                                                     + ") WITH COMMENT='ColumnFamily definitions' AND gc_grace_seconds=604800");
@@ -164,6 +207,14 @@
                                                               + "PRIMARY KEY (keyspace_name, columnfamily_name, trigger_name)"
                                                               + ") WITH COMMENT='triggers metadata table' AND gc_grace_seconds=604800");
 
+    public static final CFMetaData SchemaUserTypesCf = compile("CREATE TABLE " + SystemKeyspace.SCHEMA_USER_TYPES_CF + " ("
+                                                               + "keyspace_name text,"
+                                                               + "type_name text,"
+                                                               + "field_names list<text>,"
+                                                               + "field_types list<text>,"
+                                                               + "PRIMARY KEY (keyspace_name, type_name)"
+                                                               + ") WITH COMMENT='Defined user types' AND gc_grace_seconds=604800");
+
     public static final CFMetaData HintsCf = compile("CREATE TABLE " + SystemKeyspace.HINTS_CF + " ("
                                                      + "target_id uuid,"
                                                      + "hint_id timeuuid,"
@@ -283,28 +334,12 @@
                                                                  + "PRIMARY KEY (id)"
                                                                  + ") WITH COMMENT='show all compaction history' AND DEFAULT_TIME_TO_LIVE=604800");
 
-    public enum Caching
-    {
-        ALL, KEYS_ONLY, ROWS_ONLY, NONE;
-
-        public static Caching fromString(String cache) throws ConfigurationException
-        {
-            try
-            {
-                return valueOf(cache.toUpperCase());
-            }
-            catch (IllegalArgumentException e)
-            {
-                throw new ConfigurationException(String.format("%s not found, available types: %s.", cache, StringUtils.join(values(), ", ")));
-            }
-        }
-    }
 
     public static class SpeculativeRetry
     {
         public enum RetryType
         {
-            NONE, CUSTOM, PERCENTILE, ALWAYS;
+            NONE, CUSTOM, PERCENTILE, ALWAYS
         }
 
         public final RetryType type;
@@ -348,13 +383,19 @@
         @Override
         public boolean equals(Object obj)
         {
-            if (! (obj instanceof SpeculativeRetry))
+            if (!(obj instanceof SpeculativeRetry))
                 return false;
             SpeculativeRetry rhs = (SpeculativeRetry) obj;
             return Objects.equal(type, rhs.type) && Objects.equal(value, rhs.value);
         }
 
         @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(type, value);
+        }
+
+        @Override
         public String toString()
         {
             switch (type)
@@ -375,30 +416,29 @@
     public final String ksName;                       // name of keyspace
     public final String cfName;                       // name of this column family
     public final ColumnFamilyType cfType;             // standard, super
-    public volatile AbstractType<?> comparator;       // bytes, long, timeuuid, utf8, etc.
+    public volatile CellNameType comparator;          // bytes, long, timeuuid, utf8, etc.
 
     //OPTIONAL
     private volatile String comment = "";
     private volatile double readRepairChance = DEFAULT_READ_REPAIR_CHANCE;
     private volatile double dcLocalReadRepairChance = DEFAULT_DCLOCAL_READ_REPAIR_CHANCE;
-    private volatile boolean replicateOnWrite = DEFAULT_REPLICATE_ON_WRITE;
     private volatile int gcGraceSeconds = DEFAULT_GC_GRACE_SECONDS;
     private volatile AbstractType<?> defaultValidator = BytesType.instance;
     private volatile AbstractType<?> keyValidator = BytesType.instance;
     private volatile int minCompactionThreshold = DEFAULT_MIN_COMPACTION_THRESHOLD;
     private volatile int maxCompactionThreshold = DEFAULT_MAX_COMPACTION_THRESHOLD;
     private volatile Double bloomFilterFpChance = null;
-    private volatile Caching caching = DEFAULT_CACHING_STRATEGY;
-    private volatile int indexInterval = DEFAULT_INDEX_INTERVAL;
-    private int memtableFlushPeriod = 0;
+    private volatile CachingOptions caching = DEFAULT_CACHING_STRATEGY;
+    private volatile int minIndexInterval = DEFAULT_MIN_INDEX_INTERVAL;
+    private volatile int maxIndexInterval = DEFAULT_MAX_INDEX_INTERVAL;
+    private volatile int memtableFlushPeriod = 0;
     private volatile int defaultTimeToLive = DEFAULT_DEFAULT_TIME_TO_LIVE;
     private volatile SpeculativeRetry speculativeRetry = DEFAULT_SPECULATIVE_RETRY;
-    private volatile boolean populateIoCacheOnFlush = DEFAULT_POPULATE_IO_CACHE_ON_FLUSH;
-    private volatile Map<ByteBuffer, Long> droppedColumns = new HashMap<>();
+    private volatile Map<ColumnIdentifier, Long> droppedColumns = new HashMap<>();
     private volatile Map<String, TriggerDefinition> triggers = new HashMap<>();
-
+    private volatile boolean isPurged = false;
     /*
-     * All CQL3 columns definition are stored in the column_metadata map.
+     * All CQL3 columns definition are stored in the columnMetadata map.
      * On top of that, we keep separated collection of each kind of definition, to
      * 1) allow easy access to each kind and 2) for the partition key and
      * clustering key ones, those list are ordered by the "component index" of the
@@ -413,11 +453,11 @@
     // and CQL3 CF are *not* dense.
     private volatile Boolean isDense; // null means "we don't know and need to infer from other data"
 
-    private volatile Map<ByteBuffer, ColumnDefinition> column_metadata = new HashMap<>();
+    private volatile Map<ByteBuffer, ColumnDefinition> columnMetadata = new HashMap<>();
     private volatile List<ColumnDefinition> partitionKeyColumns;  // Always of size keyValidator.componentsCount, null padded if necessary
-    private volatile List<ColumnDefinition> clusteringKeyColumns; // Of size comparator.componentsCount or comparator.componentsCount -1, null padded if necessary
-    private volatile Set<ColumnDefinition> regularColumns;
-    private volatile Set<ColumnDefinition> staticColumns;
+    private volatile List<ColumnDefinition> clusteringColumns;    // Of size comparator.componentsCount or comparator.componentsCount -1, null padded if necessary
+    private volatile SortedSet<ColumnDefinition> regularColumns;  // We use a sorted set so iteration is of predictable order (for SELECT for instance)
+    private volatile SortedSet<ColumnDefinition> staticColumns;   // Same as above
     private volatile ColumnDefinition compactValueColumn;
 
     public volatile Class<? extends AbstractCompactionStrategy> compactionStrategyClass = DEFAULT_COMPACTION_STRATEGY_CLASS;
@@ -425,56 +465,70 @@
 
     public volatile CompressionParameters compressionParameters = new CompressionParameters(null);
 
-    // Processed infos used by CQL. This can be fully reconstructed from the CFMedata,
-    // so it's not saved on disk. It is however costlyish to recreate for each query
-    // so we cache it here (and update on each relevant CFMetadata change)
-    private volatile CFDefinition cqlCfDef;
-
-    public CFMetaData comment(String prop) { comment = enforceCommentNotNull(prop); return this;}
+    // attribute setters that return the modified CFMetaData instance
+    public CFMetaData comment(String prop) { comment = Strings.nullToEmpty(prop); return this;}
     public CFMetaData readRepairChance(double prop) {readRepairChance = prop; return this;}
     public CFMetaData dcLocalReadRepairChance(double prop) {dcLocalReadRepairChance = prop; return this;}
-    public CFMetaData replicateOnWrite(boolean prop) {replicateOnWrite = prop; return this;}
     public CFMetaData gcGraceSeconds(int prop) {gcGraceSeconds = prop; return this;}
     public CFMetaData defaultValidator(AbstractType<?> prop) {defaultValidator = prop; return this;}
     public CFMetaData keyValidator(AbstractType<?> prop) {keyValidator = prop; return this;}
     public CFMetaData minCompactionThreshold(int prop) {minCompactionThreshold = prop; return this;}
     public CFMetaData maxCompactionThreshold(int prop) {maxCompactionThreshold = prop; return this;}
-    public CFMetaData columnMetadata(Map<ByteBuffer,ColumnDefinition> prop) {column_metadata = prop; return this;}
     public CFMetaData compactionStrategyClass(Class<? extends AbstractCompactionStrategy> prop) {compactionStrategyClass = prop; return this;}
     public CFMetaData compactionStrategyOptions(Map<String, String> prop) {compactionStrategyOptions = prop; return this;}
     public CFMetaData compressionParameters(CompressionParameters prop) {compressionParameters = prop; return this;}
     public CFMetaData bloomFilterFpChance(Double prop) {bloomFilterFpChance = prop; return this;}
-    public CFMetaData caching(Caching prop) {caching = prop; return this;}
-    public CFMetaData indexInterval(int prop) {indexInterval = prop; return this;}
+    public CFMetaData caching(CachingOptions prop) {caching = prop; return this;}
+    public CFMetaData minIndexInterval(int prop) {minIndexInterval = prop; return this;}
+    public CFMetaData maxIndexInterval(int prop) {maxIndexInterval = prop; return this;}
     public CFMetaData memtableFlushPeriod(int prop) {memtableFlushPeriod = prop; return this;}
     public CFMetaData defaultTimeToLive(int prop) {defaultTimeToLive = prop; return this;}
     public CFMetaData speculativeRetry(SpeculativeRetry prop) {speculativeRetry = prop; return this;}
-    public CFMetaData populateIoCacheOnFlush(boolean prop) {populateIoCacheOnFlush = prop; return this;}
-    public CFMetaData droppedColumns(Map<ByteBuffer, Long> cols) {droppedColumns = cols; return this;}
+    public CFMetaData droppedColumns(Map<ColumnIdentifier, Long> cols) {droppedColumns = cols; return this;}
     public CFMetaData triggers(Map<String, TriggerDefinition> prop) {triggers = prop; return this;}
     public CFMetaData isDense(Boolean prop) {isDense = prop; return this;}
-
-    public CFMetaData(String keyspace, String name, ColumnFamilyType type, AbstractType<?> comp, AbstractType<?> subcc)
+    /**
+     * Creates new ColumnFamily metadata with a freshly generated cfId.
+     * When loading from an existing schema, use the constructor that accepts an explicit cfId instead.
+     *
+     * @param keyspace keyspace name
+     * @param name column family name
+     * @param type standard or super column family
+     * @param comp default comparator
+     */
+    public CFMetaData(String keyspace, String name, ColumnFamilyType type, CellNameType comp)
     {
-        this(keyspace, name, type, makeComparator(type, comp, subcc));
+        this(keyspace, name, type, comp, UUIDGen.getTimeUUID());
     }
 
-    public CFMetaData(String keyspace, String name, ColumnFamilyType type, AbstractType<?> comp)
+    private CFMetaData(String keyspace, String name, ColumnFamilyType type, CellNameType comp, UUID id)
     {
-        this(keyspace, name, type, comp, getId(keyspace, name));
-    }
-
-    @VisibleForTesting
-    CFMetaData(String keyspace, String name, ColumnFamilyType type, AbstractType<?> comp,  UUID id)
-    {
-        // (subcc may be null for non-supercolumns)
-        // (comp may also be null for custom indexes, which is kind of broken if you ask me)
-
+        cfId = id;
         ksName = keyspace;
         cfName = name;
         cfType = type;
         comparator = comp;
-        cfId = id;
+    }
+
+    public static CFMetaData denseCFMetaData(String keyspace, String name, AbstractType<?> comp, AbstractType<?> subcc)
+    {
+        CellNameType cellNameType = CellNames.fromAbstractType(makeRawAbstractType(comp, subcc), true);
+        return new CFMetaData(keyspace, name, subcc == null ? ColumnFamilyType.Standard : ColumnFamilyType.Super, cellNameType);
+    }
+
+    public static CFMetaData sparseCFMetaData(String keyspace, String name, AbstractType<?> comp)
+    {
+        CellNameType cellNameType = CellNames.fromAbstractType(comp, false);
+        return new CFMetaData(keyspace, name, ColumnFamilyType.Standard, cellNameType);
+    }
+
+    public static CFMetaData denseCFMetaData(String keyspace, String name, AbstractType<?> comp)
+    {
+        return denseCFMetaData(keyspace, name, comp, null);
+    }
+
+    private static AbstractType<?> makeRawAbstractType(AbstractType<?> comparator, AbstractType<?> subComparator)
+    {
+        return subComparator == null ? comparator : CompositeType.getInstance(Arrays.asList(comparator, subComparator));
     }
 
     public Map<String, TriggerDefinition> getTriggers()
@@ -492,8 +546,10 @@
     {
         try
         {
-            CreateTableStatement statement = (CreateTableStatement) QueryProcessor.parseStatement(cql).prepare().statement;
-            CFMetaData cfm = newSystemMetadata(keyspace, statement.columnFamily(), "", statement.comparator, null);
+            CFStatement parsed = (CFStatement)QueryProcessor.parseStatement(cql);
+            parsed.prepareKeyspace(keyspace);
+            CreateTableStatement statement = (CreateTableStatement) parsed.prepare().statement;
+            CFMetaData cfm = newSystemMetadata(keyspace, statement.columnFamily(), "", statement.comparator);
             statement.applyPropertiesTo(cfm);
             return cfm.rebuild();
         }
@@ -503,45 +559,46 @@
         }
     }
 
-    private static AbstractType<?> makeComparator(ColumnFamilyType cftype, AbstractType<?> comp, AbstractType<?> subcc)
-    {
-        return cftype == ColumnFamilyType.Super
-             ? CompositeType.getInstance(comp, subcc == null ? BytesType.instance : subcc)
-             : comp;
-    }
-
-    private static String enforceCommentNotNull (CharSequence comment)
-    {
-        return (comment == null) ? "" : comment.toString();
-    }
-
-    static UUID getId(String ksName, String cfName)
+    /**
+     * Generates a deterministic UUID from a keyspace/columnfamily name pair.
+     * This produces the same UUIDs that C* versions < 2.1 assigned to tables.
+     *
+     * Since 2.1, this is only used for system columnfamilies and tests.
+     */
+    public static UUID generateLegacyCfId(String ksName, String cfName)
     {
         return UUID.nameUUIDFromBytes(ArrayUtils.addAll(ksName.getBytes(), cfName.getBytes()));
     }
 
-    private static CFMetaData newSystemMetadata(String keyspace, String cfName, String comment, AbstractType<?> comparator, AbstractType<?> subcc)
+    private static CFMetaData newSystemMetadata(String keyspace, String cfName, String comment, CellNameType comparator)
     {
-        ColumnFamilyType type = subcc == null ? ColumnFamilyType.Standard : ColumnFamilyType.Super;
-        CFMetaData newCFMD = new CFMetaData(keyspace, cfName, type, comparator,  subcc);
-
-        return newCFMD.comment(comment)
-                .readRepairChance(0)
-                .dcLocalReadRepairChance(0)
-                .gcGraceSeconds(0)
-                .memtableFlushPeriod(3600 * 1000);
+        return new CFMetaData(keyspace, cfName, ColumnFamilyType.Standard, comparator, generateLegacyCfId(keyspace, cfName))
+                             .comment(comment)
+                             .readRepairChance(0)
+                             .dcLocalReadRepairChance(0)
+                             .gcGraceSeconds(0)
+                             .memtableFlushPeriod(3600 * 1000);
     }
 
-    public static CFMetaData newIndexMetadata(CFMetaData parent, ColumnDefinition info, AbstractType<?> columnComparator)
+    /**
+     * Creates CFMetaData for secondary index CF.
+     * A secondary index CF has the same cfId as its parent CF.
+     *
+     * @param parent Parent CF where secondary index is created
+     * @param info Column definition containing secondary index definition
+     * @param indexComparator Comparator for secondary index
+     * @return CFMetaData for secondary index
+     */
+    public static CFMetaData newIndexMetadata(CFMetaData parent, ColumnDefinition info, CellNameType indexComparator)
     {
         // Depends on parent's cache setting, turn on its index CF's cache.
         // Row caching is never enabled; see CASSANDRA-5732
-        Caching indexCaching = parent.getCaching() == Caching.ALL || parent.getCaching() == Caching.KEYS_ONLY
-                             ? Caching.KEYS_ONLY
-                             : Caching.NONE;
+        CachingOptions indexCaching = parent.getCaching().keyCache.isEnabled()
+                             ? CachingOptions.KEYS_ONLY
+                             : CachingOptions.NONE;
 
-        return new CFMetaData(parent.ksName, parent.indexColumnFamilyName(info), ColumnFamilyType.Standard, columnComparator, (AbstractType)null)
-                             .keyValidator(info.getValidator())
+        return new CFMetaData(parent.ksName, parent.indexColumnFamilyName(info), ColumnFamilyType.Standard, indexComparator, parent.cfId)
+                             .keyValidator(info.type)
                              .readRepairChance(0.0)
                              .dcLocalReadRepairChance(0.0)
                              .gcGraceSeconds(0)
@@ -563,46 +620,47 @@
         return this;
     }
 
-    public CFMetaData clone()
+    public CFMetaData copy()
     {
         return copyOpts(new CFMetaData(ksName, cfName, cfType, comparator, cfId), this);
     }
 
-    // Create a new CFMD by changing just the cfName
-    public static CFMetaData rename(CFMetaData cfm, String newName)
+    /**
+     * Clones the CFMetaData, but sets a different cfId
+     *
+     * @param newCfId the cfId for the cloned CFMetaData
+     * @return the cloned CFMetaData instance with the new cfId
+     */
+    public CFMetaData copy(UUID newCfId)
     {
-        return copyOpts(new CFMetaData(cfm.ksName, newName, cfm.cfType, cfm.comparator, cfm.cfId), cfm);
+        return copyOpts(new CFMetaData(ksName, cfName, cfType, comparator, newCfId), this);
     }
 
     static CFMetaData copyOpts(CFMetaData newCFMD, CFMetaData oldCFMD)
     {
-        Map<ByteBuffer, ColumnDefinition> clonedColumns = new HashMap<>();
-        for (ColumnDefinition cd : oldCFMD.column_metadata.values())
-        {
-            ColumnDefinition cloned = cd.clone();
-            clonedColumns.put(cloned.name, cloned);
-        }
+        List<ColumnDefinition> clonedColumns = new ArrayList<>(oldCFMD.allColumns().size());
+        for (ColumnDefinition cd : oldCFMD.allColumns())
+            clonedColumns.add(cd.copy());
 
-        return newCFMD.comment(oldCFMD.comment)
+        return newCFMD.addAllColumnDefinitions(clonedColumns)
+                      .comment(oldCFMD.comment)
                       .readRepairChance(oldCFMD.readRepairChance)
                       .dcLocalReadRepairChance(oldCFMD.dcLocalReadRepairChance)
-                      .replicateOnWrite(oldCFMD.replicateOnWrite)
                       .gcGraceSeconds(oldCFMD.gcGraceSeconds)
                       .defaultValidator(oldCFMD.defaultValidator)
                       .keyValidator(oldCFMD.keyValidator)
                       .minCompactionThreshold(oldCFMD.minCompactionThreshold)
                       .maxCompactionThreshold(oldCFMD.maxCompactionThreshold)
-                      .columnMetadata(clonedColumns)
                       .compactionStrategyClass(oldCFMD.compactionStrategyClass)
                       .compactionStrategyOptions(new HashMap<>(oldCFMD.compactionStrategyOptions))
                       .compressionParameters(oldCFMD.compressionParameters.copy())
                       .bloomFilterFpChance(oldCFMD.bloomFilterFpChance)
                       .caching(oldCFMD.caching)
                       .defaultTimeToLive(oldCFMD.defaultTimeToLive)
-                      .indexInterval(oldCFMD.indexInterval)
+                      .minIndexInterval(oldCFMD.minIndexInterval)
+                      .maxIndexInterval(oldCFMD.maxIndexInterval)
                       .speculativeRetry(oldCFMD.speculativeRetry)
                       .memtableFlushPeriod(oldCFMD.memtableFlushPeriod)
-                      .populateIoCacheOnFlush(oldCFMD.populateIoCacheOnFlush)
                       .droppedColumns(new HashMap<>(oldCFMD.droppedColumns))
                       .triggers(new HashMap<>(oldCFMD.triggers))
                       .isDense(oldCFMD.isDense)
@@ -620,7 +678,7 @@
     public String indexColumnFamilyName(ColumnDefinition info)
     {
         // TODO simplify this when info.index_name is guaranteed to be set
-        return cfName + Directories.SECONDARY_INDEX_NAME_SEPARATOR + (info.getIndexName() == null ? ByteBufferUtil.bytesToHex(info.name) : info.getIndexName());
+        return cfName + Directories.SECONDARY_INDEX_NAME_SEPARATOR + (info.getIndexName() == null ? ByteBufferUtil.bytesToHex(info.name.bytes) : info.getIndexName());
     }
 
     public String getComment()
@@ -633,6 +691,23 @@
         return cfType == ColumnFamilyType.Super;
     }
 
+    /**
+     * The '.' char is the only way to identify if the CFMetaData is for a secondary index
+     */
+    public boolean isSecondaryIndex()
+    {
+        return cfName.contains(".");
+    }
+
+    /**
+     *
+     * @return The name of the parent cf if this is a secondary index
+     */
+    public String getParentColumnFamilyName()
+    {
+        return isSecondaryIndex() ? cfName.substring(0, cfName.indexOf('.')) : null;
+    }
+
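How the two new helpers above relate to the index CF naming done by indexColumnFamilyName, as a sketch with hypothetical names (the separator is Directories.SECONDARY_INDEX_NAME_SEPARATOR, i.e. '.'):

    // parent.cfName == "users", index named "users_email_idx"
    //   index CF name                          -> "users.users_email_idx"
    //   indexCfm.isSecondaryIndex()            -> true  (the name contains '.')
    //   indexCfm.getParentColumnFamilyName()   -> "users"
    //   parent.getParentColumnFamilyName()     -> null  (not a secondary index)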
     public double getReadRepairChance()
     {
         return readRepairChance;
@@ -645,7 +720,7 @@
 
     public ReadRepairDecision newReadRepairDecision()
     {
-        double chance = FBUtilities.threadLocalRandom().nextDouble();
+        double chance = ThreadLocalRandom.current().nextDouble();
         if (getReadRepairChance() > chance)
             return ReadRepairDecision.GLOBAL;
 
@@ -655,16 +730,6 @@
         return ReadRepairDecision.NONE;
     }
 
-    public boolean getReplicateOnWrite()
-    {
-        return replicateOnWrite;
-    }
-
-    public boolean populateIoCacheOnFlush()
-    {
-        return populateIoCacheOnFlush;
-    }
-
     public int getGcGraceSeconds()
     {
         return gcGraceSeconds;
@@ -696,16 +761,9 @@
         if (partitionKeyColumns.size() > 1)
             throw new IllegalStateException("Cannot acces column family with composite key from CQL < 3.0.0");
 
-        try
-        {
-            // For compatibility sake, we uppercase if it's the default alias as we used to return it that way in resultsets.
-            String str = ByteBufferUtil.string(partitionKeyColumns.get(0).name);
-            return str.equalsIgnoreCase(DEFAULT_KEY_ALIAS) ? str.toUpperCase() : str;
-        }
-        catch (CharacterCodingException e)
-        {
-            throw new RuntimeException(e.getMessage(), e);
-        }
+        // For compatibility's sake, we uppercase if it's the default alias, as we used to return it that way in resultsets.
+        String str = partitionKeyColumns.get(0).name.toString();
+        return str.equalsIgnoreCase(DEFAULT_KEY_ALIAS) ? str.toUpperCase() : str;
     }
 
     public CompressionParameters compressionParameters()
@@ -715,7 +773,46 @@
 
     public Collection<ColumnDefinition> allColumns()
     {
-        return column_metadata.values();
+        return columnMetadata.values();
+    }
+
+    // An iterator over all column definitions that respects the order of a SELECT *.
+    public Iterator<ColumnDefinition> allColumnsInSelectOrder()
+    {
+        return new AbstractIterator<ColumnDefinition>()
+        {
+            private final Iterator<ColumnDefinition> partitionKeyIter = partitionKeyColumns.iterator();
+            private final Iterator<ColumnDefinition> clusteringIter = clusteringColumns.iterator();
+            private boolean valueDone;
+            private final Iterator<ColumnDefinition> staticIter = staticColumns.iterator();
+            private final Iterator<ColumnDefinition> regularIter = regularColumns.iterator();
+
+            protected ColumnDefinition computeNext()
+            {
+                if (partitionKeyIter.hasNext())
+                    return partitionKeyIter.next();
+
+                if (clusteringIter.hasNext())
+                    return clusteringIter.next();
+
+                if (staticIter.hasNext())
+                    return staticIter.next();
+
+                if (compactValueColumn != null && !valueDone)
+                {
+                    valueDone = true;
+                    // If the compactValueColumn is empty, this means we have a dense table but
+                    // with only a PK. As far as selects are concerned, we should ignore the value.
+                    if (compactValueColumn.name.bytes.hasRemaining())
+                        return compactValueColumn;
+                }
+
+                if (regularIter.hasNext())
+                    return regularIter.next();
+
+                return endOfData();
+            }
+        };
     }
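The iterator above walks the definitions in the order a SELECT * returns them; a worked example with a hypothetical schema:

    // CREATE TABLE t (pk int, ck int, s int static, v int, PRIMARY KEY (pk, ck))
    // allColumnsInSelectOrder() yields: pk, ck, s, v
    //   partition key columns, then clustering columns, then static columns,
    //   then the compact value column (dense tables only, and only when it is named),
    //   then regular columns.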
 
     public List<ColumnDefinition> partitionKeyColumns()
@@ -723,9 +820,9 @@
         return partitionKeyColumns;
     }
 
-    public List<ColumnDefinition> clusteringKeyColumns()
+    public List<ColumnDefinition> clusteringColumns()
     {
-        return clusteringKeyColumns;
+        return clusteringColumns;
     }
 
     public Set<ColumnDefinition> regularColumns()
@@ -748,6 +845,15 @@
         return compactValueColumn;
     }
 
+    // TODO: we could use CType for key validation too to make this unnecessary but
+    // it's unclear it would be a win overall
+    public CType getKeyValidatorAsCType()
+    {
+        return keyValidator instanceof CompositeType
+             ? new CompoundCType(((CompositeType) keyValidator).types)
+             : new SimpleCType(keyValidator);
+    }
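A sketch of the conversion above (types are illustrative):

    // composite partition key, keyValidator == CompositeType(Int32Type, UTF8Type)
    //   getKeyValidatorAsCType() -> CompoundCType over [Int32Type, UTF8Type]
    // single-component partition key, keyValidator == UTF8Type
    //   getKeyValidatorAsCType() -> SimpleCType(UTF8Type)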
+
     public double getBloomFilterFpChance()
     {
         // we disallow bFFPC==null starting in 1.2.1 but tolerated it before that
@@ -756,14 +862,19 @@
                : bloomFilterFpChance;
     }
 
-    public Caching getCaching()
+    public CachingOptions getCaching()
     {
         return caching;
     }
 
-    public int getIndexInterval()
+    public int getMinIndexInterval()
     {
-        return indexInterval;
+        return minIndexInterval;
+    }
+
+    public int getMaxIndexInterval()
+    {
+        return maxIndexInterval;
     }
 
     public SpeculativeRetry getSpeculativeRetry()
@@ -781,7 +892,7 @@
         return defaultTimeToLive;
     }
 
-    public Map<ByteBuffer, Long> getDroppedColumns()
+    public Map<ColumnIdentifier, Long> getDroppedColumns()
     {
         return droppedColumns;
     }
@@ -791,53 +902,51 @@
         return isDense;
     }
 
-    public boolean equals(Object obj)
+    @Override
+    public boolean equals(Object o)
     {
-        if (obj == this)
-        {
+        if (this == o)
             return true;
-        }
-        else if (obj == null || obj.getClass() != getClass())
-        {
-            return false;
-        }
 
-        CFMetaData rhs = (CFMetaData) obj;
-        return new EqualsBuilder()
-            .append(ksName, rhs.ksName)
-            .append(cfName, rhs.cfName)
-            .append(cfType, rhs.cfType)
-            .append(comparator, rhs.comparator)
-            .append(comment, rhs.comment)
-            .append(readRepairChance, rhs.readRepairChance)
-            .append(dcLocalReadRepairChance, rhs.dcLocalReadRepairChance)
-            .append(replicateOnWrite, rhs.replicateOnWrite)
-            .append(gcGraceSeconds, rhs.gcGraceSeconds)
-            .append(defaultValidator, rhs.defaultValidator)
-            .append(keyValidator, rhs.keyValidator)
-            .append(minCompactionThreshold, rhs.minCompactionThreshold)
-            .append(maxCompactionThreshold, rhs.maxCompactionThreshold)
-            .append(cfId, rhs.cfId)
-            .append(column_metadata, rhs.column_metadata)
-            .append(compactionStrategyClass, rhs.compactionStrategyClass)
-            .append(compactionStrategyOptions, rhs.compactionStrategyOptions)
-            .append(compressionParameters, rhs.compressionParameters)
-            .append(bloomFilterFpChance, rhs.bloomFilterFpChance)
-            .append(memtableFlushPeriod, rhs.memtableFlushPeriod)
-            .append(caching, rhs.caching)
-            .append(defaultTimeToLive, rhs.defaultTimeToLive)
-            .append(indexInterval, rhs.indexInterval)
-            .append(speculativeRetry, rhs.speculativeRetry)
-            .append(populateIoCacheOnFlush, rhs.populateIoCacheOnFlush)
-            .append(droppedColumns, rhs.droppedColumns)
-            .append(triggers, rhs.triggers)
-            .append(isDense, rhs.isDense)
-            .isEquals();
+        if (!(o instanceof CFMetaData))
+            return false;
+
+        CFMetaData other = (CFMetaData) o;
+
+        return Objects.equal(cfId, other.cfId)
+            && Objects.equal(ksName, other.ksName)
+            && Objects.equal(cfName, other.cfName)
+            && Objects.equal(cfType, other.cfType)
+            && Objects.equal(comparator, other.comparator)
+            && Objects.equal(comment, other.comment)
+            && Objects.equal(readRepairChance, other.readRepairChance)
+            && Objects.equal(dcLocalReadRepairChance, other.dcLocalReadRepairChance)
+            && Objects.equal(gcGraceSeconds, other.gcGraceSeconds)
+            && Objects.equal(defaultValidator, other.defaultValidator)
+            && Objects.equal(keyValidator, other.keyValidator)
+            && Objects.equal(minCompactionThreshold, other.minCompactionThreshold)
+            && Objects.equal(maxCompactionThreshold, other.maxCompactionThreshold)
+            && Objects.equal(columnMetadata, other.columnMetadata)
+            && Objects.equal(compactionStrategyClass, other.compactionStrategyClass)
+            && Objects.equal(compactionStrategyOptions, other.compactionStrategyOptions)
+            && Objects.equal(compressionParameters, other.compressionParameters)
+            && Objects.equal(bloomFilterFpChance, other.bloomFilterFpChance)
+            && Objects.equal(memtableFlushPeriod, other.memtableFlushPeriod)
+            && Objects.equal(caching, other.caching)
+            && Objects.equal(defaultTimeToLive, other.defaultTimeToLive)
+            && Objects.equal(minIndexInterval, other.minIndexInterval)
+            && Objects.equal(maxIndexInterval, other.maxIndexInterval)
+            && Objects.equal(speculativeRetry, other.speculativeRetry)
+            && Objects.equal(droppedColumns, other.droppedColumns)
+            && Objects.equal(triggers, other.triggers)
+            && Objects.equal(isDense, other.isDense);
     }
 
+    @Override
     public int hashCode()
     {
         return new HashCodeBuilder(29, 1597)
+            .append(cfId)
             .append(ksName)
             .append(cfName)
             .append(cfType)
@@ -845,14 +954,12 @@
             .append(comment)
             .append(readRepairChance)
             .append(dcLocalReadRepairChance)
-            .append(replicateOnWrite)
             .append(gcGraceSeconds)
             .append(defaultValidator)
             .append(keyValidator)
             .append(minCompactionThreshold)
             .append(maxCompactionThreshold)
-            .append(cfId)
-            .append(column_metadata)
+            .append(columnMetadata)
             .append(compactionStrategyClass)
             .append(compactionStrategyOptions)
             .append(compressionParameters)
@@ -860,22 +967,19 @@
             .append(memtableFlushPeriod)
             .append(caching)
             .append(defaultTimeToLive)
-            .append(indexInterval)
+            .append(minIndexInterval)
+            .append(maxIndexInterval)
             .append(speculativeRetry)
-            .append(populateIoCacheOnFlush)
             .append(droppedColumns)
             .append(triggers)
             .append(isDense)
             .toHashCode();
     }
 
-    /**
-     * Like getColumnDefinitionFromColumnName, the argument must be an internal column/cell name.
-     */
-    public AbstractType<?> getValueValidatorFromColumnName(ByteBuffer columnName)
+    public AbstractType<?> getValueValidator(CellName cellName)
     {
-        ColumnDefinition def = getColumnDefinitionFromColumnName(columnName);
-        return def == null ? defaultValidator : def.getValidator();
+        ColumnDefinition def = getColumnDefinition(cellName);
+        return def == null ? defaultValidator : def.type;
     }
 
     /** applies implicit defaults to cf definition. useful in updates */
@@ -883,10 +987,6 @@
     {
         if (!cf_def.isSetComment())
             cf_def.setComment("");
-        if (!cf_def.isSetReplicate_on_write())
-            cf_def.setReplicate_on_write(CFMetaData.DEFAULT_REPLICATE_ON_WRITE);
-        if (!cf_def.isSetPopulate_io_cache_on_flush())
-            cf_def.setPopulate_io_cache_on_flush(CFMetaData.DEFAULT_POPULATE_IO_CACHE_ON_FLUSH);
         if (!cf_def.isSetMin_compaction_threshold())
             cf_def.setMin_compaction_threshold(CFMetaData.DEFAULT_MIN_COMPACTION_THRESHOLD);
         if (!cf_def.isSetMax_compaction_threshold())
@@ -907,61 +1007,34 @@
             cf_def.setDefault_time_to_live(CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE);
         if (!cf_def.isSetDclocal_read_repair_chance())
             cf_def.setDclocal_read_repair_chance(CFMetaData.DEFAULT_DCLOCAL_READ_REPAIR_CHANCE);
+
+        // if index_interval was set, use that for the min_index_interval default
+        if (!cf_def.isSetMin_index_interval())
+        {
+            if (cf_def.isSetIndex_interval())
+                cf_def.setMin_index_interval(cf_def.getIndex_interval());
+            else
+                cf_def.setMin_index_interval(CFMetaData.DEFAULT_MIN_INDEX_INTERVAL);
+        }
+        if (!cf_def.isSetMax_index_interval())
+        {
+            // ensure the max is at least as large as the min
+            cf_def.setMax_index_interval(Math.max(cf_def.min_index_interval, CFMetaData.DEFAULT_MAX_INDEX_INTERVAL));
+        }
     }
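Net effect of the new defaulting above, sketched with hypothetical CfDef values:

    // old client that only set the legacy index_interval = 256:
    //   min_index_interval -> 256 (copied from index_interval)
    //   max_index_interval -> Math.max(256, CFMetaData.DEFAULT_MAX_INDEX_INTERVAL)
    // neither field set:
    //   min_index_interval -> CFMetaData.DEFAULT_MIN_INDEX_INTERVAL
    //   max_index_interval -> Math.max(min_index_interval, CFMetaData.DEFAULT_MAX_INDEX_INTERVAL)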
 
-    public static CFMetaData fromThrift(org.apache.cassandra.thrift.CfDef cf_def) throws InvalidRequestException, ConfigurationException
+    public static CFMetaData fromThrift(CfDef cf_def) throws InvalidRequestException, ConfigurationException
     {
-        CFMetaData cfm = internalFromThrift(cf_def);
-
-        if (cf_def.isSetKey_alias() && !(cfm.keyValidator instanceof CompositeType))
-            cfm.column_metadata.put(cf_def.key_alias, ColumnDefinition.partitionKeyDef(cf_def.key_alias, cfm.keyValidator, null));
-
-        try
-        {
-            return cfm.rebuild();
-        }
-        catch (MarshalException e)
-        {
-            throw new ConfigurationException(e.getMessage());
-        }
+        return internalFromThrift(cf_def, Collections.<ColumnDefinition>emptyList());
     }
 
-    public static CFMetaData fromThriftForUpdate(org.apache.cassandra.thrift.CfDef cf_def, CFMetaData toUpdate) throws InvalidRequestException, ConfigurationException
+    public static CFMetaData fromThriftForUpdate(CfDef cf_def, CFMetaData toUpdate) throws InvalidRequestException, ConfigurationException
     {
-        CFMetaData cfm = internalFromThrift(cf_def);
-
-        // Thrift update can't have CQL metadata, and so we'll copy the ones of the updated metadata (to make
-        // sure we don't override anything existing -- see #6831). One exception (for historical reasons) is
-        // the partition key column name however, which can be provided through thrift. If it is, make sure
-        // we use the one of the update.
-        boolean hasKeyAlias = cf_def.isSetKey_alias() && !(cfm.keyValidator instanceof CompositeType);
-        if (hasKeyAlias)
-            cfm.column_metadata.put(cf_def.key_alias, ColumnDefinition.partitionKeyDef(cf_def.key_alias, cfm.keyValidator, null));
-
-        for (ColumnDefinition def : toUpdate.allColumns())
-        {
-            // isPartOfCellName basically means 'is not just a CQL metadata'
-            if (def.isPartOfCellName())
-                continue;
-
-            if (def.type == ColumnDefinition.Type.PARTITION_KEY && hasKeyAlias)
-                continue;
-
-            cfm.addOrReplaceColumnDefinition(def);
-        }
-
-        try
-        {
-            return cfm.rebuild();
-        }
-        catch (MarshalException e)
-        {
-            throw new ConfigurationException(e.getMessage());
-        }
+        return internalFromThrift(cf_def, toUpdate.allColumns());
     }
 
-    // Do most of the work, but don't handle CQL metadata (i.e. skip key_alias and don't rebuild())
-    private static CFMetaData internalFromThrift(org.apache.cassandra.thrift.CfDef cf_def) throws InvalidRequestException, ConfigurationException
+    // Convert a thrift CfDef, given a list of ColumnDefinitions to copy over to the created CFMetaData before the CQL metadata are rebuilt
+    private static CFMetaData internalFromThrift(CfDef cf_def, Collection<ColumnDefinition> previousCQLMetadata) throws InvalidRequestException, ConfigurationException
     {
         ColumnFamilyType cfType = ColumnFamilyType.create(cf_def.column_type);
         if (cfType == null)
@@ -971,49 +1044,84 @@
 
         try
         {
-            CFMetaData newCFMD = new CFMetaData(cf_def.keyspace,
-                                                cf_def.name,
-                                                cfType,
-                                                TypeParser.parse(cf_def.comparator_type),
-                                                cf_def.subcomparator_type == null ? null : TypeParser.parse(cf_def.subcomparator_type));
+            AbstractType<?> rawComparator = TypeParser.parse(cf_def.comparator_type);
+            AbstractType<?> subComparator = cfType == ColumnFamilyType.Standard
+                                          ? null
+                                          : cf_def.subcomparator_type == null ? BytesType.instance : TypeParser.parse(cf_def.subcomparator_type);
 
-            if (cf_def.isSetGc_grace_seconds()) { newCFMD.gcGraceSeconds(cf_def.gc_grace_seconds); }
-            if (cf_def.isSetMin_compaction_threshold()) { newCFMD.minCompactionThreshold(cf_def.min_compaction_threshold); }
-            if (cf_def.isSetMax_compaction_threshold()) { newCFMD.maxCompactionThreshold(cf_def.max_compaction_threshold); }
+            AbstractType<?> fullRawComparator = makeRawAbstractType(rawComparator, subComparator);
+
+            AbstractType<?> keyValidator = cf_def.isSetKey_validation_class() ? TypeParser.parse(cf_def.key_validation_class) : null;
+
+            // Convert the REGULAR definitions from the input CfDef
+            List<ColumnDefinition> defs = ColumnDefinition.fromThrift(cf_def.keyspace, cf_def.name, rawComparator, subComparator, cf_def.column_metadata);
+
+            // Add the keyAlias if there is one, since that's the one piece of CQL metadata that thrift can actually
+            // change (for historical reasons)
+            boolean hasKeyAlias = cf_def.isSetKey_alias() && keyValidator != null && !(keyValidator instanceof CompositeType);
+            if (hasKeyAlias)
+                defs.add(ColumnDefinition.partitionKeyDef(cf_def.keyspace, cf_def.name, cf_def.key_alias, keyValidator, null));
+
+            // Now add any CQL metadata that we want to copy, skipping the keyAlias if there was one
+            for (ColumnDefinition def : previousCQLMetadata)
+            {
+                // isPartOfCellName basically means 'is not just a CQL metadata'
+                if (def.isPartOfCellName())
+                    continue;
+
+                if (def.kind == ColumnDefinition.Kind.PARTITION_KEY && hasKeyAlias)
+                    continue;
+
+                defs.add(def);
+            }
+
+            CellNameType comparator = CellNames.fromAbstractType(fullRawComparator, calculateIsDense(fullRawComparator, defs));
+
+            UUID cfId = Schema.instance.getId(cf_def.keyspace, cf_def.name);
+            if (cfId == null)
+                cfId = UUIDGen.getTimeUUID();
+
+            CFMetaData newCFMD = new CFMetaData(cf_def.keyspace, cf_def.name, cfType, comparator, cfId);
+
+            newCFMD.addAllColumnDefinitions(defs);
+
+            if (keyValidator != null)
+                newCFMD.keyValidator(keyValidator);
+            if (cf_def.isSetGc_grace_seconds())
+                newCFMD.gcGraceSeconds(cf_def.gc_grace_seconds);
+            if (cf_def.isSetMin_compaction_threshold())
+                newCFMD.minCompactionThreshold(cf_def.min_compaction_threshold);
+            if (cf_def.isSetMax_compaction_threshold())
+                newCFMD.maxCompactionThreshold(cf_def.max_compaction_threshold);
             if (cf_def.isSetCompaction_strategy())
-                newCFMD.compactionStrategyClass = createCompactionStrategy(cf_def.compaction_strategy);
+                newCFMD.compactionStrategyClass(createCompactionStrategy(cf_def.compaction_strategy));
             if (cf_def.isSetCompaction_strategy_options())
                 newCFMD.compactionStrategyOptions(new HashMap<>(cf_def.compaction_strategy_options));
             if (cf_def.isSetBloom_filter_fp_chance())
                 newCFMD.bloomFilterFpChance(cf_def.bloom_filter_fp_chance);
             if (cf_def.isSetMemtable_flush_period_in_ms())
                 newCFMD.memtableFlushPeriod(cf_def.memtable_flush_period_in_ms);
-            if (cf_def.isSetCaching())
-                newCFMD.caching(Caching.fromString(cf_def.caching));
+            if (cf_def.isSetCaching() || cf_def.isSetCells_per_row_to_cache())
+                newCFMD.caching(CachingOptions.fromThrift(cf_def.caching, cf_def.cells_per_row_to_cache));
             if (cf_def.isSetRead_repair_chance())
                 newCFMD.readRepairChance(cf_def.read_repair_chance);
             if (cf_def.isSetDefault_time_to_live())
                 newCFMD.defaultTimeToLive(cf_def.default_time_to_live);
             if (cf_def.isSetDclocal_read_repair_chance())
                 newCFMD.dcLocalReadRepairChance(cf_def.dclocal_read_repair_chance);
-            if (cf_def.isSetIndex_interval())
-                newCFMD.indexInterval(cf_def.index_interval);
+            if (cf_def.isSetMin_index_interval())
+                newCFMD.minIndexInterval(cf_def.min_index_interval);
+            if (cf_def.isSetMax_index_interval())
+                newCFMD.maxIndexInterval(cf_def.max_index_interval);
             if (cf_def.isSetSpeculative_retry())
                 newCFMD.speculativeRetry(SpeculativeRetry.fromString(cf_def.speculative_retry));
-            if (cf_def.isSetPopulate_io_cache_on_flush())
-                newCFMD.populateIoCacheOnFlush(cf_def.populate_io_cache_on_flush);
             if (cf_def.isSetTriggers())
                 newCFMD.triggers(TriggerDefinition.fromThrift(cf_def.triggers));
 
-            CompressionParameters cp = CompressionParameters.create(cf_def.compression_options);
-
-            if (cf_def.isSetKey_validation_class()) { newCFMD.keyValidator(TypeParser.parse(cf_def.key_validation_class)); }
-
             return newCFMD.comment(cf_def.comment)
-                          .replicateOnWrite(cf_def.replicate_on_write)
                           .defaultValidator(TypeParser.parse(cf_def.default_validation_class))
-                          .columnMetadata(ColumnDefinition.fromThrift(cf_def.column_metadata, newCFMD.isSuper()))
-                          .compressionParameters(cp);
+                          .compressionParameters(CompressionParameters.create(cf_def.compression_options))
+                          .rebuild();
         }
         catch (SyntaxException | MarshalException e)
         {
@@ -1024,29 +1132,34 @@
     /**
      * Create CFMetaData from thrift {@link CqlRow} that contains columns from schema_columnfamilies.
      *
-     * @param row CqlRow containing columns from schema_columnfamilies.
+     * @param cf CqlRow containing the columnfamily definition from schema_columnfamilies.
+     * @param columnsRes CqlResult containing the serialized column definitions for that columnfamily.
      * @return CFMetaData derived from CqlRow
      */
-    public static CFMetaData fromThriftCqlRow(CqlRow row)
+    public static CFMetaData fromThriftCqlRow(CqlRow cf, CqlResult columnsRes)
     {
-        Map<String, ByteBuffer> columns = new HashMap<>();
-        try
-        {
-            for (org.apache.cassandra.thrift.Column column : row.getColumns())
-                columns.put(ByteBufferUtil.string(column.bufferForName()), column.value);
-        }
-        catch (CharacterCodingException ignore)
-        {
-        }
-        UntypedResultSet.Row cql3row = new UntypedResultSet.Row(columns);
-        return fromSchemaNoColumnsNoTriggers(cql3row);
+        UntypedResultSet.Row cfRow = new UntypedResultSet.Row(convertThriftCqlRow(cf));
+
+        List<Map<String, ByteBuffer>> cols = new ArrayList<>(columnsRes.rows.size());
+        for (CqlRow row : columnsRes.rows)
+            cols.add(convertThriftCqlRow(row));
+        UntypedResultSet colsRow = UntypedResultSet.create(cols);
+
+        return fromSchemaNoTriggers(cfRow, colsRow);
+    }
+
+    private static Map<String, ByteBuffer> convertThriftCqlRow(CqlRow row)
+    {
+        Map<String, ByteBuffer> m = new HashMap<>();
+        for (org.apache.cassandra.thrift.Column column : row.getColumns())
+            m.put(UTF8Type.instance.getString(column.bufferForName()), column.value);
+        return m;
     }
 
     public void reload()
     {
         Row cfDefRow = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, ksName, cfName);
 
-        if (cfDefRow.cf == null || cfDefRow.cf.getColumnCount() == 0)
+        if (cfDefRow.cf == null || !cfDefRow.cf.hasColumns())
             throw new RuntimeException(String.format("%s not found in the schema definitions keyspace.", ksName + ":" + cfName));
 
         try
@@ -1062,11 +1175,11 @@
     /**
      * Updates CFMetaData in-place to match cf_def
      *
-     * *Note*: This method left public only for DefsTest, don't use directly!
+     * *Note*: This method is left package-private only for DefsTest; don't use it directly!
      *
      * @throws ConfigurationException if ks/cf names or cf ids didn't match
      */
-    public void apply(CFMetaData cfm) throws ConfigurationException
+    void apply(CFMetaData cfm) throws ConfigurationException
     {
         logger.debug("applying {} to {}", cfm, this);
 
@@ -1080,10 +1193,9 @@
         // compaction thresholds are checked by ThriftValidation. We shouldn't be doing
         // validation on the apply path; it's too late for that.
 
-        comment = enforceCommentNotNull(cfm.comment);
+        comment = Strings.nullToEmpty(cfm.comment);
         readRepairChance = cfm.readRepairChance;
         dcLocalReadRepairChance = cfm.dcLocalReadRepairChance;
-        replicateOnWrite = cfm.replicateOnWrite;
         gcGraceSeconds = cfm.gcGraceSeconds;
         defaultValidator = cfm.defaultValidator;
         keyValidator = cfm.keyValidator;
@@ -1091,28 +1203,29 @@
         maxCompactionThreshold = cfm.maxCompactionThreshold;
 
         bloomFilterFpChance = cfm.bloomFilterFpChance;
-        memtableFlushPeriod = cfm.memtableFlushPeriod;
         caching = cfm.caching;
+        minIndexInterval = cfm.minIndexInterval;
+        maxIndexInterval = cfm.maxIndexInterval;
+        memtableFlushPeriod = cfm.memtableFlushPeriod;
         defaultTimeToLive = cfm.defaultTimeToLive;
         speculativeRetry = cfm.speculativeRetry;
-        populateIoCacheOnFlush = cfm.populateIoCacheOnFlush;
 
         if (!cfm.droppedColumns.isEmpty())
             droppedColumns = cfm.droppedColumns;
 
-        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(column_metadata, cfm.column_metadata);
+        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(columnMetadata, cfm.columnMetadata);
         // columns that are no longer needed
         for (ColumnDefinition cd : columnDiff.entriesOnlyOnLeft().values())
-            column_metadata.remove(cd.name);
+            removeColumnDefinition(cd);
         // newly added columns
         for (ColumnDefinition cd : columnDiff.entriesOnlyOnRight().values())
-            column_metadata.put(cd.name, cd);
+            addColumnDefinition(cd);
         // old columns with updated attributes
         for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
         {
-            ColumnDefinition oldDef = column_metadata.get(name);
-            ColumnDefinition def = cfm.column_metadata.get(name);
-            oldDef.apply(def, getColumnDefinitionComparator(oldDef));
+            ColumnDefinition oldDef = columnMetadata.get(name);
+            ColumnDefinition def = cfm.columnMetadata.get(name);
+            addOrReplaceColumnDefinition(oldDef.apply(def));
         }
 
         compactionStrategyClass = cfm.compactionStrategyClass;
@@ -1155,8 +1268,7 @@
             if (options == null)
                 return;
 
-            Method validateMethod = strategyClass.getMethod("validateOptions", Map.class);
-            Map<String, String> unknownOptions = (Map<String, String>) validateMethod.invoke(null, options);
+            Map<?,?> unknownOptions = (Map) strategyClass.getMethod("validateOptions", Map.class).invoke(null, options);
             if (!unknownOptions.isEmpty())
                 throw new ConfigurationException(String.format("Properties specified %s are not understood by %s", unknownOptions.keySet(), strategyClass.getSimpleName()));
         }
@@ -1214,20 +1326,17 @@
 
         if (isSuper())
         {
-            CompositeType ct = (CompositeType)comparator;
-            def.setComparator_type(ct.types.get(0).toString());
-            def.setSubcomparator_type(ct.types.get(1).toString());
+            def.setComparator_type(comparator.subtype(0).toString());
+            def.setSubcomparator_type(comparator.subtype(1).toString());
         }
         else
         {
             def.setComparator_type(comparator.toString());
         }
 
-        def.setComment(enforceCommentNotNull(comment));
+        def.setComment(Strings.nullToEmpty(comment));
         def.setRead_repair_chance(readRepairChance);
         def.setDclocal_read_repair_chance(dcLocalReadRepairChance);
-        def.setReplicate_on_write(replicateOnWrite);
-        def.setPopulate_io_cache_on_flush(populateIoCacheOnFlush);
         def.setGc_grace_seconds(gcGraceSeconds);
         def.setDefault_validation_class(defaultValidator == null ? null : defaultValidator.toString());
         def.setKey_validation_class(keyValidator.toString());
@@ -1235,16 +1344,18 @@
         def.setMax_compaction_threshold(maxCompactionThreshold);
         // We only return the alias if only one is set since thrift don't know about multiple key aliases
         if (partitionKeyColumns.size() == 1)
-            def.setKey_alias(partitionKeyColumns.get(0).name);
-        def.setColumn_metadata(ColumnDefinition.toThrift(column_metadata));
+            def.setKey_alias(partitionKeyColumns.get(0).name.bytes);
+        def.setColumn_metadata(ColumnDefinition.toThrift(columnMetadata));
         def.setCompaction_strategy(compactionStrategyClass.getName());
         def.setCompaction_strategy_options(new HashMap<>(compactionStrategyOptions));
         def.setCompression_options(compressionParameters.asThriftOptions());
         if (bloomFilterFpChance != null)
             def.setBloom_filter_fp_chance(bloomFilterFpChance);
-        def.setIndex_interval(indexInterval);
+        def.setMin_index_interval(minIndexInterval);
+        def.setMax_index_interval(maxIndexInterval);
         def.setMemtable_flush_period_in_ms(memtableFlushPeriod);
-        def.setCaching(caching.toString());
+        def.setCaching(caching.toThriftCaching());
+        def.setCells_per_row_to_cache(caching.toThriftCellsPerRow());
         def.setDefault_time_to_live(defaultTimeToLive);
         def.setSpeculative_retry(speculativeRetry.toString());
         def.setTriggers(TriggerDefinition.toThrift(triggers));
@@ -1254,63 +1365,45 @@
 
     /**
      * Returns the ColumnDefinition for {@code name}.
-     *
-     * Note that {@code name} correspond to the returned ColumnDefinition name,
-     * and in particular for composite cfs, it should usually be only a
-     * component of the full column name. If you have a full column name, use
-     * getColumnDefinitionFromColumnName instead.
      */
+    public ColumnDefinition getColumnDefinition(ColumnIdentifier name)
+    {
+        return columnMetadata.get(name.bytes);
+    }
+
+    // In general it is preferable to work with ColumnIdentifier to make it
+    // clear that we are talking about a CQL column, not a cell name, but there
+    // are a few cases where all we have is a ByteBuffer (when dealing with IndexExpression
+    // for instance) so...
     public ColumnDefinition getColumnDefinition(ByteBuffer name)
     {
-            return column_metadata.get(name);
+        return columnMetadata.get(name);
     }
 
     /**
-     * Returns a ColumnDefinition given a full (internal) column name.
+     * Returns a ColumnDefinition given a cell name.
      */
-    public ColumnDefinition getColumnDefinitionFromColumnName(ByteBuffer columnName)
+    public ColumnDefinition getColumnDefinition(CellName cellName)
     {
-        if (!isSuper() && (comparator instanceof CompositeType))
-        {
-            CompositeType composite = (CompositeType)comparator;
-            ByteBuffer[] components = composite.split(columnName);
-            for (ColumnDefinition def : regularAndStaticColumns())
-            {
-                ByteBuffer toCompare;
-                if (def.componentIndex == null)
-                {
-                    toCompare = columnName;
-                }
-                else
-                {
-                    if (def.componentIndex >= components.length)
-                        continue;
+        ColumnIdentifier id = cellName.cql3ColumnName(this);
+        ColumnDefinition def = id == null
+                             ? getColumnDefinition(cellName.toByteBuffer())  // Means a dense layout, try the full column name
+                             : getColumnDefinition(id);
 
-                    toCompare = components[def.componentIndex];
-                }
-                if (def.name.equals(toCompare))
-                    return def;
-            }
-            return null;
-        }
-        else
-        {
-            ColumnDefinition def = column_metadata.get(columnName);
-            // It's possible that the def is a PRIMARY KEY or COMPACT_VALUE one in case a concrete cell
-            // name conflicts with a CQL column name, which can happen in 2 cases:
-            // 1) because the user inserted a cell through Thrift that conflicts with a default "alias" used
-            //    by CQL for thrift tables (see #6892).
-            // 2) for COMPACT STORAGE tables with a single utf8 clustering column, the cell name can be anything,
-            //    including a CQL column name (without this being a problem).
-            // In any case, this is fine, this just mean that columnDefinition is not the ColumnDefinition we are
-            // looking for.
-            return def != null && def.isPartOfCellName() ? def : null;
-        }
+        // It's possible that the def is a PRIMARY KEY or COMPACT_VALUE one in case a concrete cell
+        // name conflicts with a CQL column name, which can happen in 2 cases:
+        // 1) because the user inserted a cell through Thrift that conflicts with a default "alias" used
+        //    by CQL for thrift tables (see #6892).
+        // 2) for COMPACT STORAGE tables with a single utf8 clustering column, the cell name can be anything,
+        //    including a CQL column name (without this being a problem).
+        // In any case, this is fine, this just means that columnDefinition is not the ColumnDefinition we are
+        // looking for.
+        return def != null && def.isPartOfCellName() ? def : null;
     }
 
     public ColumnDefinition getColumnDefinitionForIndex(String indexName)
     {
-        for (ColumnDefinition def : column_metadata.values())
+        for (ColumnDefinition def : allColumns())
         {
             if (indexName.equals(def.getIndexName()))
                 return def;
@@ -1329,14 +1422,12 @@
         {
             CFMetaData cfm = Schema.instance.getCFMetaData(cfId);
 
-            for (Map.Entry<ByteBuffer, ColumnDefinition> entry : column_metadata.entrySet())
+            for (ColumnDefinition newDef : allColumns())
             {
-                ColumnDefinition newDef = entry.getValue();
-
-                if (!cfm.column_metadata.containsKey(entry.getKey()) || newDef.getIndexType() == null)
+                if (!cfm.columnMetadata.containsKey(newDef.name.bytes) || newDef.getIndexType() == null)
                     continue;
 
-                String oldIndexName = cfm.column_metadata.get(entry.getKey()).getIndexName();
+                String oldIndexName = cfm.getColumnDefinition(newDef.name).getIndexName();
 
                 if (oldIndexName == null)
                     continue;
@@ -1349,11 +1440,11 @@
         }
 
         Set<String> existingNames = existingIndexNames(null);
-        for (ColumnDefinition column : column_metadata.values())
+        for (ColumnDefinition column : allColumns())
         {
             if (column.getIndexType() != null && column.getIndexName() == null)
             {
-                String baseName = getDefaultIndexName(cfName, getColumnDefinitionComparator(column), column.name);
+                String baseName = getDefaultIndexName(cfName, column.name);
                 String indexName = baseName;
                 int i = 0;
                 while (existingNames.contains(indexName))
@@ -1363,21 +1454,24 @@
         }
     }
 
-    public static String getDefaultIndexName(String cfName, AbstractType<?> comparator, ByteBuffer columnName)
+    public static String getDefaultIndexName(String cfName, ColumnIdentifier columnName)
     {
-        return (cfName + "_" + comparator.getString(columnName) + "_idx").replaceAll("\\W", "");
+        return (cfName + "_" + columnName + "_idx").replaceAll("\\W", "");
     }
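A quick example of the name generation above (hypothetical inputs):

    // getDefaultIndexName("users", <ColumnIdentifier "email">)  -> "users_email_idx"
    // getDefaultIndexName("users", <ColumnIdentifier "e-mail">) -> "users_email_idx"
    //   (replaceAll("\\W", "") strips every non-word character from the concatenation)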
 
-    public Iterator<OnDiskAtom> getOnDiskIterator(DataInput in, int count, Descriptor.Version version)
+    public Iterator<OnDiskAtom> getOnDiskIterator(DataInput in, Descriptor.Version version)
     {
-        return getOnDiskIterator(in, count, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
+        return getOnDiskIterator(in, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
     }
 
-    public Iterator<OnDiskAtom> getOnDiskIterator(DataInput in, int count, ColumnSerializer.Flag flag, int expireBefore, Descriptor.Version version)
+    public Iterator<OnDiskAtom> getOnDiskIterator(DataInput in, ColumnSerializer.Flag flag, int expireBefore, Descriptor.Version version)
     {
-        if (version.hasSuperColumns && cfType == ColumnFamilyType.Super)
-            return SuperColumns.onDiskIterator(in, count, flag, expireBefore);
-        return Column.onDiskIterator(in, count, flag, expireBefore, version);
+        return AbstractCell.onDiskIterator(in, flag, expireBefore, version, comparator);
+    }
+
+    public AtomDeserializer getOnDiskDeserializer(DataInput in, Descriptor.Version version)
+    {
+        return new AtomDeserializer(comparator, in, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
     }
 
     public static boolean isNameValid(String name)
@@ -1402,8 +1496,11 @@
         if (cfType == null)
             throw new ConfigurationException(String.format("Invalid column family type for %s", cfName));
 
-        if (comparator instanceof CounterColumnType)
-            throw new ConfigurationException("CounterColumnType is not a valid comparator");
+        for (int i = 0; i < comparator.size(); i++)
+        {
+            if (comparator.subtype(i) instanceof CounterColumnType)
+                throw new ConfigurationException("CounterColumnType is not a valid comparator");
+        }
         if (keyValidator instanceof CounterColumnType)
             throw new ConfigurationException("CounterColumnType is not a valid key validator");
 
@@ -1411,39 +1508,20 @@
         if (defaultValidator instanceof CounterColumnType)
         {
             for (ColumnDefinition def : regularAndStaticColumns())
-                if (!(def.getValidator() instanceof CounterColumnType))
-                    throw new ConfigurationException("Cannot add a non counter column (" + getColumnDefinitionComparator(def).getString(def.name) + ") in a counter column family");
+                if (!(def.type instanceof CounterColumnType))
+                    throw new ConfigurationException("Cannot add a non counter column (" + def.name + ") in a counter column family");
         }
         else
         {
-            for (ColumnDefinition def : column_metadata.values())
-                if (def.getValidator() instanceof CounterColumnType)
-                    throw new ConfigurationException("Cannot add a counter column (" + getColumnDefinitionComparator(def).getString(def.name) + ") in a non counter column family");
+            for (ColumnDefinition def : allColumns())
+                if (def.type instanceof CounterColumnType)
+                    throw new ConfigurationException("Cannot add a counter column (" + def.name + ") in a non counter column family");
         }
 
-        for (ColumnDefinition def : partitionKeyColumns)
-            validateAlias(def, "Key");
-        for (ColumnDefinition def : clusteringKeyColumns)
-            validateAlias(def, "Column");
-        if (compactValueColumn != null)
-            validateAlias(compactValueColumn, "Value");
-
         // initialize a set of names NOT in the CF under consideration
         Set<String> indexNames = existingIndexNames(cfName);
-        for (ColumnDefinition c : column_metadata.values())
+        for (ColumnDefinition c : allColumns())
         {
-            AbstractType<?> comparator = getColumnDefinitionComparator(c);
-
-            try
-            {
-                comparator.validate(c.name);
-            }
-            catch (MarshalException e)
-            {
-                throw new ConfigurationException(String.format("Column name %s is not valid for comparator %s",
-                                                               ByteBufferUtil.bytesToHex(c.name), comparator));
-            }
-
             if (c.getIndexType() == null)
             {
                 if (c.getIndexName() != null)
@@ -1476,33 +1554,21 @@
         if (bloomFilterFpChance != null && bloomFilterFpChance == 0)
             throw new ConfigurationException("Zero false positives is impossible; bloom filter false positive chance bffpc must be 0 < bffpc <= 1");
 
+        validateIndexIntervalThresholds();
+
         return this;
     }
 
     private static Set<String> existingIndexNames(String cfToExclude)
     {
-        Set<String> indexNames = new HashSet<String>();
+        Set<String> indexNames = new HashSet<>();
         for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-        {
             if (cfToExclude == null || !cfs.name.equals(cfToExclude))
                 for (ColumnDefinition cd : cfs.metadata.allColumns())
                     indexNames.add(cd.getIndexName());
-        }
         return indexNames;
     }
 
-    private static void validateAlias(ColumnDefinition alias, String msg) throws ConfigurationException
-    {
-        try
-        {
-            UTF8Type.instance.validate(alias.name);
-        }
-        catch (MarshalException e)
-        {
-            throw new ConfigurationException(msg + " alias must be UTF8");
-        }
-    }
-
     private void validateCompactionThresholds() throws ConfigurationException
     {
         if (maxCompactionThreshold == 0)
@@ -1520,6 +1586,15 @@
                                                             minCompactionThreshold, maxCompactionThreshold));
     }
 
+    private void validateIndexIntervalThresholds() throws ConfigurationException
+    {
+        if (minIndexInterval <= 0)
+            throw new ConfigurationException(String.format("Min index interval must be greater than 0 (got %d).", minIndexInterval));
+        if (maxIndexInterval < minIndexInterval)
+            throw new ConfigurationException(String.format("Max index interval (%d) must be greater than the min index " +
+                                                           "interval (%d).", maxIndexInterval, minIndexInterval));
+    }
+
     /**
      * Create schema mutations to update this metadata to provided new state.
      *
@@ -1529,47 +1604,47 @@
      *
      * @return Difference between attributes in form of schema mutation
      */
-    public RowMutation toSchemaUpdate(CFMetaData newState, long modificationTimestamp, boolean fromThrift)
+    public Mutation toSchemaUpdate(CFMetaData newState, long modificationTimestamp, boolean fromThrift)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
 
-        newState.toSchemaNoColumnsNoTriggers(rm, modificationTimestamp);
+        newState.toSchemaNoColumnsNoTriggers(mutation, modificationTimestamp);
 
-        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(column_metadata, newState.column_metadata);
+        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(columnMetadata, newState.columnMetadata);
 
         // columns that are no longer needed
         for (ColumnDefinition cd : columnDiff.entriesOnlyOnLeft().values())
         {
             // Thrift only knows about the REGULAR ColumnDefinition type, so don't consider other type
             // are being deleted just because they are not here.
-            if (fromThrift && cd.type != ColumnDefinition.Type.REGULAR)
+            if (fromThrift && cd.kind != ColumnDefinition.Kind.REGULAR)
                 continue;
 
-            cd.deleteFromSchema(rm, cfName, getColumnDefinitionComparator(cd), modificationTimestamp);
+            cd.deleteFromSchema(mutation, modificationTimestamp);
         }
 
         // newly added columns
         for (ColumnDefinition cd : columnDiff.entriesOnlyOnRight().values())
-            cd.toSchema(rm, cfName, getColumnDefinitionComparator(cd), modificationTimestamp);
+            cd.toSchema(mutation, modificationTimestamp);
 
         // old columns with updated attributes
         for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
         {
-            ColumnDefinition cd = newState.getColumnDefinition(name);
-            cd.toSchema(rm, cfName, getColumnDefinitionComparator(cd), modificationTimestamp);
+            ColumnDefinition cd = newState.columnMetadata.get(name);
+            cd.toSchema(mutation, modificationTimestamp);
         }
 
         MapDifference<String, TriggerDefinition> triggerDiff = Maps.difference(triggers, newState.triggers);
 
         // dropped triggers
         for (TriggerDefinition td : triggerDiff.entriesOnlyOnLeft().values())
-            td.deleteFromSchema(rm, cfName, modificationTimestamp);
+            td.deleteFromSchema(mutation, cfName, modificationTimestamp);
 
         // newly created triggers
         for (TriggerDefinition td : triggerDiff.entriesOnlyOnRight().values())
-            td.toSchema(rm, cfName, modificationTimestamp);
+            td.toSchema(mutation, cfName, modificationTimestamp);
 
-        return rm;
+        return mutation;
     }
 
     /**
@@ -1577,111 +1652,145 @@
      *
      * @param timestamp Timestamp to use
      *
-     * @return RowMutation to use to completely remove cf from schema
+     * @return Mutation to use to completely remove cf from schema
      */
-    public RowMutation dropFromSchema(long timestamp)
+    public Mutation dropFromSchema(long timestamp)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
-        ColumnFamily cf = rm.addOrGet(SchemaColumnFamiliesCf);
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
+        ColumnFamily cf = mutation.addOrGet(SchemaColumnFamiliesCf);
         int ldt = (int) (System.currentTimeMillis() / 1000);
 
-        ColumnNameBuilder builder = SchemaColumnFamiliesCf.getCfDef().getColumnNameBuilder();
-        builder.add(ByteBufferUtil.bytes(cfName));
-        cf.addAtom(new RangeTombstone(builder.build(), builder.buildAsEndOfRange(), timestamp, ldt));
+        Composite prefix = SchemaColumnFamiliesCf.comparator.make(cfName);
+        cf.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
 
-        for (ColumnDefinition cd : column_metadata.values())
-            cd.deleteFromSchema(rm, cfName, getColumnDefinitionComparator(cd), timestamp);
+        for (ColumnDefinition cd : allColumns())
+            cd.deleteFromSchema(mutation, timestamp);
 
         for (TriggerDefinition td : triggers.values())
-            td.deleteFromSchema(rm, cfName, timestamp);
+            td.deleteFromSchema(mutation, cfName, timestamp);
 
-        return rm;
+        for (String indexName : Keyspace.open(this.ksName).getColumnFamilyStore(this.cfName).getBuiltIndexes())
+        {
+            ColumnFamily indexCf = mutation.addOrGet(IndexCf);
+            indexCf.addTombstone(indexCf.getComparator().makeCellName(indexName), ldt, timestamp);
+        }
+
+        return mutation;
     }
 
-    public void toSchema(RowMutation rm, long timestamp)
+    public boolean isPurged()
     {
-        toSchemaNoColumnsNoTriggers(rm, timestamp);
-
-        for (TriggerDefinition td : triggers.values())
-            td.toSchema(rm, cfName, timestamp);
-
-        for (ColumnDefinition cd : column_metadata.values())
-            cd.toSchema(rm, cfName, getColumnDefinitionComparator(cd), timestamp);
+        return isPurged;
     }
 
-    private void toSchemaNoColumnsNoTriggers(RowMutation rm, long timestamp)
+    void markPurged()
+    {
+        isPurged = true;
+    }
+
+    public void toSchema(Mutation mutation, long timestamp)
+    {
+        toSchemaNoColumnsNoTriggers(mutation, timestamp);
+
+        for (TriggerDefinition td : triggers.values())
+            td.toSchema(mutation, cfName, timestamp);
+
+        for (ColumnDefinition cd : allColumns())
+            cd.toSchema(mutation, timestamp);
+    }
+
+    private void toSchemaNoColumnsNoTriggers(Mutation mutation, long timestamp)
     {
         // For property that can be null (and can be changed), we insert tombstones, to make sure
         // we don't keep a property the user has removed
-        ColumnFamily cf = rm.addOrGet(SchemaColumnFamiliesCf);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
+        ColumnFamily cf = mutation.addOrGet(SchemaColumnFamiliesCf);
+        Composite prefix = SchemaColumnFamiliesCf.comparator.make(cfName);
+        CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
 
-        cf.addColumn(Column.create("", timestamp, cfName, ""));
-        cf.addColumn(Column.create(cfType.toString(), timestamp, cfName, "type"));
+        adder.add("cf_id", cfId);
+        adder.add("type", cfType.toString());
 
         if (isSuper())
         {
             // We need to continue saving the comparator and subcomparator separately, otherwise
             // we won't know at deserialization if the subcomparator should be taken into account
             // TODO: we should implement an on-start migration if we want to get rid of that.
-            CompositeType ct = (CompositeType)comparator;
-            cf.addColumn(Column.create(ct.types.get(0).toString(), timestamp, cfName, "comparator"));
-            cf.addColumn(Column.create(ct.types.get(1).toString(), timestamp, cfName, "subcomparator"));
+            adder.add("comparator", comparator.subtype(0).toString());
+            adder.add("subcomparator", comparator.subtype(1).toString());
         }
         else
         {
-            cf.addColumn(Column.create(comparator.toString(), timestamp, cfName, "comparator"));
+            adder.add("comparator", comparator.toString());
         }
 
-        cf.addColumn(comment == null ? DeletedColumn.create(ldt, timestamp, cfName, "comment")
-                                     : Column.create(comment, timestamp, cfName, "comment"));
-        cf.addColumn(Column.create(readRepairChance, timestamp, cfName, "read_repair_chance"));
-        cf.addColumn(Column.create(dcLocalReadRepairChance, timestamp, cfName, "local_read_repair_chance"));
-        cf.addColumn(Column.create(replicateOnWrite, timestamp, cfName, "replicate_on_write"));
-        cf.addColumn(Column.create(populateIoCacheOnFlush, timestamp, cfName, "populate_io_cache_on_flush"));
-        cf.addColumn(Column.create(gcGraceSeconds, timestamp, cfName, "gc_grace_seconds"));
-        cf.addColumn(Column.create(defaultValidator.toString(), timestamp, cfName, "default_validator"));
-        cf.addColumn(Column.create(keyValidator.toString(), timestamp, cfName, "key_validator"));
-        cf.addColumn(Column.create(minCompactionThreshold, timestamp, cfName, "min_compaction_threshold"));
-        cf.addColumn(Column.create(maxCompactionThreshold, timestamp, cfName, "max_compaction_threshold"));
-        cf.addColumn(bloomFilterFpChance == null ? DeletedColumn.create(ldt, timestamp, cfName, "bloomFilterFpChance")
-                                                 : Column.create(bloomFilterFpChance, timestamp, cfName, "bloom_filter_fp_chance"));
-        cf.addColumn(Column.create(memtableFlushPeriod, timestamp, cfName, "memtable_flush_period_in_ms"));
-        cf.addColumn(Column.create(caching.toString(), timestamp, cfName, "caching"));
-        cf.addColumn(Column.create(defaultTimeToLive, timestamp, cfName, "default_time_to_live"));
-        cf.addColumn(Column.create(compactionStrategyClass.getName(), timestamp, cfName, "compaction_strategy_class"));
-        cf.addColumn(Column.create(json(compressionParameters.asThriftOptions()), timestamp, cfName, "compression_parameters"));
-        cf.addColumn(Column.create(json(compactionStrategyOptions), timestamp, cfName, "compaction_strategy_options"));
-        cf.addColumn(Column.create(indexInterval, timestamp, cfName, "index_interval"));
-        cf.addColumn(Column.create(speculativeRetry.toString(), timestamp, cfName, "speculative_retry"));
+        adder.add("comment", comment);
+        adder.add("read_repair_chance", readRepairChance);
+        adder.add("local_read_repair_chance", dcLocalReadRepairChance);
+        adder.add("gc_grace_seconds", gcGraceSeconds);
+        adder.add("default_validator", defaultValidator.toString());
+        adder.add("key_validator", keyValidator.toString());
+        adder.add("min_compaction_threshold", minCompactionThreshold);
+        adder.add("max_compaction_threshold", maxCompactionThreshold);
+        adder.add("bloom_filter_fp_chance", bloomFilterFpChance);
 
-        for (Map.Entry<ByteBuffer, Long> entry : droppedColumns.entrySet())
-            cf.addColumn(new Column(makeDroppedColumnName(entry.getKey()), LongType.instance.decompose(entry.getValue()), timestamp));
+        adder.add("memtable_flush_period_in_ms", memtableFlushPeriod);
+        adder.add("caching", caching.toString());
+        adder.add("default_time_to_live", defaultTimeToLive);
+        adder.add("compaction_strategy_class", compactionStrategyClass.getName());
+        adder.add("compression_parameters", json(compressionParameters.asThriftOptions()));
+        adder.add("compaction_strategy_options", json(compactionStrategyOptions));
+        adder.add("min_index_interval", minIndexInterval);
+        adder.add("max_index_interval", maxIndexInterval);
+        adder.add("index_interval", null);
+        adder.add("speculative_retry", speculativeRetry.toString());
 
-        cf.addColumn(isDense == null ? DeletedColumn.create(ldt, timestamp, cfName, "is_dense")
-                                     : Column.create(isDense, timestamp, cfName, "is_dense"));
+        for (Map.Entry<ColumnIdentifier, Long> entry : droppedColumns.entrySet())
+            adder.addMapEntry("dropped_columns", entry.getKey().toString(), entry.getValue());
+
+        adder.add("is_dense", isDense);
 
         // Save the CQL3 metadata "the old way" for compatibility's sake
-        cf.addColumn(Column.create(aliasesToJson(partitionKeyColumns), timestamp, cfName, "key_aliases"));
-        cf.addColumn(Column.create(aliasesToJson(clusteringKeyColumns), timestamp, cfName, "column_aliases"));
-        cf.addColumn(compactValueColumn == null ? DeletedColumn.create(ldt, timestamp, cfName, "value_alias")
-                                                : Column.create(compactValueColumn.name, timestamp, cfName, "value_alias"));
+        adder.add("key_aliases", aliasesToJson(partitionKeyColumns));
+        adder.add("column_aliases", aliasesToJson(clusteringColumns));
+        adder.add("value_alias", compactValueColumn == null ? null : compactValueColumn.name.toString());
     }
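The serialization code above funnels every schema cell through a single CFRowAdder built from the row prefix instead of hand-assembling Column objects. A minimal sketch of the pattern, using only the constructor and add/addMapEntry calls that appear elsewhere in this diff; the single-argument prefix for schema_columnfamilies, the null-value-clears-cell behaviour, and the droppedName/droppedTimestamp locals are assumptions for illustration:

    // Sketch only: the CFRowAdder pattern used by the serialization above.
    ColumnFamily cf = mutation.addOrGet(CFMetaData.SchemaColumnFamiliesCf);
    Composite prefix = CFMetaData.SchemaColumnFamiliesCf.comparator.make(cfName); // assumed: prefix built from table name alone
    CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
    adder.add("gc_grace_seconds", gcGraceSeconds);                  // plain cell
    adder.add("index_interval", null);                              // assumed to clear the pre-2.1 cell
    adder.addMapEntry("dropped_columns", droppedName, droppedTimestamp); // one map entry per dropped column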
 
     // Package protected for use by tests
-    static CFMetaData fromSchemaNoColumnsNoTriggers(UntypedResultSet.Row result)
+    static CFMetaData fromSchemaNoTriggers(UntypedResultSet.Row result, UntypedResultSet serializedColumnDefinitions)
     {
         try
         {
-            CFMetaData cfm = new CFMetaData(result.getString("keyspace_name"),
-                                            result.getString("columnfamily_name"),
-                                            ColumnFamilyType.valueOf(result.getString("type")),
-                                            TypeParser.parse(result.getString("comparator")),
-                                            result.has("subcomparator") ? TypeParser.parse(result.getString("subcomparator")) : null);
+            String ksName = result.getString("keyspace_name");
+            String cfName = result.getString("columnfamily_name");
+
+            AbstractType<?> rawComparator = TypeParser.parse(result.getString("comparator"));
+            AbstractType<?> subComparator = result.has("subcomparator") ? TypeParser.parse(result.getString("subcomparator")) : null;
+            ColumnFamilyType cfType = ColumnFamilyType.valueOf(result.getString("type"));
+
+            AbstractType<?> fullRawComparator = makeRawAbstractType(rawComparator, subComparator);
+
+            List<ColumnDefinition> columnDefs = ColumnDefinition.fromSchema(serializedColumnDefinitions,
+                                                                            ksName,
+                                                                            cfName,
+                                                                            fullRawComparator,
+                                                                            cfType == ColumnFamilyType.Super);
+
+            boolean isDense = result.has("is_dense")
+                            ? result.getBoolean("is_dense")
+                            : calculateIsDense(fullRawComparator, columnDefs);
+
+            CellNameType comparator = CellNames.fromAbstractType(fullRawComparator, isDense);
+
+            // if we are upgrading, we initially use an id generated from the names
+            UUID cfId = result.has("cf_id")
+                      ? result.getUUID("cf_id")
+                      : generateLegacyCfId(ksName, cfName);
+
+            CFMetaData cfm = new CFMetaData(ksName, cfName, cfType, comparator, cfId);
+            cfm.isDense(isDense);
 
             cfm.readRepairChance(result.getDouble("read_repair_chance"));
             cfm.dcLocalReadRepairChance(result.getDouble("local_read_repair_chance"));
-            cfm.replicateOnWrite(result.getBoolean("replicate_on_write"));
             cfm.gcGraceSeconds(result.getInt("gc_grace_seconds"));
             cfm.defaultValidator(TypeParser.parse(result.getString("default_validator")));
             cfm.keyValidator(TypeParser.parse(result.getString("key_validator")));
@@ -1693,7 +1802,7 @@
                 cfm.bloomFilterFpChance(result.getDouble("bloom_filter_fp_chance"));
             if (result.has("memtable_flush_period_in_ms"))
                 cfm.memtableFlushPeriod(result.getInt("memtable_flush_period_in_ms"));
-            cfm.caching(Caching.valueOf(result.getString("caching")));
+            cfm.caching(CachingOptions.fromString(result.getString("caching")));
             if (result.has("default_time_to_live"))
                 cfm.defaultTimeToLive(result.getInt("default_time_to_live"));
             if (result.has("speculative_retry"))
@@ -1701,45 +1810,36 @@
             cfm.compactionStrategyClass(createCompactionStrategy(result.getString("compaction_strategy_class")));
             cfm.compressionParameters(CompressionParameters.create(fromJsonMap(result.getString("compression_parameters"))));
             cfm.compactionStrategyOptions(fromJsonMap(result.getString("compaction_strategy_options")));
-            if (result.has("index_interval"))
-            {
-                cfm.indexInterval(result.getInt("index_interval"));
-            }
-            else
-            {
-                if (DatabaseDescriptor.getIndexInterval() != null)
-                {
-                    // use index_interval set in cassandra.yaml as default value (in memory only)
-                    cfm.indexInterval(DatabaseDescriptor.getIndexInterval());
-                }
-            }
-            if (result.has("populate_io_cache_on_flush"))
-                cfm.populateIoCacheOnFlush(result.getBoolean("populate_io_cache_on_flush"));
 
-            if (result.has("is_dense"))
-                cfm.isDense(result.getBoolean("is_dense"));
+            // migrate old index_interval values to min_index_interval, if present
+            if (result.has("min_index_interval"))
+                cfm.minIndexInterval(result.getInt("min_index_interval"));
+            else if (result.has("index_interval"))
+                cfm.minIndexInterval(result.getInt("index_interval"));
+            if (result.has("max_index_interval"))
+                cfm.maxIndexInterval(result.getInt("max_index_interval"));
 
             /*
-             * The info previously hold by key_aliases, column_aliases and value_alias is now stored in column_metadata (because 1) this
+             * The info previously held by key_aliases, column_aliases and value_alias is now stored in columnMetadata (because 1) this
              * makes more sense and 2) this allows storing indexing information).
              * However, for upgrade's sake we still need to be able to read those old values. Moreover, we cannot easily
-             * remove those old columns once "converted" to column_metadata because that would screw up nodes that may
+             * remove those old columns once "converted" to columnMetadata because that would screw up nodes that may
              * not have upgraded. So for now we keep both in sync, even though it's redundant.
-             * In other words, the ColumnDefinition the following lines add may be replaced later when ColumnDefinition.fromSchema
-             * is called but that's ok.
              */
             if (result.has("key_aliases"))
-                cfm.addColumnMetadataFromAliases(aliasesFromStrings(fromJsonList(result.getString("key_aliases"))), cfm.keyValidator, ColumnDefinition.Type.PARTITION_KEY);
+                cfm.addColumnMetadataFromAliases(aliasesFromStrings(fromJsonList(result.getString("key_aliases"))), cfm.keyValidator, ColumnDefinition.Kind.PARTITION_KEY);
             if (result.has("column_aliases"))
-                cfm.addColumnMetadataFromAliases(aliasesFromStrings(fromJsonList(result.getString("column_aliases"))), cfm.comparator, ColumnDefinition.Type.CLUSTERING_KEY);
-
+                cfm.addColumnMetadataFromAliases(aliasesFromStrings(fromJsonList(result.getString("column_aliases"))), cfm.comparator.asAbstractType(), ColumnDefinition.Kind.CLUSTERING_COLUMN);
             if (result.has("value_alias"))
-                cfm.addColumnMetadataFromAliases(Collections.<ByteBuffer>singletonList(result.getBytes("value_alias")), cfm.defaultValidator, ColumnDefinition.Type.COMPACT_VALUE);
+                cfm.addColumnMetadataFromAliases(Collections.singletonList(result.getBytes("value_alias")), cfm.defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 
             if (result.has("dropped_columns"))
                 cfm.droppedColumns(convertDroppedColumns(result.getMap("dropped_columns", UTF8Type.instance, LongType.instance)));
 
-            return cfm;
+            for (ColumnDefinition cd : columnDefs)
+                cfm.addOrReplaceColumnDefinition(cd);
+
+            return cfm.rebuild();
         }
         catch (SyntaxException | ConfigurationException e)
         {
@@ -1747,7 +1847,7 @@
         }
     }
 
-    public void addColumnMetadataFromAliases(List<ByteBuffer> aliases, AbstractType<?> comparator, ColumnDefinition.Type type)
+    public void addColumnMetadataFromAliases(List<ByteBuffer> aliases, AbstractType<?> comparator, ColumnDefinition.Kind kind)
     {
         if (comparator instanceof CompositeType)
         {
@@ -1755,14 +1855,16 @@
             for (int i = 0; i < aliases.size(); ++i)
             {
                 if (aliases.get(i) != null)
-                    column_metadata.put(aliases.get(i), new ColumnDefinition(aliases.get(i), ct.types.get(i), i, type));
+                {
+                    addOrReplaceColumnDefinition(new ColumnDefinition(this, aliases.get(i), ct.types.get(i), i, kind));
+                }
             }
         }
         else
         {
             assert aliases.size() <= 1;
             if (!aliases.isEmpty() && aliases.get(0) != null)
-                column_metadata.put(aliases.get(0), new ColumnDefinition(aliases.get(0), comparator, null, type));
+                addOrReplaceColumnDefinition(new ColumnDefinition(this, aliases.get(0), comparator, null, kind));
         }
     }
 
@@ -1773,13 +1875,16 @@
      */
     public static CFMetaData fromSchema(UntypedResultSet.Row result)
     {
-        CFMetaData cfDef = fromSchemaNoColumnsNoTriggers(result);
+        String ksName = result.getString("keyspace_name");
+        String cfName = result.getString("columnfamily_name");
 
-        Row serializedTriggers = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_TRIGGERS_CF, cfDef.ksName, cfDef.cfName);
-        addTriggerDefinitionsFromSchema(cfDef, serializedTriggers);
+        Row serializedColumns = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_COLUMNS_CF, ksName, cfName);
+        CFMetaData cfm = fromSchemaNoTriggers(result, ColumnDefinition.resultify(serializedColumns));
 
-        Row serializedColumns = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_COLUMNS_CF, cfDef.ksName, cfDef.cfName);
-        return addColumnDefinitionsFromSchema(cfDef, serializedColumns);
+        Row serializedTriggers = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_TRIGGERS_CF, ksName, cfName);
+        addTriggerDefinitionsFromSchema(cfm, serializedTriggers);
+
+        return cfm;
     }
 
     private static CFMetaData fromSchema(Row row)
@@ -1790,36 +1895,31 @@
 
     private String aliasesToJson(List<ColumnDefinition> rawAliases)
     {
-        List<String> aliases = new ArrayList<String>(rawAliases.size());
+        if (rawAliases == null)
+            return null;
+
+        List<String> aliases = new ArrayList<>(rawAliases.size());
         for (ColumnDefinition rawAlias : rawAliases)
-            aliases.add(UTF8Type.instance.compose(rawAlias.name));
+            aliases.add(rawAlias.name.toString());
         return json(aliases);
     }
 
     private static List<ByteBuffer> aliasesFromStrings(List<String> aliases)
     {
-        List<ByteBuffer> rawAliases = new ArrayList<ByteBuffer>(aliases.size());
+        List<ByteBuffer> rawAliases = new ArrayList<>(aliases.size());
         for (String alias : aliases)
             rawAliases.add(UTF8Type.instance.decompose(alias));
         return rawAliases;
     }
 
-    private static Map<ByteBuffer, Long> convertDroppedColumns(Map<String, Long> raw)
+    private static Map<ColumnIdentifier, Long> convertDroppedColumns(Map<String, Long> raw)
     {
-        Map<ByteBuffer, Long> converted = Maps.newHashMap();
+        Map<ColumnIdentifier, Long> converted = Maps.newHashMap();
         for (Map.Entry<String, Long> entry : raw.entrySet())
-            converted.put(UTF8Type.instance.decompose(entry.getKey()), entry.getValue());
+            converted.put(new ColumnIdentifier(entry.getKey(), true), entry.getValue());
         return converted;
     }
 
-    private ByteBuffer makeDroppedColumnName(ByteBuffer column)
-    {
-        ColumnNameBuilder builder = SchemaColumnFamiliesCf.cqlCfDef.getColumnNameBuilder();
-        builder.add(UTF8Type.instance.decompose(cfName));
-        builder.add(UTF8Type.instance.decompose("dropped_columns"));
-        return builder.add(column).build();
-    }
-
     /**
      * Convert current metadata into schema mutation
      *
@@ -1829,72 +1929,67 @@
      *
      * @throws ConfigurationException if any of the attributes didn't pass validation
      */
-    public RowMutation toSchema(long timestamp) throws ConfigurationException
+    public Mutation toSchema(long timestamp) throws ConfigurationException
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
-        toSchema(rm, timestamp);
-        return rm;
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(ksName));
+        toSchema(mutation, timestamp);
+        return mutation;
     }
 
     // The comparator to validate the definition name.
 
     public AbstractType<?> getColumnDefinitionComparator(ColumnDefinition def)
     {
-        return getComponentComparator(def.componentIndex, def.type);
+        return getComponentComparator(def.isOnAllComponents() ? null : def.position(), def.kind);
     }
 
-    public AbstractType<?> getComponentComparator(Integer componentIndex, ColumnDefinition.Type type)
+    public AbstractType<?> getComponentComparator(Integer componentIndex, ColumnDefinition.Kind kind)
     {
-        switch (type)
+        switch (kind)
         {
             case REGULAR:
-                AbstractType<?> cfComparator = cfType == ColumnFamilyType.Super ? ((CompositeType)comparator).types.get(1) : comparator;
-                if (cfComparator instanceof CompositeType)
-                {
-                    if (componentIndex == null)
-                        return cfComparator;
+                if (componentIndex == null)
+                    return comparator.asAbstractType();
 
-                    List<AbstractType<?>> types = ((CompositeType)cfComparator).types;
-                    AbstractType<?> t = types.get(componentIndex);
-                    assert t != null : "Non-sensical component index";
-                    return t;
-                }
-                else
-                {
-                    return cfComparator;
-                }
+                AbstractType<?> t = comparator.subtype(componentIndex);
+                assert t != null : "Non-sensical component index";
+                return t;
             default:
                 // CQL3 column names are UTF8
                 return UTF8Type.instance;
         }
     }
 
-    // Package protected for use by tests
-    static CFMetaData addColumnDefinitionsFromSchema(CFMetaData cfDef, Row serializedColumnDefinitions)
+    public CFMetaData addAllColumnDefinitions(Collection<ColumnDefinition> defs)
     {
-        for (ColumnDefinition cd : ColumnDefinition.fromSchema(serializedColumnDefinitions, cfDef))
-            cfDef.column_metadata.put(cd.name, cd);
-        return cfDef.rebuild();
+        for (ColumnDefinition def : defs)
+            addOrReplaceColumnDefinition(def);
+        return this;
     }
 
-    public void addColumnDefinition(ColumnDefinition def) throws ConfigurationException
+    public CFMetaData addColumnDefinition(ColumnDefinition def) throws ConfigurationException
     {
-        if (column_metadata.containsKey(def.name))
-            throw new ConfigurationException(String.format("Cannot add column %s, a column with the same name already exists", getColumnDefinitionComparator(def).getString(def.name)));
+        if (columnMetadata.containsKey(def.name.bytes))
+            throw new ConfigurationException(String.format("Cannot add column %s, a column with the same name already exists", def.name));
 
-        addOrReplaceColumnDefinition(def);
+        return addOrReplaceColumnDefinition(def);
     }
 
     // This method doesn't check if a def of the same name already exists and should only be used when we
     // know this cannot happen.
-    public void addOrReplaceColumnDefinition(ColumnDefinition def)
+    public CFMetaData addOrReplaceColumnDefinition(ColumnDefinition def)
     {
-        column_metadata.put(def.name, def);
+        if (def.kind == ColumnDefinition.Kind.REGULAR)
+            comparator.addCQL3Column(def.name);
+        columnMetadata.put(def.name.bytes, def);
+        return this;
     }
 
     public boolean removeColumnDefinition(ColumnDefinition def)
     {
-        return column_metadata.remove(def.name) != null;
+        if (def.kind == ColumnDefinition.Kind.REGULAR)
+            comparator.removeCQL3Column(def.name);
+        return columnMetadata.remove(def.name.bytes) != null;
     }
 
     private static void addTriggerDefinitionsFromSchema(CFMetaData cfDef, Row serializedTriggerDefinitions)
@@ -1903,13 +1998,19 @@
             cfDef.triggers.put(td.name, td);
     }
 
-    public void addTriggerDefinition(TriggerDefinition def) throws ConfigurationException
+    public void addTriggerDefinition(TriggerDefinition def) throws InvalidRequestException
     {
-        if (triggers.containsKey(def.name))
-            throw new ConfigurationException(String.format("Cannot create trigger %s, a trigger with the same name already exists", def.name));
+        if (containsTriggerDefinition(def))
+            throw new InvalidRequestException(
+                String.format("Cannot create trigger %s, a trigger with the same name already exists", def.name));
         triggers.put(def.name, def);
     }
 
+    public boolean containsTriggerDefinition(TriggerDefinition def)
+    {
+        return triggers.containsKey(def.name);
+    }
+
     public boolean removeTrigger(String name)
     {
         return triggers.remove(name) != null;
@@ -1917,78 +2018,58 @@
 
     public void recordColumnDrop(ColumnDefinition def)
     {
-        assert def.componentIndex != null;
+        assert !def.isOnAllComponents();
         droppedColumns.put(def.name, FBUtilities.timestampMicros());
     }
 
-    public void renameColumn(ByteBuffer from, String strFrom, ByteBuffer to, String strTo) throws InvalidRequestException
+    public void renameColumn(ColumnIdentifier from, ColumnIdentifier to) throws InvalidRequestException
     {
-        ColumnDefinition def = column_metadata.get(from);
+        ColumnDefinition def = getColumnDefinition(from);
         if (def == null)
-            throw new InvalidRequestException(String.format("Cannot rename unknown column %s in keyspace %s", strFrom, cfName));
+            throw new InvalidRequestException(String.format("Cannot rename unknown column %s in keyspace %s", from, cfName));
 
-        if (column_metadata.get(to) != null)
-            throw new InvalidRequestException(String.format("Cannot rename column %s to %s in keyspace %s; another column of that name already exist", strFrom, strTo, cfName));
+        if (getColumnDefinition(to) != null)
+            throw new InvalidRequestException(String.format("Cannot rename column %s to %s in keyspace %s; another column of that name already exist", from, to, cfName));
 
         if (def.isPartOfCellName())
         {
-            throw new InvalidRequestException(String.format("Cannot rename non PRIMARY KEY part %s", strFrom));
+            throw new InvalidRequestException(String.format("Cannot rename non PRIMARY KEY part %s", from));
         }
         else if (def.isIndexed())
         {
-            throw new InvalidRequestException(String.format("Cannot rename column %s because it is secondary indexed", strFrom));
+            throw new InvalidRequestException(String.format("Cannot rename column %s because it is secondary indexed", from));
         }
 
-        ColumnDefinition newDef = def.cloneWithNewName(to);
+        ColumnDefinition newDef = def.withNewName(to);
         // don't call addColumnDefinition/removeColumnDefinition because we want to avoid recomputing
         // the CQL3 cfDef between those two operations
-        column_metadata.put(newDef.name, newDef);
-        column_metadata.remove(def.name);
+        addOrReplaceColumnDefinition(newDef);
+        removeColumnDefinition(def);
     }
 
     public CFMetaData rebuild()
     {
-        /*
-         * TODO: There is definitively some repetition between the CQL3  metadata stored in this
-         * object (partitionKeyColumns, ...) and the one stored in CFDefinition.
-         * Ultimately, we should probably merge both. However, there is enough details to fix that
-         * it's worth doing that in a separate issue.
-         */
-        rebuildCQL3Metadata();
-        cqlCfDef = new CFDefinition(this);
-        return this;
-    }
-
-    public CFDefinition getCfDef()
-    {
-        assert cqlCfDef != null;
-        return cqlCfDef;
-    }
-
-    private void rebuildCQL3Metadata()
-    {
-        List<ColumnDefinition> pkCols = nullInitializedList(keyValidator.componentsCount());
         if (isDense == null)
-            isDense(calculateIsDense(comparator, column_metadata.values()));
-        int nbCkCols = isDense
-                     ? comparator.componentsCount()
-                     : comparator.componentsCount() - (hasCollection() ? 2 : 1);
-        List<ColumnDefinition> ckCols = nullInitializedList(nbCkCols);
-        Set<ColumnDefinition> regCols = new HashSet<ColumnDefinition>();
-        Set<ColumnDefinition> statCols = new HashSet<ColumnDefinition>();
+            isDense(calculateIsDense(comparator.asAbstractType(), allColumns()));
+
+        List<ColumnDefinition> pkCols = nullInitializedList(keyValidator.componentsCount());
+        List<ColumnDefinition> ckCols = nullInitializedList(comparator.clusteringPrefixSize());
+        // We keep things sorted to get consistent/predictable order in select queries
+        SortedSet<ColumnDefinition> regCols = new TreeSet<>(regularColumnComparator);
+        SortedSet<ColumnDefinition> statCols = new TreeSet<>(regularColumnComparator);
         ColumnDefinition compactCol = null;
 
-        for (ColumnDefinition def : column_metadata.values())
+        for (ColumnDefinition def : allColumns())
         {
-            switch (def.type)
+            switch (def.kind)
             {
                 case PARTITION_KEY:
-                    assert !(def.componentIndex == null && keyValidator instanceof CompositeType);
-                    pkCols.set(def.componentIndex == null ? 0 : def.componentIndex, def);
+                    assert !(def.isOnAllComponents() && keyValidator instanceof CompositeType);
+                    pkCols.set(def.position(), def);
                     break;
-                case CLUSTERING_KEY:
-                    assert !(def.componentIndex == null && comparator instanceof CompositeType);
-                    ckCols.set(def.componentIndex == null ? 0 : def.componentIndex, def);
+                case CLUSTERING_COLUMN:
+                    assert !(def.isOnAllComponents() && comparator.isCompound());
+                    ckCols.set(def.position(), def);
                     break;
                 case REGULAR:
                     regCols.add(def);
@@ -2005,10 +2086,11 @@
 
         // Now actually assign the correct value. This is not atomic, but then again, updating CFMetaData is never atomic anyway.
         partitionKeyColumns = addDefaultKeyAliases(pkCols);
-        clusteringKeyColumns = addDefaultColumnAliases(ckCols);
+        clusteringColumns = addDefaultColumnAliases(ckCols);
         regularColumns = regCols;
         staticColumns = statCols;
         compactValueColumn = addDefaultValueAlias(compactCol);
+        return this;
     }
 
     private List<ColumnDefinition> addDefaultKeyAliases(List<ColumnDefinition> pkCols)
@@ -2027,8 +2109,8 @@
                 // For compatibility's sake, we call the first alias 'key' rather than 'key1'. This
                 // is inconsistent with the column aliases, but it's probably not worth risking breaking compatibility now.
                 ByteBuffer name = ByteBufferUtil.bytes(i == 0 ? DEFAULT_KEY_ALIAS : DEFAULT_KEY_ALIAS + (i + 1));
-                ColumnDefinition newDef = ColumnDefinition.partitionKeyDef(name, type, idx);
-                column_metadata.put(newDef.name, newDef);
+                ColumnDefinition newDef = ColumnDefinition.partitionKeyDef(this, name, type, idx);
+                addOrReplaceColumnDefinition(newDef);
                 pkCols.set(i, newDef);
             }
         }
@@ -2041,16 +2123,21 @@
         {
             if (ckCols.get(i) == null)
             {
-                Integer idx = null;
-                AbstractType<?> type = comparator;
-                if (comparator instanceof CompositeType)
+                Integer idx;
+                AbstractType<?> type;
+                if (comparator.isCompound())
                 {
                     idx = i;
-                    type = ((CompositeType)comparator).types.get(i);
+                    type = comparator.subtype(i);
+                }
+                else
+                {
+                    idx = null;
+                    type = comparator.asAbstractType();
                 }
                 ByteBuffer name = ByteBufferUtil.bytes(DEFAULT_COLUMN_ALIAS + (i + 1));
-                ColumnDefinition newDef = ColumnDefinition.clusteringKeyDef(name, type, idx);
-                column_metadata.put(newDef.name, newDef);
+                ColumnDefinition newDef = ColumnDefinition.clusteringKeyDef(this, name, type, idx);
+                addOrReplaceColumnDefinition(newDef);
                 ckCols.set(i, newDef);
             }
         }
@@ -2059,13 +2146,13 @@
 
     private ColumnDefinition addDefaultValueAlias(ColumnDefinition compactValueDef)
     {
-        if (isDense)
+        if (comparator.isDense())
         {
             if (compactValueDef != null)
                 return compactValueDef;
 
-            ColumnDefinition newDef = ColumnDefinition.compactValueDef(ByteBufferUtil.bytes(DEFAULT_VALUE_ALIAS), defaultValidator);
-            column_metadata.put(newDef.name, newDef);
+            ColumnDefinition newDef = ColumnDefinition.compactValueDef(this, ByteBufferUtil.bytes(DEFAULT_VALUE_ALIAS), defaultValidator);
+            addOrReplaceColumnDefinition(newDef);
             return newDef;
         }
         else
@@ -2075,15 +2162,6 @@
         }
     }
 
-    private boolean hasCollection()
-    {
-        if (isSuper() || !(comparator instanceof CompositeType))
-            return false;
-
-        List<AbstractType<?>> types = ((CompositeType)comparator).types;
-        return types.get(types.size() - 1) instanceof ColumnToCollectionType;
-    }
-
     /*
      * We call dense a CF for which each component of the comparator is a clustering column, i.e. no
      * component is used to store regular column names. In other words, non-composite static "thrift"
@@ -2107,7 +2185,7 @@
          *
          * So we need to recognize those special-case CQL3 tables with only a primary key. If we have some
          * clustering columns, we're fine as said above. So the only problem is that we cannot decide for
-         * sure if a CF without REGULAR columns nor CLUSTERING_KEY definition is meant to be dense, or if it
+         * sure if a CF without REGULAR columns nor CLUSTERING_COLUMN definition is meant to be dense, or if it
          * has been created in CQL3 by say:
          *    CREATE TABLE test (k int PRIMARY KEY)
          * in which case it should not be dense. However, we can limit our margin of error by assuming we are
@@ -2117,10 +2195,10 @@
         int maxClusteringIdx = -1;
         for (ColumnDefinition def : defs)
         {
-            switch (def.type)
+            switch (def.kind)
             {
-                case CLUSTERING_KEY:
-                    maxClusteringIdx = Math.max(maxClusteringIdx, def.componentIndex == null ? 0 : def.componentIndex);
+                case CLUSTERING_COLUMN:
+                    maxClusteringIdx = Math.max(maxClusteringIdx, def.position());
                     break;
                 case REGULAR:
                     hasRegular = true;
@@ -2142,6 +2220,11 @@
         return ct.types.size() == 1 && ct.types.get(0) instanceof UTF8Type;
     }
 
+    public boolean isCQL3Table()
+    {
+        return !isSuper() && !comparator.isDense() && comparator.isCompound();
+    }
+
     private static <T> List<T> nullInitializedList(int size)
     {
         List<T> l = new ArrayList<>(size);
@@ -2160,41 +2243,31 @@
         if (isSuper())
             return true;
 
-        for (ColumnDefinition def : column_metadata.values())
+        for (ColumnDefinition def : allColumns())
         {
             // Non-REGULAR ColumnDefinitions are not "thrift compatible" per se, but that's ok because they hold metadata
             // that is only of use to CQL3, so we will just skip them in toThrift.
-            if (def.type == ColumnDefinition.Type.REGULAR && !def.isThriftCompatible())
+            if (def.kind == ColumnDefinition.Kind.REGULAR && !def.isThriftCompatible())
                 return false;
         }
 
         // The table might also have no REGULAR columns (be PK-only), but still be "thrift incompatible". See #7832.
-        if (isCQL3OnlyPKComparator(comparator) && !isDense)
+        if (isCQL3OnlyPKComparator(comparator.asAbstractType()) && !isDense)
             return false;
 
         return true;
     }
 
+    public boolean isCounter()
+    {
+        return defaultValidator.isCounter();
+    }
+
     public boolean hasStaticColumns()
     {
         return !staticColumns.isEmpty();
     }
 
-    public ColumnNameBuilder getStaticColumnNameBuilder()
-    {
-        assert comparator instanceof CompositeType && clusteringKeyColumns().size() > 0;
-        CompositeType.Builder builder = CompositeType.Builder.staticBuilder((CompositeType)comparator);
-        for (int i = 0; i < clusteringKeyColumns().size(); i++)
-            builder.add(ByteBufferUtil.EMPTY_BYTE_BUFFER);
-        return builder;
-    }
-
-    public void validateColumns(Iterable<Column> columns)
-    {
-        for (Column column : columns)
-            column.validateFields(this);
-    }
-
     @Override
     public String toString()
     {
@@ -2206,26 +2279,25 @@
             .append("comparator", comparator)
             .append("comment", comment)
             .append("readRepairChance", readRepairChance)
-            .append("dclocalReadRepairChance", dcLocalReadRepairChance)
-            .append("replicateOnWrite", replicateOnWrite)
+            .append("dcLocalReadRepairChance", dcLocalReadRepairChance)
             .append("gcGraceSeconds", gcGraceSeconds)
             .append("defaultValidator", defaultValidator)
             .append("keyValidator", keyValidator)
             .append("minCompactionThreshold", minCompactionThreshold)
             .append("maxCompactionThreshold", maxCompactionThreshold)
-            .append("column_metadata", column_metadata)
+            .append("columnMetadata", columnMetadata.values())
             .append("compactionStrategyClass", compactionStrategyClass)
             .append("compactionStrategyOptions", compactionStrategyOptions)
-            .append("compressionOptions", compressionParameters.asThriftOptions())
+            .append("compressionParameters", compressionParameters.asThriftOptions())
             .append("bloomFilterFpChance", bloomFilterFpChance)
-            .append("memtable_flush_period_in_ms", memtableFlushPeriod)
+            .append("memtableFlushPeriod", memtableFlushPeriod)
             .append("caching", caching)
             .append("defaultTimeToLive", defaultTimeToLive)
-            .append("speculative_retry", speculativeRetry)
-            .append("indexInterval", indexInterval)
-            .append("populateIoCacheOnFlush", populateIoCacheOnFlush)
+            .append("minIndexInterval", minIndexInterval)
+            .append("maxIndexInterval", maxIndexInterval)
+            .append("speculativeRetry", speculativeRetry)
             .append("droppedColumns", droppedColumns)
-            .append("triggers", triggers)
+            .append("triggers", triggers.values())
             .append("isDense", isDense)
             .toString();
     }
diff --git a/src/java/org/apache/cassandra/config/ColumnDefinition.java b/src/java/org/apache/cassandra/config/ColumnDefinition.java
index 1223db8..cbb3e75 100644
--- a/src/java/org/apache/cassandra/config/ColumnDefinition.java
+++ b/src/java/org/apache/cassandra/config/ColumnDefinition.java
@@ -24,29 +24,28 @@
 import com.google.common.base.Objects;
 import com.google.common.collect.Maps;
 
-import org.apache.cassandra.cql3.ColumnNameBuilder;
-import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.thrift.ColumnDef;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.apache.cassandra.utils.FBUtilities.json;
 
-public class ColumnDefinition
+public class ColumnDefinition extends ColumnSpecification
 {
     // system.schema_columns column names
     private static final String COLUMN_NAME = "column_name";
-    private static final String VALIDATOR = "validator";
+    private static final String TYPE = "validator";
     private static final String INDEX_TYPE = "index_type";
     private static final String INDEX_OPTIONS = "index_options";
     private static final String INDEX_NAME = "index_name";
     private static final String COMPONENT_INDEX = "component_index";
-    private static final String TYPE = "type";
+    private static final String KIND = "type";
 
     /*
      * The type of CQL3 column this definition represents.
@@ -59,85 +58,134 @@
      * Note that thrift/CQL2 only know about definitions of type REGULAR (and
      * the ones whose componentIndex == null).
      */
-    public enum Type
+    public enum Kind
     {
         PARTITION_KEY,
-        CLUSTERING_KEY,
+        CLUSTERING_COLUMN,
         REGULAR,
-        COMPACT_VALUE,
-        STATIC
+        STATIC,
+        COMPACT_VALUE;
+
+        public String serialize()
+        {
+            // For backward compatibility we need to special case CLUSTERING_COLUMN
+            return this == CLUSTERING_COLUMN ? "clustering_key" : this.toString().toLowerCase();
+        }
+
+        public static Kind deserialize(String value)
+        {
+            if (value.equalsIgnoreCase("clustering_key"))
+                return CLUSTERING_COLUMN;
+            return Enum.valueOf(Kind.class, value.toUpperCase());
+        }
     }
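serialize() keeps writing the pre-2.1 on-disk value "clustering_key" even though the enum constant is now CLUSTERING_COLUMN, and deserialize() accepts the old spelling. A small round-trip check, for illustration only:

    // Round trip of the backward-compatible Kind encoding defined above.
    assert ColumnDefinition.Kind.CLUSTERING_COLUMN.serialize().equals("clustering_key");
    assert ColumnDefinition.Kind.deserialize("clustering_key") == ColumnDefinition.Kind.CLUSTERING_COLUMN;
    assert ColumnDefinition.Kind.deserialize("regular") == ColumnDefinition.Kind.REGULAR;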
 
-    public final ByteBuffer name;
-    private AbstractType<?> validator;
+    public final Kind kind;
+
+    private String indexName;
     private IndexType indexType;
     private Map<String,String> indexOptions;
-    private String indexName;
-    public final Type type;
 
     /*
      * If the column comparator is a composite type, indicates to which
      * component this definition refers to. If null, the definition refers to
      * the full column name.
      */
-    public final Integer componentIndex;
+    private final Integer componentIndex;
 
-    public static ColumnDefinition partitionKeyDef(ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition partitionKeyDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
     {
-        return new ColumnDefinition(name, validator, componentIndex, Type.PARTITION_KEY);
+        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.PARTITION_KEY);
     }
 
-    public static ColumnDefinition clusteringKeyDef(ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition partitionKeyDef(String ksName, String cfName, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
     {
-        return new ColumnDefinition(name, validator, componentIndex, Type.CLUSTERING_KEY);
+        return new ColumnDefinition(ksName, cfName, new ColumnIdentifier(name, UTF8Type.instance), validator, null, null, null, componentIndex, Kind.PARTITION_KEY);
     }
 
-    public static ColumnDefinition regularDef(ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition clusteringKeyDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
     {
-        return new ColumnDefinition(name, validator, componentIndex, Type.REGULAR);
+        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.CLUSTERING_COLUMN);
     }
 
-    public static ColumnDefinition staticDef(ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition regularDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
     {
-        return new ColumnDefinition(name, validator, componentIndex, Type.STATIC);
+        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.REGULAR);
     }
 
-    public static ColumnDefinition compactValueDef(ByteBuffer name, AbstractType<?> validator)
+    public static ColumnDefinition staticDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
     {
-        return new ColumnDefinition(name, validator, null, Type.COMPACT_VALUE);
+        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.STATIC);
     }
 
-    public ColumnDefinition(ByteBuffer name, AbstractType<?> validator, Integer componentIndex, Type type)
+    public static ColumnDefinition compactValueDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator)
     {
-        this(name, validator, null, null, null, componentIndex, type);
+        return new ColumnDefinition(cfm, name, validator, null, Kind.COMPACT_VALUE);
+    }
+
+    public ColumnDefinition(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex, Kind kind)
+    {
+        this(cfm.ksName,
+             cfm.cfName,
+             new ColumnIdentifier(name, cfm.getComponentComparator(componentIndex, kind)),
+             validator,
+             null,
+             null,
+             null,
+             componentIndex,
+             kind);
     }
 
     @VisibleForTesting
-    public ColumnDefinition(ByteBuffer name,
+    public ColumnDefinition(String ksName,
+                            String cfName,
+                            ColumnIdentifier name,
                             AbstractType<?> validator,
                             IndexType indexType,
                             Map<String, String> indexOptions,
                             String indexName,
                             Integer componentIndex,
-                            Type type)
+                            Kind kind)
     {
+        super(ksName, cfName, name, validator);
         assert name != null && validator != null;
-        this.name = name;
+        this.kind = kind;
         this.indexName = indexName;
-        this.validator = validator;
         this.componentIndex = componentIndex;
         this.setIndexType(indexType, indexOptions);
-        this.type = type;
     }
 
-    public ColumnDefinition clone()
+    public ColumnDefinition copy()
     {
-        return new ColumnDefinition(name, validator, indexType, indexOptions, indexName, componentIndex, type);
+        return new ColumnDefinition(ksName, cfName, name, type, indexType, indexOptions, indexName, componentIndex, kind);
     }
 
-    public ColumnDefinition cloneWithNewName(ByteBuffer newName)
+    public ColumnDefinition withNewName(ColumnIdentifier newName)
     {
-        return new ColumnDefinition(newName, validator, indexType, indexOptions, indexName, componentIndex, type);
+        return new ColumnDefinition(ksName, cfName, newName, type, indexType, indexOptions, indexName, componentIndex, kind);
+    }
+
+    public ColumnDefinition withNewType(AbstractType<?> newType)
+    {
+        return new ColumnDefinition(ksName, cfName, name, newType, indexType, indexOptions, indexName, componentIndex, kind);
+    }
+
+    public boolean isOnAllComponents()
+    {
+        return componentIndex == null;
+    }
+
+    public boolean isStatic()
+    {
+        return kind == Kind.STATIC;
+    }
+
+    // The componentIndex. For convenience's sake this never returns null:
+    // if componentIndex == null, this returns 0, so callers should first check
+    // isOnAllComponents() to distinguish that case.
+    public int position()
+    {
+        return componentIndex == null ? 0 : componentIndex;
     }
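Because position() collapses a null componentIndex to 0, callers that need to distinguish "applies to all components" must check isOnAllComponents() first, which is what CFMetaData.getColumnDefinitionComparator does above. A sketch of that calling convention, where def and cfm are placeholder locals:

    // Placeholder locals for illustration; mirrors getColumnDefinitionComparator above.
    Integer componentIndexOrNull = def.isOnAllComponents() ? null : def.position();
    AbstractType<?> nameComparator = cfm.getComponentComparator(componentIndexOrNull, def.kind);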
 
     @Override
@@ -151,8 +199,11 @@
 
         ColumnDefinition cd = (ColumnDefinition) o;
 
-        return Objects.equal(name, cd.name)
-            && Objects.equal(validator, cd.validator)
+        return Objects.equal(ksName, cd.ksName)
+            && Objects.equal(cfName, cd.cfName)
+            && Objects.equal(name, cd.name)
+            && Objects.equal(type, cd.type)
+            && Objects.equal(kind, cd.kind)
             && Objects.equal(componentIndex, cd.componentIndex)
             && Objects.equal(indexName, cd.indexName)
             && Objects.equal(indexType, cd.indexType)
@@ -162,16 +213,16 @@
     @Override
     public int hashCode()
     {
-        return Objects.hashCode(name, validator, componentIndex, indexName, indexType, indexOptions);
+        return Objects.hashCode(ksName, cfName, name, type, kind, componentIndex, indexName, indexType, indexOptions);
     }
 
     @Override
     public String toString()
     {
         return Objects.toStringHelper(this)
-                      .add("name", ByteBufferUtil.bytesToHex(name))
-                      .add("validator", validator)
+                      .add("name", name)
                       .add("type", type)
+                      .add("kind", kind)
                       .add("componentIndex", componentIndex)
                       .add("indexName", indexName)
                       .add("indexType", indexType)
@@ -180,15 +231,19 @@
 
     public boolean isThriftCompatible()
     {
-        // componentIndex == null should always imply isStatic in practice, but there is no harm in being too careful here.
-        return type == ColumnDefinition.Type.REGULAR && componentIndex == null;
+        return kind == ColumnDefinition.Kind.REGULAR && componentIndex == null;
+    }
+
+    public boolean isPrimaryKeyColumn()
+    {
+        return kind == Kind.PARTITION_KEY || kind == Kind.CLUSTERING_COLUMN;
     }
 
     public static List<ColumnDef> toThrift(Map<ByteBuffer, ColumnDefinition> columns)
     {
         List<ColumnDef> thriftDefs = new ArrayList<>(columns.size());
         for (ColumnDefinition def : columns.values())
-            if (def.type == ColumnDefinition.Type.REGULAR)
+            if (def.kind == ColumnDefinition.Kind.REGULAR)
                 thriftDefs.add(def.toThrift());
         return thriftDefs;
     }
@@ -199,102 +254,120 @@
      */
     public boolean isPartOfCellName()
     {
-        return type == Type.REGULAR || type == Type.STATIC;
+        return kind == Kind.REGULAR || kind == Kind.STATIC;
     }
 
     public ColumnDef toThrift()
     {
         ColumnDef cd = new ColumnDef();
 
-        cd.setName(ByteBufferUtil.clone(name));
-        cd.setValidation_class(validator.toString());
-        cd.setIndex_type(indexType == null ? null : IndexType.valueOf(indexType.name()));
+        cd.setName(ByteBufferUtil.clone(name.bytes));
+        cd.setValidation_class(type.toString());
+        cd.setIndex_type(indexType == null ? null : org.apache.cassandra.thrift.IndexType.valueOf(indexType.name()));
         cd.setIndex_name(indexName == null ? null : indexName);
         cd.setIndex_options(indexOptions == null ? null : Maps.newHashMap(indexOptions));
 
         return cd;
     }
 
-    public static ColumnDefinition fromThrift(ColumnDef thriftColumnDef, boolean isSuper) throws SyntaxException, ConfigurationException
+    public static ColumnDefinition fromThrift(String ksName, String cfName, AbstractType<?> thriftComparator, AbstractType<?> thriftSubcomparator, ColumnDef thriftColumnDef) throws SyntaxException, ConfigurationException
     {
         // For super columns, the componentIndex is 1 because the ColumnDefinition applies to the column component.
-        return new ColumnDefinition(ByteBufferUtil.clone(thriftColumnDef.name),
+        Integer componentIndex = thriftSubcomparator != null ? 1 : null;
+        AbstractType<?> comparator = thriftSubcomparator == null ? thriftComparator : thriftSubcomparator;
+        try
+        {
+            comparator.validate(thriftColumnDef.name);
+        }
+        catch (MarshalException e)
+        {
+            throw new ConfigurationException(String.format("Column name %s is not valid for comparator %s", ByteBufferUtil.bytesToHex(thriftColumnDef.name), comparator));
+        }
+
+        return new ColumnDefinition(ksName,
+                                    cfName,
+                                    new ColumnIdentifier(ByteBufferUtil.clone(thriftColumnDef.name), comparator),
                                     TypeParser.parse(thriftColumnDef.validation_class),
-                                    thriftColumnDef.index_type,
+                                    thriftColumnDef.index_type == null ? null : IndexType.valueOf(thriftColumnDef.index_type.name()),
                                     thriftColumnDef.index_options,
                                     thriftColumnDef.index_name,
-                                    isSuper ? 1 : null,
-                                    Type.REGULAR);
+                                    componentIndex,
+                                    Kind.REGULAR);
     }
 
-    public static Map<ByteBuffer, ColumnDefinition> fromThrift(List<ColumnDef> thriftDefs, boolean isSuper) throws SyntaxException, ConfigurationException
+    public static List<ColumnDefinition> fromThrift(String ksName, String cfName, AbstractType<?> thriftComparator, AbstractType<?> thriftSubcomparator, List<ColumnDef> thriftDefs) throws SyntaxException, ConfigurationException
     {
         if (thriftDefs == null)
-            return new HashMap<>();
+            return Collections.emptyList();
 
-        Map<ByteBuffer, ColumnDefinition> cds = new TreeMap<>();
+        List<ColumnDefinition> defs = new ArrayList<>(thriftDefs.size());
         for (ColumnDef thriftColumnDef : thriftDefs)
-            cds.put(ByteBufferUtil.clone(thriftColumnDef.name), fromThrift(thriftColumnDef, isSuper));
+            defs.add(fromThrift(ksName, cfName, thriftComparator, thriftSubcomparator, thriftColumnDef));
 
-        return cds;
+        return defs;
     }
 
     /**
-     * Drop specified column from the schema using given row.
+     * Drop specified column from the schema using given mutation.
      *
-     * @param rm         The schema row mutation
-     * @param cfName     The name of the parent ColumnFamily
-     * @param timestamp  The timestamp to use for column modification
+     * @param mutation  The schema mutation
+     * @param timestamp The timestamp to use for column modification
      */
-    public void deleteFromSchema(RowMutation rm, String cfName, AbstractType<?> comparator, long timestamp)
+    public void deleteFromSchema(Mutation mutation, long timestamp)
     {
-        ColumnFamily cf = rm.addOrGet(CFMetaData.SchemaColumnsCf);
+        ColumnFamily cf = mutation.addOrGet(CFMetaData.SchemaColumnsCf);
         int ldt = (int) (System.currentTimeMillis() / 1000);
 
-        ColumnNameBuilder builder = CFMetaData.SchemaColumnsCf.getCfDef().getColumnNameBuilder();
-        // Note: the following is necessary for backward compatibility. For CQL3, comparator will be UTF8 and nameBytes == name
-        ByteBuffer nameBytes = ByteBufferUtil.bytes(comparator.getString(name));
-        builder.add(ByteBufferUtil.bytes(cfName)).add(nameBytes);
-        cf.addAtom(new RangeTombstone(builder.build(), builder.buildAsEndOfRange(), timestamp, ldt));
+        // Note: we do want to use name.toString(), not name.bytes directly, for backward compatibility (for CQL3, this won't make a difference).
+        Composite prefix = CFMetaData.SchemaColumnsCf.comparator.make(cfName, name.toString());
+        cf.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
     }
 
-    public void toSchema(RowMutation rm, String cfName, AbstractType<?> comparator, long timestamp)
+    public void toSchema(Mutation mutation, long timestamp)
     {
-        ColumnFamily cf = rm.addOrGet(CFMetaData.SchemaColumnsCf);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
+        ColumnFamily cf = mutation.addOrGet(CFMetaData.SchemaColumnsCf);
+        Composite prefix = CFMetaData.SchemaColumnsCf.comparator.make(cfName, name.toString());
+        CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
 
-        cf.addColumn(Column.create("", timestamp, cfName, comparator.getString(name), ""));
-        cf.addColumn(Column.create(validator.toString(), timestamp, cfName, comparator.getString(name), VALIDATOR));
-        cf.addColumn(indexType == null ? DeletedColumn.create(ldt, timestamp, cfName, comparator.getString(name), INDEX_TYPE)
-                                       : Column.create(indexType.toString(), timestamp, cfName, comparator.getString(name), INDEX_TYPE));
-        cf.addColumn(indexOptions == null ? DeletedColumn.create(ldt, timestamp, cfName, comparator.getString(name), INDEX_OPTIONS)
-                                          : Column.create(json(indexOptions), timestamp, cfName, comparator.getString(name), INDEX_OPTIONS));
-        cf.addColumn(indexName == null ? DeletedColumn.create(ldt, timestamp, cfName, comparator.getString(name), INDEX_NAME)
-                                       : Column.create(indexName, timestamp, cfName, comparator.getString(name), INDEX_NAME));
-        cf.addColumn(componentIndex == null ? DeletedColumn.create(ldt, timestamp, cfName, comparator.getString(name), COMPONENT_INDEX)
-                                            : Column.create(componentIndex, timestamp, cfName, comparator.getString(name), COMPONENT_INDEX));
-        cf.addColumn(Column.create(type.toString().toLowerCase(), timestamp, cfName, comparator.getString(name), TYPE));
+        adder.add(TYPE, type.toString());
+        adder.add(INDEX_TYPE, indexType == null ? null : indexType.toString());
+        adder.add(INDEX_OPTIONS, json(indexOptions));
+        adder.add(INDEX_NAME, indexName);
+        adder.add(COMPONENT_INDEX, componentIndex);
+        adder.add(KIND, kind.serialize());
     }
 
-    public void apply(ColumnDefinition def, AbstractType<?> comparator)  throws ConfigurationException
+    public ColumnDefinition apply(ColumnDefinition def)  throws ConfigurationException
     {
-        assert type == def.type && Objects.equal(componentIndex, def.componentIndex);
+        assert kind == def.kind && Objects.equal(componentIndex, def.componentIndex);
 
         if (getIndexType() != null && def.getIndexType() != null)
         {
             // If an index is set (and not dropped by this update), the validator shouldn't be changed to a non-compatible one
             // (and we want true comparator compatibility, not just value compatibility, since the validator is used by LocalPartitioner to order index rows)
-            if (!def.getValidator().isCompatibleWith(getValidator()))
-                throw new ConfigurationException(String.format("Cannot modify validator to a non-order-compatible one for column %s since an index is set", comparator.getString(name)));
+            if (!def.type.isCompatibleWith(type))
+                throw new ConfigurationException(String.format("Cannot modify validator to a non-order-compatible one for column %s since an index is set", name));
 
             assert getIndexName() != null;
             if (!getIndexName().equals(def.getIndexName()))
                 throw new ConfigurationException("Cannot modify index name");
         }
 
-        setValidator(def.getValidator());
-        setIndexType(def.getIndexType(), def.getIndexOptions());
-        setIndexName(def.getIndexName());
+        return new ColumnDefinition(ksName,
+                                    cfName,
+                                    name,
+                                    def.type,
+                                    def.getIndexType(),
+                                    def.getIndexOptions(),
+                                    def.getIndexName(),
+                                    componentIndex,
+                                    kind);
+    }
+
+    public static UntypedResultSet resultify(Row serializedColumns)
+    {
+        String query = String.format("SELECT * FROM %s.%s", Keyspace.SYSTEM_KS, SystemKeyspace.SCHEMA_COLUMNS_CF);
+        return QueryProcessor.resultify(query, serializedColumns);
     }
 
     /**
@@ -303,29 +376,30 @@
      * @param serializedColumns storage-level partition containing the column definitions
      * @return the list of processed ColumnDefinitions
      */
-    public static List<ColumnDefinition> fromSchema(Row serializedColumns, CFMetaData cfm)
+    public static List<ColumnDefinition> fromSchema(UntypedResultSet serializedColumns, String ksName, String cfName, AbstractType<?> rawComparator, boolean isSuper)
     {
         List<ColumnDefinition> cds = new ArrayList<>();
-
-        String query = String.format("SELECT * FROM %s.%s", Keyspace.SYSTEM_KS, SystemKeyspace.SCHEMA_COLUMNS_CF);
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, serializedColumns))
+        for (UntypedResultSet.Row row : serializedColumns)
         {
-            Type type = row.has(TYPE)
-                      ? Enum.valueOf(Type.class, row.getString(TYPE).toUpperCase())
-                      : Type.REGULAR;
+            Kind kind = row.has(KIND)
+                      ? Kind.deserialize(row.getString(KIND))
+                      : Kind.REGULAR;
 
             Integer componentIndex = null;
             if (row.has(COMPONENT_INDEX))
                 componentIndex = row.getInt(COMPONENT_INDEX);
-            else if (type == Type.CLUSTERING_KEY && cfm.isSuper())
+            else if (kind == Kind.CLUSTERING_COLUMN && isSuper)
                 componentIndex = 1; // A ColumnDefinition for super columns applies to the column component
 
-            ByteBuffer name = cfm.getComponentComparator(componentIndex, type).fromString(row.getString(COLUMN_NAME));
+            // Note: we save the column name as a string, but we should not assume that it is a UTF8 name, so
+            // we need to use the comparator's fromString method
+            AbstractType<?> comparator = getComponentComparator(rawComparator, componentIndex, kind);
+            ColumnIdentifier name = new ColumnIdentifier(comparator.fromString(row.getString(COLUMN_NAME)), comparator);
 
             AbstractType<?> validator;
             try
             {
-                validator = TypeParser.parse(row.getString(VALIDATOR));
+                validator = TypeParser.parse(row.getString(TYPE));
             }
             catch (RequestValidationException e)
             {
@@ -344,12 +418,27 @@
             if (row.has(INDEX_NAME))
                 indexName = row.getString(INDEX_NAME);
 
-            cds.add(new ColumnDefinition(name, validator, indexType, indexOptions, indexName, componentIndex, type));
+            cds.add(new ColumnDefinition(ksName, cfName, name, validator, indexType, indexOptions, indexName, componentIndex, kind));
         }
 
         return cds;
     }
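The note above matters because toSchema stores name.toString() (the comparator's string rendering of the name), so reading it back has to go through the same comparator's fromString rather than assuming UTF8. A sketch of the round trip, assuming the usual AbstractType getString/fromString pair and using placeholder variables:

    // Placeholder sketch of the name round trip through system.schema_columns.
    // componentComparator is whatever getComponentComparator() returns for this definition.
    String stored = componentComparator.getString(originalNameBytes);   // what toSchema writes
    ByteBuffer restored = componentComparator.fromString(stored);       // what fromSchema reads back
    assert restored.equals(originalNameBytes);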
 
+    public static AbstractType<?> getComponentComparator(AbstractType<?> rawComparator, Integer componentIndex, ColumnDefinition.Kind kind)
+    {
+        switch (kind)
+        {
+            case REGULAR:
+                if (componentIndex == null || (componentIndex == 0 && !(rawComparator instanceof CompositeType)))
+                    return rawComparator;
+
+                return ((CompositeType)rawComparator).types.get(componentIndex);
+            default:
+                // CQL3 column names are UTF8
+                return UTF8Type.instance;
+        }
+    }
+
     public String getIndexName()
     {
         return indexName;
@@ -387,14 +476,4 @@
     {
         return indexOptions;
     }
-
-    public AbstractType<?> getValidator()
-    {
-        return validator;
-    }
-
-    public void setValidator(AbstractType<?> validator)
-    {
-        this.validator = validator;
-    }
 }
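
The new getComponentComparator() above decides which comparator parses a stored column name back from its string form: a REGULAR column uses the raw comparator (or the matching component of a CompositeType), while CQL3 metadata columns always use UTF8. The following is a minimal, illustrative sketch of that selection and is not part of this patch; it assumes the 2.1 package layout (org.apache.cassandra.config.ColumnDefinition, org.apache.cassandra.db.marshal.TypeParser) and the comparator string syntax used by the on-disk schema.

import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.TypeParser;
import org.apache.cassandra.exceptions.RequestValidationException;

public class ComponentComparatorSketch
{
    public static void main(String[] args) throws RequestValidationException
    {
        // Parse a composite comparator the same way fromSchema() parses the TYPE column.
        AbstractType<?> raw = TypeParser.parse("CompositeType(UTF8Type,Int32Type)");

        // A REGULAR column stored at component index 1 is parsed with that component's type...
        AbstractType<?> regular =
            ColumnDefinition.getComponentComparator(raw, 1, ColumnDefinition.Kind.REGULAR);

        // ...while partition key / clustering column names always use UTF8 (the default branch above).
        AbstractType<?> clustering =
            ColumnDefinition.getComponentComparator(raw, 0, ColumnDefinition.Kind.CLUSTERING_COLUMN);

        System.out.println(regular);    // the Int32Type component
        System.out.println(clustering); // the UTF8Type instance
    }
}
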
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index aab5025..e2df89f 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -63,40 +63,51 @@
     public String initial_token;
     public Integer num_tokens = 1;
 
-    public volatile Long request_timeout_in_ms = new Long(10000);
+    public volatile Long request_timeout_in_ms = 10000L;
 
-    public Long read_request_timeout_in_ms = new Long(5000);
+    public volatile Long read_request_timeout_in_ms = 5000L;
 
-    public Long range_request_timeout_in_ms = new Long(10000);
+    public volatile Long range_request_timeout_in_ms = 10000L;
 
-    public Long write_request_timeout_in_ms = new Long(2000);
+    public volatile Long write_request_timeout_in_ms = 2000L;
 
-    public Long cas_contention_timeout_in_ms = new Long(1000);
+    public volatile Long counter_write_request_timeout_in_ms = 5000L;
 
-    public Long truncate_request_timeout_in_ms = new Long(60000);
+    public volatile Long cas_contention_timeout_in_ms = 1000L;
 
-    public Integer streaming_socket_timeout_in_ms = new Integer(0);
+    public volatile Long truncate_request_timeout_in_ms = 60000L;
+
+    public Integer streaming_socket_timeout_in_ms = 0;
 
     public boolean cross_node_timeout = false;
 
     public volatile Double phi_convict_threshold = 8.0;
 
-    public Integer concurrent_reads = 8;
+    public Integer concurrent_reads = 32;
     public Integer concurrent_writes = 32;
-    public Integer concurrent_replicates = 32;
+    public Integer concurrent_counter_writes = 32;
 
-    public Integer memtable_flush_writers = null; // will get set to the length of data dirs in DatabaseDescriptor
-    public Integer memtable_total_space_in_mb;
+    @Deprecated
+    public Integer concurrent_replicates = null;
+
+    public Integer memtable_flush_writers = null;
+    public Integer memtable_heap_space_in_mb;
+    public Integer memtable_offheap_space_in_mb;
+    public Float memtable_cleanup_threshold = null;
 
     public Integer storage_port = 7000;
     public Integer ssl_storage_port = 7001;
     public String listen_address;
+    public String listen_interface;
     public String broadcast_address;
     public String internode_authenticator;
 
     public Boolean start_rpc = true;
     public String rpc_address;
+    public String rpc_interface;
+    public String broadcast_rpc_address;
     public Integer rpc_port = 9160;
+    public Integer rpc_listen_backlog = 50;
     public String rpc_server_type = "sync";
     public Boolean rpc_keepalive = true;
     public Integer rpc_min_threads = 16;
@@ -121,10 +132,8 @@
     /* if the size of columns or super-columns are more than this, indexing will kick in */
     public Integer column_index_size_in_kb = 64;
     public Integer batch_size_warn_threshold_in_kb = 5;
-    public Integer in_memory_compaction_limit_in_mb = 64;
-    public Integer concurrent_compactors = FBUtilities.getAvailableProcessors();
+    public Integer concurrent_compactors;
     public volatile Integer compaction_throughput_mb_per_sec = 16;
-    public Boolean multithreaded_compaction = false;
 
     public Integer max_streaming_retries = 3;
 
@@ -167,38 +176,42 @@
     public int hinted_handoff_throttle_in_kb = 1024;
     public int batchlog_replay_throttle_in_kb = 1024;
     public int max_hints_delivery_threads = 1;
-    public boolean compaction_preheat_key_cache = true;
+    public int sstable_preemptive_open_interval_in_mb = 50;
 
     public volatile boolean incremental_backups = false;
-    public int memtable_flush_queue_size = 4;
     public boolean trickle_fsync = false;
     public int trickle_fsync_interval_in_kb = 10240;
 
     public Long key_cache_size_in_mb = null;
     public volatile int key_cache_save_period = 14400;
-    public int key_cache_keys_to_save = Integer.MAX_VALUE;
+    public volatile int key_cache_keys_to_save = Integer.MAX_VALUE;
 
     public long row_cache_size_in_mb = 0;
     public volatile int row_cache_save_period = 0;
-    public int row_cache_keys_to_save = Integer.MAX_VALUE;
+    public volatile int row_cache_keys_to_save = Integer.MAX_VALUE;
+
+    public Long counter_cache_size_in_mb = null;
+    public volatile int counter_cache_save_period = 7200;
+    public volatile int counter_cache_keys_to_save = Integer.MAX_VALUE;
+
     public String memory_allocator = NativeAllocator.class.getSimpleName();
-    public boolean populate_io_cache_on_flush = false; // ignored! see CASSANDRA-4694
 
     private static boolean isClientMode = false;
 
-    public boolean preheat_kernel_page_cache = false;
-
     public Integer file_cache_size_in_mb;
 
     public boolean inter_dc_tcp_nodelay = true;
 
-    public String memtable_allocator = "SlabAllocator";
+    public MemtableAllocationType memtable_allocation_type = MemtableAllocationType.heap_buffers;
 
     private static boolean outboundBindAny = false;
 
     public volatile int tombstone_warn_threshold = 1000;
     public volatile int tombstone_failure_threshold = 100000;
 
+    public volatile Long index_summary_capacity_in_mb;
+    public volatile int index_summary_resize_interval_in_minutes = 60;
+
     private static final CsvPreference STANDARD_SURROUNDING_SPACES_NEED_QUOTES = new CsvPreference.Builder(CsvPreference.STANDARD_PREFERENCE)
                                                                                                   .surroundingSpacesNeedQuotes(true).build();
 
@@ -272,6 +285,14 @@
         standard,
     }
 
+    public static enum MemtableAllocationType
+    {
+        unslabbed_heap_buffers,
+        heap_buffers,
+        offheap_buffers,
+        offheap_objects
+    }
+
     public static enum DiskFailurePolicy
     {
         best_effort,
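
The Config changes above add several knobs (counter_write_request_timeout_in_ms, listen_interface, broadcast_rpc_address, memtable_heap_space_in_mb / memtable_offheap_space_in_mb, memtable_allocation_type, and others) as plain public fields, so cassandra.yaml keys bind onto them by name and unset keys keep the defaults shown. A minimal sketch of reading those defaults follows; ConfigDefaultsSketch is a hypothetical class, not part of this patch, and it assumes Config's implicit no-arg constructor (which the yaml loader also relies on).

import org.apache.cassandra.config.Config;

public class ConfigDefaultsSketch
{
    public static void main(String[] args)
    {
        // Assumes Config exposes an implicit public no-arg constructor and public fields.
        Config conf = new Config();

        // Defaults introduced or changed above: counter writes get their own timeout,
        // and concurrent_reads now defaults to 32.
        System.out.println(conf.counter_write_request_timeout_in_ms); // 5000
        System.out.println(conf.concurrent_reads);                    // 32

        // The snake_case yaml values map directly onto the new enum constants.
        Config.MemtableAllocationType type =
            Config.MemtableAllocationType.valueOf("offheap_objects");
        System.out.println(type);                           // offheap_objects
        System.out.println(conf.memtable_allocation_type);  // heap_buffers (the default)
    }
}
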
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 209d6c9..9f6b469 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -21,16 +21,31 @@
 import java.io.FileFilter;
 import java.io.IOException;
 import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.SocketException;
 import java.net.UnknownHostException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.primitives.Longs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.auth.*;
+import org.apache.cassandra.auth.AllowAllAuthenticator;
+import org.apache.cassandra.auth.AllowAllAuthorizer;
+import org.apache.cassandra.auth.AllowAllInternodeAuthenticator;
+import org.apache.cassandra.auth.IAuthenticator;
+import org.apache.cassandra.auth.IAuthorizer;
+import org.apache.cassandra.auth.IInternodeAuthenticator;
 import org.apache.cassandra.config.Config.RequestSchedulerId;
 import org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions;
 import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions;
@@ -50,8 +65,12 @@
 import org.apache.cassandra.scheduler.IRequestScheduler;
 import org.apache.cassandra.scheduler.NoScheduler;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.utils.Allocator;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.memory.HeapPool;
+import org.apache.cassandra.utils.memory.NativePool;
+import org.apache.cassandra.utils.memory.MemtablePool;
+import org.apache.cassandra.utils.memory.SlabPool;
 
 public class DatabaseDescriptor
 {
@@ -67,6 +86,7 @@
     private static InetAddress listenAddress; // leave null so we can fall through to getLocalHost
     private static InetAddress broadcastAddress;
     private static InetAddress rpcAddress;
+    private static InetAddress broadcastRpcAddress;
     private static SeedProvider seedProvider;
     private static IInternodeAuthenticator internodeAuthenticator;
 
@@ -86,13 +106,13 @@
     private static RequestSchedulerOptions requestSchedulerOptions;
 
     private static long keyCacheSizeInMB;
+    private static long counterCacheSizeInMB;
     private static IAllocator memoryAllocator;
+    private static long indexSummaryCapacityInMB;
 
     private static String localDC;
     private static Comparator<InetAddress> localComparator;
 
-    private static Class<? extends Allocator> memtableAllocator;
-
     static
     {
         // In client mode, we use a default configuration. Note that the fields of this class will be
@@ -139,9 +159,6 @@
     {
         conf = config;
 
-        logger.info("Data files directories: " + Arrays.toString(conf.data_file_directories));
-        logger.info("Commit log directory: " + conf.commitlog_directory);
-
         if (conf.commitlog_sync == null)
         {
             throw new ConfigurationException("Missing required directive CommitLogSync");
@@ -157,7 +174,7 @@
             {
                 throw new ConfigurationException("Batch sync specified, but commitlog_sync_period_in_ms found. Only specify commitlog_sync_batch_window_in_ms when using batch sync");
             }
-            logger.debug("Syncing log with a batch window of " + conf.commitlog_sync_batch_window_in_ms);
+            logger.debug("Syncing log with a batch window of {}", conf.commitlog_sync_batch_window_in_ms);
         }
         else
         {
@@ -169,34 +186,31 @@
             {
                 throw new ConfigurationException("commitlog_sync_period_in_ms specified, but commitlog_sync_batch_window_in_ms found.  Only specify commitlog_sync_period_in_ms when using periodic sync.");
             }
-            logger.debug("Syncing log with a period of " + conf.commitlog_sync_period_in_ms);
+            logger.debug("Syncing log with a period of {}", conf.commitlog_sync_period_in_ms);
         }
 
         if (conf.commitlog_total_space_in_mb == null)
-            conf.commitlog_total_space_in_mb = hasLargeAddressSpace() ? 1024 : 32;
+            conf.commitlog_total_space_in_mb = hasLargeAddressSpace() ? 8192 : 32;
 
         /* evaluate the DiskAccessMode Config directive, which also affects indexAccessMode selection */
         if (conf.disk_access_mode == Config.DiskAccessMode.auto)
         {
             conf.disk_access_mode = hasLargeAddressSpace() ? Config.DiskAccessMode.mmap : Config.DiskAccessMode.standard;
             indexAccessMode = conf.disk_access_mode;
-            logger.info("DiskAccessMode 'auto' determined to be " + conf.disk_access_mode + ", indexAccessMode is " + indexAccessMode );
+            logger.info("DiskAccessMode 'auto' determined to be {}, indexAccessMode is {}", conf.disk_access_mode, indexAccessMode);
         }
         else if (conf.disk_access_mode == Config.DiskAccessMode.mmap_index_only)
         {
             conf.disk_access_mode = Config.DiskAccessMode.standard;
             indexAccessMode = Config.DiskAccessMode.mmap;
-            logger.info("DiskAccessMode is " + conf.disk_access_mode + ", indexAccessMode is " + indexAccessMode );
+            logger.info("DiskAccessMode is {}, indexAccessMode is {}", conf.disk_access_mode, indexAccessMode);
         }
         else
         {
             indexAccessMode = conf.disk_access_mode;
-            logger.info("DiskAccessMode is " + conf.disk_access_mode + ", indexAccessMode is " + indexAccessMode );
+            logger.info("DiskAccessMode is {}, indexAccessMode is {}", conf.disk_access_mode, indexAccessMode);
         }
 
-        logger.info("disk_failure_policy is " + conf.disk_failure_policy);
-        logger.info("commit_failure_policy is " + conf.commit_failure_policy);
-
         /* Authentication and authorization backend, implementing IAuthenticator and IAuthorizer */
         if (conf.authenticator != null)
             authenticator = FBUtilities.newAuthenticator(conf.authenticator);
@@ -253,35 +267,37 @@
             throw new ConfigurationException("concurrent_writes must be at least 2");
         }
 
-        if (conf.concurrent_replicates != null && conf.concurrent_replicates < 2)
-        {
-            throw new ConfigurationException("concurrent_replicates must be at least 2");
-        }
+        if (conf.concurrent_counter_writes != null && conf.concurrent_counter_writes < 2)
+            throw new ConfigurationException("concurrent_counter_writes must be at least 2");
+
+        if (conf.concurrent_replicates != null)
+            logger.warn("concurrent_replicates has been deprecated and should be removed from cassandra.yaml");
 
         if (conf.file_cache_size_in_mb == null)
             conf.file_cache_size_in_mb = Math.min(512, (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576)));
 
-        if (conf.memtable_total_space_in_mb == null)
-            conf.memtable_total_space_in_mb = (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576));
-        if (conf.memtable_total_space_in_mb <= 0)
-            throw new ConfigurationException("memtable_total_space_in_mb must be positive");
-        logger.info("Global memtable threshold is enabled at {}MB", conf.memtable_total_space_in_mb);
+        if (conf.memtable_offheap_space_in_mb == null)
+            conf.memtable_offheap_space_in_mb = (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576));
+        if (conf.memtable_offheap_space_in_mb < 0)
+            throw new ConfigurationException("memtable_offheap_space_in_mb must be positive");
+        // for the moment, both the on-heap and off-heap memtable thresholds default to a quarter
+        // of the heap, as heap overhead is very large
+        if (conf.memtable_heap_space_in_mb == null)
+            conf.memtable_heap_space_in_mb = (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576));
+        if (conf.memtable_heap_space_in_mb <= 0)
+            throw new ConfigurationException("memtable_heap_space_in_mb must be positive");
+        logger.info("Global memtable on-heap threshold is enabled at {}MB", conf.memtable_heap_space_in_mb);
+        if (conf.memtable_offheap_space_in_mb == 0)
+            logger.info("Global memtable off-heap threshold is disabled, HeapAllocator will be used instead");
+        else
+            logger.info("Global memtable off-heap threshold is enabled at {}MB", conf.memtable_offheap_space_in_mb);
 
-        /* Memtable flush writer threads */
-        if (conf.memtable_flush_writers != null && conf.memtable_flush_writers < 1)
+        /* Local IP, hostname or interface to bind services to */
+        if (conf.listen_address != null && conf.listen_interface != null)
         {
-            throw new ConfigurationException("memtable_flush_writers must be at least 1");
+            throw new ConfigurationException("Set listen_address OR listen_interface, not both");
         }
-        else if (conf.memtable_flush_writers == null)
+        else if (conf.listen_address != null)
         {
-            conf.memtable_flush_writers = conf.data_file_directories.length;
-        }
-
-        /* Local IP or hostname to bind services to */
-        if (conf.listen_address != null)
-        {
-            if (conf.listen_address.equals("0.0.0.0"))
-                throw new ConfigurationException("listen_address cannot be 0.0.0.0!");
             try
             {
                 listenAddress = InetAddress.getByName(conf.listen_address);
@@ -290,16 +306,29 @@
             {
                 throw new ConfigurationException("Unknown listen_address '" + conf.listen_address + "'");
             }
+
+            if (listenAddress.isAnyLocalAddress())
+                throw new ConfigurationException("listen_address cannot be a wildcard address (" + conf.listen_address + ")!");
+        }
+        else if (conf.listen_interface != null)
+        {
+            try
+            {
+                Enumeration<InetAddress> addrs = NetworkInterface.getByName(conf.listen_interface).getInetAddresses();
+                listenAddress = addrs.nextElement();
+                if (addrs.hasMoreElements())
+                    throw new ConfigurationException("Interface " + conf.listen_interface +" can't have more than one address");
+            }
+            catch (SocketException e)
+            {
+                throw new ConfigurationException("Unknown network interface in listen_interface " + conf.listen_interface);
+            }
+
         }
 
         /* Gossip Address to broadcast */
         if (conf.broadcast_address != null)
         {
-            if (conf.broadcast_address.equals("0.0.0.0"))
-            {
-                throw new ConfigurationException("broadcast_address cannot be 0.0.0.0!");
-            }
-
             try
             {
                 broadcastAddress = InetAddress.getByName(conf.broadcast_address);
@@ -308,10 +337,17 @@
             {
                 throw new ConfigurationException("Unknown broadcast_address '" + conf.broadcast_address + "'");
             }
+
+            if (broadcastAddress.isAnyLocalAddress())
+                throw new ConfigurationException("broadcast_address cannot be a wildcard address (" + conf.broadcast_address + ")!");
         }
 
-        /* Local IP or hostname to bind RPC server to */
-        if (conf.rpc_address != null)
+        /* Local IP, hostname or interface to bind RPC server to */
+        if (conf.rpc_address != null && conf.rpc_interface != null)
+        {
+            throw new ConfigurationException("Set rpc_address OR rpc_interface, not both");
+        }
+        else if (conf.rpc_address != null)
         {
             try
             {
@@ -322,11 +358,48 @@
                 throw new ConfigurationException("Unknown host in rpc_address " + conf.rpc_address);
             }
         }
+        else if (conf.rpc_interface != null)
+        {
+            try
+            {
+                Enumeration<InetAddress> addrs = NetworkInterface.getByName(conf.rpc_interface).getInetAddresses();
+                rpcAddress = addrs.nextElement();
+                if (addrs.hasMoreElements())
+                    throw new ConfigurationException("Interface " + conf.rpc_interface +" can't have more than one address");
+            }
+            catch (SocketException e)
+            {
+                throw new ConfigurationException("Unknown network interface in rpc_interface " + conf.rpc_interface);
+            }
+        }
         else
         {
             rpcAddress = FBUtilities.getLocalAddress();
         }
 
+        /* RPC address to broadcast */
+        if (conf.broadcast_rpc_address != null)
+        {
+            try
+            {
+                broadcastRpcAddress = InetAddress.getByName(conf.broadcast_rpc_address);
+            }
+            catch (UnknownHostException e)
+            {
+                throw new ConfigurationException("Unknown broadcast_rpc_address '" + conf.broadcast_rpc_address + "'");
+            }
+
+            if (broadcastRpcAddress.isAnyLocalAddress())
+                throw new ConfigurationException("broadcast_rpc_address cannot be a wildcard address (" + conf.broadcast_rpc_address + ")!");
+        }
+        else
+        {
+            if (rpcAddress.isAnyLocalAddress())
+                throw new ConfigurationException("If rpc_address is set to a wildcard address (" + conf.rpc_address + "), then " +
+                                                 "you must set broadcast_rpc_address to a value other than " + conf.rpc_address);
+            broadcastRpcAddress = rpcAddress;
+        }
+
         if (conf.thrift_framed_transport_size_in_mb <= 0)
             throw new ConfigurationException("thrift_framed_transport_size_in_mb must be positive");
 
@@ -393,48 +466,64 @@
             requestSchedulerId = RequestSchedulerId.keyspace;
         }
 
-        if (logger.isDebugEnabled() && conf.auto_bootstrap != null)
+        // if data dirs, commitlog dir, or saved caches dir are set in cassandra.yaml, use that.  Otherwise,
+        // use -Dcassandra.storagedir (set in cassandra-env.sh) as the parent dir for data/, commitlog/, and saved_caches/
+        if (conf.commitlog_directory == null)
         {
-            logger.debug("setting auto_bootstrap to " + conf.auto_bootstrap);
+            conf.commitlog_directory = System.getProperty("cassandra.storagedir", null);
+            if (conf.commitlog_directory == null)
+                throw new ConfigurationException("commitlog_directory is missing and -Dcassandra.storagedir is not set");
+            conf.commitlog_directory += File.separator + "commitlog";
+        }
+        if (conf.saved_caches_directory == null)
+        {
+            conf.saved_caches_directory = System.getProperty("cassandra.storagedir", null);
+            if (conf.saved_caches_directory == null)
+                throw new ConfigurationException("saved_caches_directory is missing and -Dcassandra.storagedir is not set");
+            conf.saved_caches_directory += File.separator + "saved_caches";
+        }
+        if (conf.data_file_directories == null)
+        {
+            String defaultDataDir = System.getProperty("cassandra.storagedir", null);
+            if (defaultDataDir == null)
+                throw new ConfigurationException("data_file_directories is not missing and -Dcassandra.storagedir is not set");
+            conf.data_file_directories = new String[]{ defaultDataDir + File.separator + "data" };
         }
 
-        logger.info((conf.multithreaded_compaction ? "" : "Not ") + "using multi-threaded compaction");
-
-        if (conf.in_memory_compaction_limit_in_mb != null && conf.in_memory_compaction_limit_in_mb <= 0)
+        /* data file and commit log directories. they get created later, when they're needed. */
+        for (String datadir : conf.data_file_directories)
         {
-            throw new ConfigurationException("in_memory_compaction_limit_in_mb must be a positive integer");
+            if (datadir.equals(conf.commitlog_directory))
+                throw new ConfigurationException("commitlog_directory must not be the same as any data_file_directories");
+            if (datadir.equals(conf.saved_caches_directory))
+                throw new ConfigurationException("saved_caches_directory must not be the same as any data_file_directories");
         }
 
+        if (conf.commitlog_directory.equals(conf.saved_caches_directory))
+            throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory");
+
+        if (conf.memtable_flush_writers == null)
+            conf.memtable_flush_writers = Math.min(8, Math.max(2, Math.min(FBUtilities.getAvailableProcessors(), conf.data_file_directories.length)));
+
+        if (conf.memtable_flush_writers < 1)
+            throw new ConfigurationException("memtable_flush_writers must be at least 1");
+
+        if (conf.memtable_cleanup_threshold == null)
+            conf.memtable_cleanup_threshold = (float) (1.0 / (1 + conf.memtable_flush_writers));
+
+        if (conf.memtable_cleanup_threshold < 0.01f)
+            throw new ConfigurationException("memtable_cleanup_threshold must be >= 0.01");
+        if (conf.memtable_cleanup_threshold > 0.99f)
+            throw new ConfigurationException("memtable_cleanup_threshold must be <= 0.99");
+        if (conf.memtable_cleanup_threshold < 0.1f)
+            logger.warn("memtable_cleanup_threshold is set very low, which may cause performance degradation");
+
         if (conf.concurrent_compactors == null)
-            conf.concurrent_compactors = FBUtilities.getAvailableProcessors();
+            conf.concurrent_compactors = Math.min(8, Math.max(2, Math.min(FBUtilities.getAvailableProcessors(), conf.data_file_directories.length)));
 
         if (conf.concurrent_compactors <= 0)
             throw new ConfigurationException("concurrent_compactors should be strictly greater than 0");
 
-        /* data file and commit log directories. they get created later, when they're needed. */
-        if (conf.commitlog_directory != null && conf.data_file_directories != null && conf.saved_caches_directory != null)
-        {
-            for (String datadir : conf.data_file_directories)
-            {
-                if (datadir.equals(conf.commitlog_directory))
-                    throw new ConfigurationException("commitlog_directory must not be the same as any data_file_directories");
-                if (datadir.equals(conf.saved_caches_directory))
-                    throw new ConfigurationException("saved_caches_directory must not be the same as any data_file_directories");
-            }
-
-            if (conf.commitlog_directory.equals(conf.saved_caches_directory))
-                throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory");
-        }
-        else
-        {
-            if (conf.commitlog_directory == null)
-                throw new ConfigurationException("commitlog_directory missing");
-            if (conf.data_file_directories == null)
-                throw new ConfigurationException("data_file_directories missing; at least one data directory must be specified");
-            if (conf.saved_caches_directory == null)
-                throw new ConfigurationException("saved_caches_directory missing");
-        }
-
         if (conf.initial_token != null)
             for (String token : tokensFromString(conf.initial_token))
                 partitioner.getTokenFactory().validate(token);
@@ -460,6 +549,31 @@
                     + conf.key_cache_size_in_mb + "', supported values are <integer> >= 0.");
         }
 
+        try
+        {
+            // if the counter_cache_size_in_mb option was left unset ("auto"), size the cache as min(2.5% of heap (in MB), 50MB)
+            counterCacheSizeInMB = (conf.counter_cache_size_in_mb == null)
+                    ? Math.min(Math.max(1, (int) (Runtime.getRuntime().totalMemory() * 0.025 / 1024 / 1024)), 50)
+                    : conf.counter_cache_size_in_mb;
+
+            if (counterCacheSizeInMB < 0)
+                throw new NumberFormatException(); // to escape duplicating error message
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException("counter_cache_size_in_mb option was set incorrectly to '"
+                    + conf.counter_cache_size_in_mb + "', supported values are <integer> >= 0.");
+        }
+
+        // if set to empty/"auto" then use 5% of Heap size
+        indexSummaryCapacityInMB = (conf.index_summary_capacity_in_mb == null)
+            ? Math.max(1, (int) (Runtime.getRuntime().totalMemory() * 0.05 / 1024 / 1024))
+            : conf.index_summary_capacity_in_mb;
+
+        if (indexSummaryCapacityInMB < 0)
+            throw new ConfigurationException("index_summary_capacity_in_mb option was set incorrectly to '"
+                    + conf.index_summary_capacity_in_mb + "', it should be a non-negative integer.");
+
         memoryAllocator = FBUtilities.newOffHeapAllocator(conf.memory_allocator);
 
         if(conf.encryption_options != null)
@@ -469,11 +583,6 @@
             conf.server_encryption_options = conf.encryption_options;
         }
 
-        String allocatorClass = conf.memtable_allocator;
-        if (!allocatorClass.contains("."))
-            allocatorClass = "org.apache.cassandra.utils." + allocatorClass;
-        memtableAllocator = FBUtilities.classForName(allocatorClass, "allocator");
-
         // Hardcoded system keyspaces
         List<KSMetaData> systemKeyspaces = Arrays.asList(KSMetaData.systemKeyspace());
         assert systemKeyspaces.size() == Schema.systemKeyspaceNames.size();
@@ -609,13 +718,13 @@
         }
         catch (ConfigurationException e)
         {
-            logger.error("Fatal error: " + e.getMessage());
+            logger.error("Fatal error: {}", e.getMessage());
             System.err.println("Bad configuration; unable to start server");
             System.exit(1);
         }
         catch (FSWriteError e)
         {
-            logger.error("Fatal error: " + e.getMessage());
+            logger.error("Fatal error: {}", e.getMessage());
             System.err.println(e.getCause().getMessage() + "; unable to start server");
             System.exit(1);
         }
@@ -757,6 +866,11 @@
         return Integer.parseInt(System.getProperty("cassandra.rpc_port", conf.rpc_port.toString()));
     }
 
+    public static int getRpcListenBacklog()
+    {
+        return conf.rpc_listen_backlog;
+    }
+
     public static long getRpcTimeout()
     {
         return conf.request_timeout_in_ms;
@@ -797,6 +911,16 @@
         conf.write_request_timeout_in_ms = timeOutInMillis;
     }
 
+    public static long getCounterWriteRpcTimeout()
+    {
+        return conf.counter_write_request_timeout_in_ms;
+    }
+
+    public static void setCounterWriteRpcTimeout(Long timeOutInMillis)
+    {
+        conf.counter_write_request_timeout_in_ms = timeOutInMillis;
+    }
+
     public static long getCasContentionTimeout()
     {
         return conf.cas_contention_timeout_in_ms;
@@ -838,8 +962,9 @@
             case PAXOS_COMMIT:
             case PAXOS_PREPARE:
             case PAXOS_PROPOSE:
-            case COUNTER_MUTATION:
                 return getWriteRpcTimeout();
+            case COUNTER_MUTATION:
+                return getCounterWriteRpcTimeout();
             default:
                 return getRpcTimeout();
         }
@@ -850,7 +975,12 @@
      */
     public static long getMinRpcTimeout()
     {
-        return Longs.min(getRpcTimeout(), getReadRpcTimeout(), getRangeRpcTimeout(), getWriteRpcTimeout(), getTruncateRpcTimeout());
+        return Longs.min(getRpcTimeout(),
+                         getReadRpcTimeout(),
+                         getRangeRpcTimeout(),
+                         getWriteRpcTimeout(),
+                         getCounterWriteRpcTimeout(),
+                         getTruncateRpcTimeout());
     }
 
     public static double getPhiConvictThreshold()
@@ -873,9 +1003,9 @@
         return conf.concurrent_writes;
     }
 
-    public static int getConcurrentReplicators()
+    public static int getConcurrentCounterWriters()
     {
-        return conf.concurrent_replicates;
+        return conf.concurrent_counter_writes;
     }
 
     public static int getFlushWriters()
@@ -883,26 +1013,11 @@
             return conf.memtable_flush_writers;
     }
 
-    public static int getInMemoryCompactionLimit()
-    {
-        return conf.in_memory_compaction_limit_in_mb * 1024 * 1024;
-    }
-
-    public static void setInMemoryCompactionLimit(int sizeInMB)
-    {
-        conf.in_memory_compaction_limit_in_mb = sizeInMB;
-    }
-
     public static int getConcurrentCompactors()
     {
         return conf.concurrent_compactors;
     }
 
-    public static boolean isMultithreadedCompaction()
-    {
-        return conf.multithreaded_compaction;
-    }
-
     public static int getCompactionThroughputMbPerSec()
     {
         return conf.compaction_throughput_mb_per_sec;
@@ -1016,6 +1131,16 @@
         return rpcAddress;
     }
 
+    public static void setBroadcastRpcAddress(InetAddress broadcastRPCAddr)
+    {
+        broadcastRpcAddress = broadcastRPCAddr;
+    }
+
+    public static InetAddress getBroadcastRpcAddress()
+    {
+        return broadcastRpcAddress;
+    }
+
     public static String getRpcServerType()
     {
         return conf.rpc_server_type;
@@ -1061,11 +1186,6 @@
         return conf.start_native_transport;
     }
 
-    public static InetAddress getNativeTransportAddress()
-    {
-        return getRpcAddress();
-    }
-
     public static int getNativeTransportPort()
     {
         return Integer.parseInt(System.getProperty("cassandra.native_transport_port", conf.native_transport_port.toString()));
@@ -1211,9 +1331,16 @@
         return conf.index_interval;
     }
 
-    public static File getSerializedCachePath(String ksName, String cfName, CacheService.CacheType cacheType, String version)
+    public static File getSerializedCachePath(String ksName, String cfName, UUID cfId, CacheService.CacheType cacheType, String version)
     {
-        return new File(conf.saved_caches_directory, ksName + "-" + cfName + "-" + cacheType + (version == null ? "" : "-" + version + ".db"));
+        StringBuilder builder = new StringBuilder();
+        builder.append(ksName).append('-');
+        builder.append(cfName).append('-');
+        if (cfId != null)
+            builder.append(ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(cfId))).append('-');
+        builder.append(cacheType);
+        builder.append((version == null ? "" : "-" + version + ".db"));
+        return new File(conf.saved_caches_directory, builder.toString());
     }
 
     public static int getDynamicUpdateInterval()
@@ -1259,26 +1386,21 @@
         return conf.hinted_handoff_throttle_in_kb;
     }
 
-    public static void setHintedHandoffThrottleInKB(Integer throttleInKB)
-    {
-        conf.hinted_handoff_throttle_in_kb = throttleInKB;
-    }
-
     public static int getBatchlogReplayThrottleInKB()
     {
         return conf.batchlog_replay_throttle_in_kb;
     }
 
+    public static void setHintedHandoffThrottleInKB(Integer throttleInKB)
+    {
+        conf.hinted_handoff_throttle_in_kb = throttleInKB;
+    }
+
     public static int getMaxHintsThread()
     {
         return conf.max_hints_delivery_threads;
     }
 
-    public static boolean getPreheatKeyCache()
-    {
-        return conf.compaction_preheat_key_cache;
-    }
-
     public static boolean isIncrementalBackupsEnabled()
     {
         return conf.incremental_backups;
@@ -1289,28 +1411,21 @@
         conf.incremental_backups = value;
     }
 
-    public static int getFlushQueueSize()
-    {
-        return conf.memtable_flush_queue_size;
-    }
-
     public static int getFileCacheSizeInMB()
     {
         return conf.file_cache_size_in_mb;
     }
 
-    public static int getTotalMemtableSpaceInMB()
-    {
-        // should only be called if estimatesRealMemtableSize() is true
-        assert conf.memtable_total_space_in_mb > 0;
-        return conf.memtable_total_space_in_mb;
-    }
-
     public static long getTotalCommitlogSpaceInMB()
     {
         return conf.commitlog_total_space_in_mb;
     }
 
+    public static int getSSTablePreempiveOpenIntervalInMB()
+    {
+        return conf.sstable_preemptive_open_interval_in_mb;
+    }
+
     public static boolean getTrickleFsync()
     {
         return conf.trickle_fsync;
@@ -1326,6 +1441,11 @@
         return keyCacheSizeInMB;
     }
 
+    public static long getIndexSummaryCapacityInMB()
+    {
+        return indexSummaryCapacityInMB;
+    }
+
     public static int getKeyCacheSavePeriod()
     {
         return conf.key_cache_save_period;
@@ -1366,6 +1486,31 @@
         return conf.row_cache_keys_to_save;
     }
 
+    public static long getCounterCacheSizeInMB()
+    {
+        return counterCacheSizeInMB;
+    }
+
+    public static int getCounterCacheSavePeriod()
+    {
+        return conf.counter_cache_save_period;
+    }
+
+    public static void setCounterCacheSavePeriod(int counterCacheSavePeriod)
+    {
+        conf.counter_cache_save_period = counterCacheSavePeriod;
+    }
+
+    public static int getCounterCacheKeysToSave()
+    {
+        return conf.counter_cache_keys_to_save;
+    }
+
+    public static void setCounterCacheKeysToSave(int counterCacheKeysToSave)
+    {
+        conf.counter_cache_keys_to_save = counterCacheKeysToSave;
+    }
+
     public static IAllocator getoffHeapMemoryAllocator()
     {
         return memoryAllocator;
@@ -1401,21 +1546,33 @@
         return conf.inter_dc_tcp_nodelay;
     }
 
-    public static boolean shouldPreheatPageCache()
+    public static MemtablePool getMemtableAllocatorPool()
     {
-        return conf.preheat_kernel_page_cache;
+        long heapLimit = ((long) conf.memtable_heap_space_in_mb) << 20;
+        long offHeapLimit = ((long) conf.memtable_offheap_space_in_mb) << 20;
+        switch (conf.memtable_allocation_type)
+        {
+            case unslabbed_heap_buffers:
+                return new HeapPool(heapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+            case heap_buffers:
+                return new SlabPool(heapLimit, 0, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+            case offheap_buffers:
+                if (!FileUtils.isCleanerAvailable())
+                {
+                    logger.error("Could not free direct byte buffer: offheap_buffers is not a safe memtable_allocation_type without this ability, please adjust your config. This feature is only guaranteed to work on an Oracle JVM. Refusing to start.");
+                    System.exit(-1);
+                }
+                return new SlabPool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+            case offheap_objects:
+                return new NativePool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+            default:
+                throw new AssertionError();
+        }
     }
 
-    public static Allocator getMemtableAllocator()
+    public static int getIndexSummaryResizeIntervalInMinutes()
     {
-        try
-        {
-            return memtableAllocator.newInstance();
-        }
-        catch (InstantiationException | IllegalAccessException e)
-        {
-            throw new RuntimeException(e);
-        }
+        return conf.index_summary_resize_interval_in_minutes;
     }
 
     public static boolean hasLargeAddressSpace()
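
The listen_interface / rpc_interface handling added above resolves an interface name to a single address with java.net.NetworkInterface and rejects interfaces that carry more than one address. A standalone, JDK-only sketch of that resolution follows; "eth0" is just an example interface name, and the class is illustrative only, not part of this patch.

import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.util.Enumeration;

public class InterfaceAddressSketch
{
    static InetAddress resolve(String interfaceName) throws SocketException
    {
        NetworkInterface ni = NetworkInterface.getByName(interfaceName);
        if (ni == null)
            throw new IllegalArgumentException("Unknown network interface " + interfaceName);

        Enumeration<InetAddress> addrs = ni.getInetAddresses();
        if (!addrs.hasMoreElements())
            throw new IllegalArgumentException("Interface " + interfaceName + " has no addresses");

        InetAddress first = addrs.nextElement();
        // Mirrors the check above: a bind interface must map to exactly one address.
        if (addrs.hasMoreElements())
            throw new IllegalArgumentException("Interface " + interfaceName + " can't have more than one address");
        return first;
    }

    public static void main(String[] args) throws SocketException
    {
        System.out.println(resolve("eth0"));
    }
}
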
diff --git a/src/java/org/apache/cassandra/thrift/RequestType.java b/src/java/org/apache/cassandra/config/IndexType.java
similarity index 88%
rename from src/java/org/apache/cassandra/thrift/RequestType.java
rename to src/java/org/apache/cassandra/config/IndexType.java
index 0d01362..d39dccb 100644
--- a/src/java/org/apache/cassandra/thrift/RequestType.java
+++ b/src/java/org/apache/cassandra/config/IndexType.java
@@ -15,10 +15,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.thrift;
+package org.apache.cassandra.config;
 
-public enum RequestType
+public enum IndexType
 {
-    READ,
-    WRITE
+    KEYS,
+    CUSTOM,
+    COMPOSITES
 }
diff --git a/src/java/org/apache/cassandra/config/KSMetaData.java b/src/java/org/apache/cassandra/config/KSMetaData.java
index 0a32f5c..8c99191 100644
--- a/src/java/org/apache/cassandra/config/KSMetaData.java
+++ b/src/java/org/apache/cassandra/config/KSMetaData.java
@@ -19,9 +19,8 @@
 
 import java.util.*;
 
+import com.google.common.base.Objects;
 import com.google.common.collect.ImmutableMap;
-import org.apache.commons.lang3.ObjectUtils;
-import org.apache.commons.lang3.StringUtils;
 
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
@@ -43,16 +42,29 @@
     private final Map<String, CFMetaData> cfMetaData;
     public final boolean durableWrites;
 
+    public final UTMetaData userTypes;
+
     KSMetaData(String name, Class<? extends AbstractReplicationStrategy> strategyClass, Map<String, String> strategyOptions, boolean durableWrites, Iterable<CFMetaData> cfDefs)
     {
+        this(name, strategyClass, strategyOptions, durableWrites, cfDefs, new UTMetaData());
+    }
+
+    KSMetaData(String name,
+               Class<? extends AbstractReplicationStrategy> strategyClass,
+               Map<String, String> strategyOptions,
+               boolean durableWrites,
+               Iterable<CFMetaData> cfDefs,
+               UTMetaData userTypes)
+    {
         this.name = name;
         this.strategyClass = strategyClass == null ? NetworkTopologyStrategy.class : strategyClass;
         this.strategyOptions = strategyOptions;
-        Map<String, CFMetaData> cfmap = new HashMap<String, CFMetaData>();
+        Map<String, CFMetaData> cfmap = new HashMap<>();
         for (CFMetaData cfm : cfDefs)
             cfmap.put(cfm.cfName, cfm);
         this.cfMetaData = Collections.unmodifiableMap(cfmap);
         this.durableWrites = durableWrites;
+        this.userTypes = userTypes;
     }
 
     // For new user created keyspaces (through CQL)
@@ -67,12 +79,12 @@
 
     public static KSMetaData newKeyspace(String name, Class<? extends AbstractReplicationStrategy> strategyClass, Map<String, String> options, boolean durablesWrites, Iterable<CFMetaData> cfDefs)
     {
-        return new KSMetaData(name, strategyClass, options, durablesWrites, cfDefs);
+        return new KSMetaData(name, strategyClass, options, durablesWrites, cfDefs, new UTMetaData());
     }
 
     public static KSMetaData cloneWith(KSMetaData ksm, Iterable<CFMetaData> cfDefs)
     {
-        return new KSMetaData(ksm.name, ksm.strategyClass, ksm.strategyOptions, ksm.durableWrites, cfDefs);
+        return new KSMetaData(ksm.name, ksm.strategyClass, ksm.strategyOptions, ksm.durableWrites, cfDefs, ksm.userTypes);
     }
 
     public static KSMetaData systemKeyspace()
@@ -84,11 +96,11 @@
                                                 CFMetaData.PeerEventsCf,
                                                 CFMetaData.HintsCf,
                                                 CFMetaData.IndexCf,
-                                                CFMetaData.SchemaTriggersCf,
-                                                CFMetaData.CounterIdCf,
                                                 CFMetaData.SchemaKeyspacesCf,
                                                 CFMetaData.SchemaColumnFamiliesCf,
                                                 CFMetaData.SchemaColumnsCf,
+                                                CFMetaData.SchemaTriggersCf,
+                                                CFMetaData.SchemaUserTypesCf,
                                                 CFMetaData.CompactionLogCf,
                                                 CFMetaData.CompactionHistoryCf,
                                                 CFMetaData.PaxosCf,
@@ -112,21 +124,29 @@
         return new KSMetaData(name, strategyClass, strategyOptions, false, Arrays.asList(cfDefs));
     }
 
+    @Override
     public int hashCode()
     {
-        return name.hashCode();
+        return Objects.hashCode(name, strategyClass, strategyOptions, cfMetaData, durableWrites, userTypes);
     }
 
-    public boolean equals(Object obj)
+    @Override
+    public boolean equals(Object o)
     {
-        if (!(obj instanceof KSMetaData))
+        if (this == o)
+            return true;
+
+        if (!(o instanceof KSMetaData))
             return false;
-        KSMetaData other = (KSMetaData)obj;
-        return other.name.equals(name)
-                && ObjectUtils.equals(other.strategyClass, strategyClass)
-                && ObjectUtils.equals(other.strategyOptions, strategyOptions)
-                && other.cfMetaData.equals(cfMetaData)
-                && other.durableWrites == durableWrites;
+
+        KSMetaData other = (KSMetaData) o;
+
+        return Objects.equal(name, other.name)
+            && Objects.equal(strategyClass, other.strategyClass)
+            && Objects.equal(strategyOptions, other.strategyOptions)
+            && Objects.equal(cfMetaData, other.cfMetaData)
+            && Objects.equal(durableWrites, other.durableWrites)
+            && Objects.equal(userTypes, other.userTypes);
     }
 
     public Map<String, CFMetaData> cfMetaData()
@@ -137,31 +157,19 @@
     @Override
     public String toString()
     {
-        StringBuilder sb = new StringBuilder();
-        sb.append(name)
-          .append(", rep strategy:")
-          .append(strategyClass.getSimpleName())
-          .append("{")
-          .append(StringUtils.join(cfMetaData.values(), ", "))
-          .append("}")
-          .append(", strategy_options: ")
-          .append(strategyOptions.toString())
-          .append(", durable_writes: ")
-          .append(durableWrites);
-        return sb.toString();
-    }
-
-    public static String convertOldStrategyName(String name)
-    {
-        return name.replace("RackUnawareStrategy", "SimpleStrategy")
-                   .replace("RackAwareStrategy", "OldNetworkTopologyStrategy");
+        return Objects.toStringHelper(this)
+                      .add("name", name)
+                      .add("strategyClass", strategyClass.getSimpleName())
+                      .add("strategyOptions", strategyOptions)
+                      .add("cfMetaData", cfMetaData)
+                      .add("durableWrites", durableWrites)
+                      .add("userTypes", userTypes)
+                      .toString();
     }
 
     public static Map<String,String> optsWithRF(final Integer rf)
     {
-        Map<String, String> ret = new HashMap<String,String>();
-        ret.put("replication_factor", rf.toString());
-        return ret;
+        return Collections.singletonMap("replication_factor", rf.toString());
     }
 
     public static KSMetaData fromThrift(KsDef ksd, CFMetaData... cfDefs) throws ConfigurationException
@@ -179,7 +187,7 @@
 
     public KsDef toThrift()
     {
-        List<CfDef> cfDefs = new ArrayList<CfDef>(cfMetaData.size());
+        List<CfDef> cfDefs = new ArrayList<>(cfMetaData.size());
         for (CFMetaData cfm : cfMetaData().values())
         {
             // Don't expose CF that cannot be correctly handle by thrift; see CASSANDRA-4377 for further details
@@ -193,7 +201,7 @@
         return ksdef;
     }
 
-    public RowMutation toSchemaUpdate(KSMetaData newState, long modificationTimestamp)
+    public Mutation toSchemaUpdate(KSMetaData newState, long modificationTimestamp)
     {
         return newState.toSchema(modificationTimestamp);
     }
@@ -214,41 +222,45 @@
         return this;
     }
 
-
     public KSMetaData reloadAttributes()
     {
-        Row ksDefRow = SystemKeyspace.readSchemaRow(name);
+        Row ksDefRow = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_KEYSPACES_CF, name);
 
         if (ksDefRow.cf == null)
             throw new RuntimeException(String.format("%s not found in the schema definitions keyspaceName (%s).", name, SystemKeyspace.SCHEMA_KEYSPACES_CF));
 
-        return fromSchema(ksDefRow, Collections.<CFMetaData>emptyList());
+        return fromSchema(ksDefRow, Collections.<CFMetaData>emptyList(), userTypes);
     }
 
-    public RowMutation dropFromSchema(long timestamp)
+    public Mutation dropFromSchema(long timestamp)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(name));
-        rm.delete(SystemKeyspace.SCHEMA_KEYSPACES_CF, timestamp);
-        rm.delete(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, timestamp);
-        rm.delete(SystemKeyspace.SCHEMA_COLUMNS_CF, timestamp);
-        rm.delete(SystemKeyspace.SCHEMA_TRIGGERS_CF, timestamp);
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(name));
 
-        return rm;
+        mutation.delete(SystemKeyspace.SCHEMA_KEYSPACES_CF, timestamp);
+        mutation.delete(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, timestamp);
+        mutation.delete(SystemKeyspace.SCHEMA_COLUMNS_CF, timestamp);
+        mutation.delete(SystemKeyspace.SCHEMA_TRIGGERS_CF, timestamp);
+        mutation.delete(SystemKeyspace.SCHEMA_USER_TYPES_CF, timestamp);
+        mutation.delete(SystemKeyspace.INDEX_CF, timestamp);
+
+        return mutation;
     }
 
-    public RowMutation toSchema(long timestamp)
+    public Mutation toSchema(long timestamp)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(name));
-        ColumnFamily cf = rm.addOrGet(CFMetaData.SchemaKeyspacesCf);
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(name));
+        ColumnFamily cf = mutation.addOrGet(CFMetaData.SchemaKeyspacesCf);
+        CFRowAdder adder = new CFRowAdder(cf, CFMetaData.SchemaKeyspacesCf.comparator.builder().build(), timestamp);
 
-        cf.addColumn(Column.create(durableWrites, timestamp, "durable_writes"));
-        cf.addColumn(Column.create(strategyClass.getName(), timestamp, "strategy_class"));
-        cf.addColumn(Column.create(json(strategyOptions), timestamp, "strategy_options"));
+        adder.add("durable_writes", durableWrites);
+        adder.add("strategy_class", strategyClass.getName());
+        adder.add("strategy_options", json(strategyOptions));
 
         for (CFMetaData cfm : cfMetaData.values())
-            cfm.toSchema(rm, timestamp);
+            cfm.toSchema(mutation, timestamp);
 
-        return rm;
+        userTypes.toSchema(mutation, timestamp);
+        return mutation;
     }
 
     /**
@@ -258,7 +270,7 @@
      *
      * @return deserialized keyspace without cf_defs
      */
-    public static KSMetaData fromSchema(Row row, Iterable<CFMetaData> cfms)
+    public static KSMetaData fromSchema(Row row, Iterable<CFMetaData> cfms, UTMetaData userTypes)
     {
         UntypedResultSet.Row result = QueryProcessor.resultify("SELECT * FROM system.schema_keyspaces", row).one();
         try
@@ -267,7 +279,8 @@
                                   AbstractReplicationStrategy.getClass(result.getString("strategy_class")),
                                   fromJsonMap(result.getString("strategy_options")),
                                   result.getBoolean("durable_writes"),
-                                  cfms);
+                                  cfms,
+                                  userTypes);
         }
         catch (ConfigurationException e)
         {
@@ -283,16 +296,16 @@
      *
      * @return deserialized keyspace with cf_defs
      */
-    public static KSMetaData fromSchema(Row serializedKs, Row serializedCFs)
+    public static KSMetaData fromSchema(Row serializedKs, Row serializedCFs, Row serializedUserTypes)
     {
         Map<String, CFMetaData> cfs = deserializeColumnFamilies(serializedCFs);
-        return fromSchema(serializedKs, cfs.values());
+        UTMetaData userTypes = new UTMetaData(UTMetaData.fromSchema(serializedUserTypes));
+        return fromSchema(serializedKs, cfs.values(), userTypes);
     }
 
     /**
      * Deserialize ColumnFamilies from low-level schema representation, all of them belong to the same keyspace
      *
-     * @param row
      * @return map containing name of the ColumnFamily and it's metadata for faster lookup
      */
     public static Map<String, CFMetaData> deserializeColumnFamilies(Row row)
@@ -300,24 +313,13 @@
         if (row.cf == null)
             return Collections.emptyMap();
 
-        Map<String, CFMetaData> cfms = new HashMap<String, CFMetaData>();
+        Map<String, CFMetaData> cfms = new HashMap<>();
         UntypedResultSet results = QueryProcessor.resultify("SELECT * FROM system.schema_columnfamilies", row);
         for (UntypedResultSet.Row result : results)
         {
             CFMetaData cfm = CFMetaData.fromSchema(result);
             cfms.put(cfm.cfName, cfm);
         }
-
-        for (CFMetaData cfm : cfms.values())
-        {
-            Row columnRow = SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_COLUMNS_CF, cfm.ksName, cfm.cfName);
-            // This may replace some existing definition coming from the old key, column and
-            // value aliases. But that's what we want (see CFMetaData.fromSchemaNoColumnsNoTriggers).
-            for (ColumnDefinition cd : ColumnDefinition.fromSchema(columnRow, cfm))
-                cfm.addOrReplaceColumnDefinition(cd);
-            cfm.rebuild();
-        }
-
         return cfms;
     }
 }
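
The KSMetaData diff above replaces the commons-lang ObjectUtils/StringUtils based equals/hashCode/toString with Guava's com.google.common.base.Objects helpers (Objects.equal, Objects.hashCode, Objects.toStringHelper). A tiny sketch of the same idiom on a hypothetical value class, not part of this patch:

import com.google.common.base.Objects;

public class ValueSketch
{
    final String name;
    final int count;

    ValueSketch(String name, int count) { this.name = name; this.count = count; }

    @Override
    public boolean equals(Object o)
    {
        if (this == o) return true;
        if (!(o instanceof ValueSketch)) return false;
        ValueSketch other = (ValueSketch) o;
        // Objects.equal handles nulls, matching the pattern used in KSMetaData.equals above.
        return Objects.equal(name, other.name) && count == other.count;
    }

    @Override
    public int hashCode() { return Objects.hashCode(name, count); }

    @Override
    public String toString()
    {
        return Objects.toStringHelper(this).add("name", name).add("count", count).toString();
    }
}
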
diff --git a/src/java/org/apache/cassandra/config/Schema.java b/src/java/org/apache/cassandra/config/Schema.java
index 0da65ce..8e9802f 100644
--- a/src/java/org/apache/cassandra/config/Schema.java
+++ b/src/java/org/apache/cassandra/config/Schema.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.config;
 
-import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
@@ -31,7 +30,7 @@
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.utils.ConcurrentBiMap;
@@ -130,6 +129,17 @@
         return keyspaceInstances.get(keyspaceName);
     }
 
+    public ColumnFamilyStore getColumnFamilyStoreInstance(UUID cfId)
+    {
+        Pair<String, String> pair = cfIdMap.inverse().get(cfId);
+        if (pair == null)
+            return null;
+        Keyspace instance = getKeyspaceInstance(pair.left);
+        if (instance == null)
+            return null;
+        return instance.getColumnFamilyStore(cfId);
+    }
+
     /**
      * Store given Keyspace instance to the schema
      *
@@ -218,23 +228,6 @@
     }
 
     /**
-     * Get column comparator for ColumnFamily but it's keyspace/name
-     *
-     * @param ksName The keyspace name
-     * @param cfName The ColumnFamily name
-     *
-     * @return The comparator of the ColumnFamily
-     */
-    public AbstractType<?> getComparator(String ksName, String cfName)
-    {
-        assert ksName != null;
-        CFMetaData cfmd = getCFMetaData(ksName, cfName);
-        if (cfmd == null)
-            throw new IllegalArgumentException("Unknown ColumnFamily " + cfName + " in keyspace " + ksName);
-        return cfmd.comparator;
-    }
-
-    /**
      * Get metadata about keyspace by its name
      *
      * @param keyspaceName The name of the keyspace
@@ -309,6 +302,15 @@
     }
 
     /**
+     * @param cfId The identifier of the ColumnFamily to lookup
+     * @return true if the CF id is a known one, false otherwise.
+     */
+    public boolean hasCF(UUID cfId)
+    {
+        return cfIdMap.containsValue(cfId);
+    }
+
+    /**
      * Lookup keyspace/ColumnFamily identifier
      *
      * @param ksName The keyspace name
@@ -346,6 +348,7 @@
     public void purge(CFMetaData cfm)
     {
         cfIdMap.remove(Pair.create(cfm.ksName, cfm.cfName));
+        cfm.markPurged();
     }
 
     /* Version control */
@@ -374,9 +377,8 @@
                     continue;
 
                 // we want to digest only live columns
-                ColumnFamilyStore.removeDeletedColumnsOnly(row.cf, Integer.MAX_VALUE);
+                ColumnFamilyStore.removeDeletedColumnsOnly(row.cf, Integer.MAX_VALUE, SecondaryIndexManager.nullUpdater);
                 row.cf.purgeTombstones(Integer.MAX_VALUE);
-                
                 row.cf.updateDigest(versionDigest);
             }
 
@@ -416,14 +418,14 @@
 
     public static boolean invalidSchemaRow(Row row)
     {
-        return row.cf == null || (row.cf.isMarkedForDelete() && row.cf.getColumnCount() == 0);
+        return row.cf == null || (row.cf.isMarkedForDelete() && !row.cf.hasColumns());
     }
 
     public static boolean ignoredSchemaRow(Row row)
     {
         try
         {
-            return systemKeyspaceNames.contains(ByteBufferUtil.string(row.key.key));
+            return systemKeyspaceNames.contains(ByteBufferUtil.string(row.key.getKey()));
         }
         catch (CharacterCodingException e)
         {
diff --git a/src/java/org/apache/cassandra/config/TriggerDefinition.java b/src/java/org/apache/cassandra/config/TriggerDefinition.java
index 69e06b1..aaaf631 100644
--- a/src/java/org/apache/cassandra/config/TriggerDefinition.java
+++ b/src/java/org/apache/cassandra/config/TriggerDefinition.java
@@ -22,15 +22,13 @@
 
 import com.google.common.base.Objects;
 
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.thrift.TriggerDef;
 
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-
 public class TriggerDefinition
 {
     private static final String TRIGGER_NAME = "trigger_name";
@@ -74,38 +72,37 @@
     }
 
     /**
-     * Add specified trigger to the schema using given row.
+     * Add the specified trigger to the schema using the given mutation.
      *
-     * @param rm        The schema row mutation
+     * @param mutation  The schema mutation
      * @param cfName    The name of the parent ColumnFamily
      * @param timestamp The timestamp to use for the columns
      */
-    public void toSchema(RowMutation rm, String cfName, long timestamp)
+    public void toSchema(Mutation mutation, String cfName, long timestamp)
     {
-        ColumnFamily cf = rm.addOrGet(SystemKeyspace.SCHEMA_TRIGGERS_CF);
+        ColumnFamily cf = mutation.addOrGet(SystemKeyspace.SCHEMA_TRIGGERS_CF);
 
-        ColumnNameBuilder builder = CFMetaData.SchemaTriggersCf.getCfDef().getColumnNameBuilder();
-        builder.add(bytes(cfName)).add(bytes(name));
+        CFMetaData cfm = CFMetaData.SchemaTriggersCf;
+        Composite prefix = cfm.comparator.make(cfName, name);
+        CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
 
-        cf.addColumn(builder.copy().add(bytes("")).build(), bytes(""), timestamp); // the row marker
-        cf.addColumn(builder.copy().add(bytes(TRIGGER_OPTIONS)).add(bytes(CLASS)).build(), bytes(classOption), timestamp);
+        adder.addMapEntry(TRIGGER_OPTIONS, CLASS, classOption);
     }
 
     /**
-     * Drop specified trigger from the schema using given row.
+     * Drop the specified trigger from the schema using the given mutation.
      *
-     * @param rm        The schema row mutation
+     * @param mutation  The schema mutation
      * @param cfName    The name of the parent ColumnFamily
      * @param timestamp The timestamp to use for the tombstone
      */
-    public void deleteFromSchema(RowMutation rm, String cfName, long timestamp)
+    public void deleteFromSchema(Mutation mutation, String cfName, long timestamp)
     {
-        ColumnFamily cf = rm.addOrGet(SystemKeyspace.SCHEMA_TRIGGERS_CF);
+        ColumnFamily cf = mutation.addOrGet(SystemKeyspace.SCHEMA_TRIGGERS_CF);
         int ldt = (int) (System.currentTimeMillis() / 1000);
 
-        ColumnNameBuilder builder = CFMetaData.SchemaTriggersCf.getCfDef().getColumnNameBuilder();
-        builder.add(bytes(cfName)).add(bytes(name));
-        cf.addAtom(new RangeTombstone(builder.build(), builder.buildAsEndOfRange(), timestamp, ldt));
+        Composite prefix = CFMetaData.SchemaTriggersCf.comparator.make(cfName, name);
+        cf.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
     }
 
     public static TriggerDefinition fromThrift(TriggerDef thriftDef)
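
The add/drop paths above follow the 2.1 schema-row idiom: build a Composite prefix from the clustering values, then either write cells through a CFRowAdder or cover the whole prefix with a RangeTombstone. A condensed sketch of that idiom, assuming cf, cfName, name, classOption and timestamp in scope as in the methods above (fragment for illustration only):

    // Clustering prefix (cfName, triggerName) identifying this trigger's schema row.
    Composite prefix = CFMetaData.SchemaTriggersCf.comparator.make(cfName, name);

    // Write path: CFRowAdder stamps every cell it adds with this prefix and timestamp.
    CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
    adder.addMapEntry(TRIGGER_OPTIONS, CLASS, classOption);

    // Drop path: a single range tombstone from prefix to prefix.end() deletes every
    // cell under the prefix.
    int ldt = (int) (System.currentTimeMillis() / 1000);
    cf.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
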
diff --git a/src/java/org/apache/cassandra/config/UTMetaData.java b/src/java/org/apache/cassandra/config/UTMetaData.java
new file mode 100644
index 0000000..ee653a8
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/UTMetaData.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.config;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Defined (and loaded) user types.
+ *
+ * In practice, because user types are global, we have only one instance of
+ * this class, which is retrieved through the Schema class.
+ */
+public final class UTMetaData
+{
+    private final Map<ByteBuffer, UserType> userTypes;
+
+    public UTMetaData()
+    {
+        this(new HashMap<ByteBuffer, UserType>());
+    }
+
+    UTMetaData(Map<ByteBuffer, UserType> types)
+    {
+        this.userTypes = types;
+    }
+
+    private static UserType fromSchema(UntypedResultSet.Row row)
+    {
+        try
+        {
+            String keyspace = row.getString("keyspace_name");
+            ByteBuffer name = ByteBufferUtil.bytes(row.getString("type_name"));
+            List<String> rawColumns = row.getList("field_names", UTF8Type.instance);
+            List<String> rawTypes = row.getList("field_types", UTF8Type.instance);
+
+            List<ByteBuffer> columns = new ArrayList<>(rawColumns.size());
+            for (String rawColumn : rawColumns)
+                columns.add(ByteBufferUtil.bytes(rawColumn));
+
+            List<AbstractType<?>> types = new ArrayList<>(rawTypes.size());
+            for (String rawType : rawTypes)
+                types.add(TypeParser.parse(rawType));
+
+            return new UserType(keyspace, name, columns, types);
+        }
+        catch (RequestValidationException e)
+        {
+            // If it has been written in the schema, it should be valid
+            throw new AssertionError();
+        }
+    }
+
+    public static Map<ByteBuffer, UserType> fromSchema(Row row)
+    {
+        UntypedResultSet results = QueryProcessor.resultify("SELECT * FROM system." + SystemKeyspace.SCHEMA_USER_TYPES_CF, row);
+        Map<ByteBuffer, UserType> types = new HashMap<>(results.size());
+        for (UntypedResultSet.Row result : results)
+        {
+            UserType type = fromSchema(result);
+            types.put(type.name, type);
+        }
+        return types;
+    }
+
+    public static Mutation toSchema(UserType newType, long timestamp)
+    {
+        return toSchema(new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(newType.keyspace)), newType, timestamp);
+    }
+
+    public static Mutation toSchema(Mutation mutation, UserType newType, long timestamp)
+    {
+        ColumnFamily cf = mutation.addOrGet(SystemKeyspace.SCHEMA_USER_TYPES_CF);
+
+        Composite prefix = CFMetaData.SchemaUserTypesCf.comparator.make(newType.name);
+        CFRowAdder adder = new CFRowAdder(cf, prefix, timestamp);
+
+        adder.resetCollection("field_names");
+        adder.resetCollection("field_types");
+
+        for (int i = 0; i < newType.size(); i++)
+        {
+            adder.addListEntry("field_names", newType.fieldName(i));
+            adder.addListEntry("field_types", newType.fieldType(i).toString());
+        }
+        return mutation;
+    }
+
+    public Mutation toSchema(Mutation mutation, long timestamp)
+    {
+        for (UserType ut : userTypes.values())
+            toSchema(mutation, ut, timestamp);
+        return mutation;
+    }
+
+    public static Mutation dropFromSchema(UserType droppedType, long timestamp)
+    {
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, SystemKeyspace.getSchemaKSKey(droppedType.keyspace));
+        ColumnFamily cf = mutation.addOrGet(SystemKeyspace.SCHEMA_USER_TYPES_CF);
+        int ldt = (int) (System.currentTimeMillis() / 1000);
+
+        Composite prefix = CFMetaData.SchemaUserTypesCf.comparator.make(droppedType.name);
+        cf.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
+
+        return mutation;
+    }
+
+    public UserType getType(ByteBuffer typeName)
+    {
+        return userTypes.get(typeName);
+    }
+
+    public Map<ByteBuffer, UserType> getAllTypes()
+    {
+        // Copy to avoid concurrent modification while iterating. Not intended to be called on a critical path anyway
+        return new HashMap<>(userTypes);
+    }
+
+    // This is *not* thread safe but is only called from DefsTables, which is synchronized.
+    public void addType(UserType type)
+    {
+        UserType old = userTypes.get(type.name);
+        assert old == null || type.isCompatibleWith(old);
+        userTypes.put(type.name, type);
+    }
+
+    // Same remarks as for addType
+    public void removeType(UserType type)
+    {
+        userTypes.remove(type.name);
+    }
+
+    public boolean equals(Object that)
+    {
+        if (!(that instanceof UTMetaData))
+            return false;
+        return userTypes.equals(((UTMetaData) that).userTypes);
+    }
+}
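
A small usage sketch for the new UTMetaData container, assuming org.apache.cassandra.db.marshal.UserType instances obtained from parsing or from fromSchema above; the UserTypeSketch class and its method names are illustrative only:

    import java.nio.ByteBuffer;

    import org.apache.cassandra.config.UTMetaData;
    import org.apache.cassandra.db.Mutation;
    import org.apache.cassandra.db.marshal.UserType;
    import org.apache.cassandra.utils.ByteBufferUtil;

    final class UserTypeSketch
    {
        // Registers the type in the in-memory map and returns the schema mutation
        // that would persist it; applying/announcing the mutation is left to the caller.
        static Mutation register(UTMetaData types, UserType newType, long timestamp)
        {
            types.addType(newType);            // asserts compatibility with any prior definition
            return UTMetaData.toSchema(newType, timestamp);
        }

        // Looks a type up by its UTF-8 encoded name; returns null when unknown.
        static UserType find(UTMetaData types, String typeName)
        {
            ByteBuffer key = ByteBufferUtil.bytes(typeName);
            return types.getType(key);
        }
    }
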
diff --git a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
index b520d07..0b62ff4 100644
--- a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
+++ b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
@@ -18,16 +18,22 @@
 package org.apache.cassandra.config;
 
 import java.beans.IntrospectionException;
-import java.io.IOException;
+import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.InputStream;
+import java.io.IOException;
 import java.net.URL;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 
+import com.google.common.base.Joiner;
+import com.google.common.io.ByteStreams;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.FileUtils;
 import org.yaml.snakeyaml.TypeDescription;
 import org.yaml.snakeyaml.Yaml;
 import org.yaml.snakeyaml.error.YAMLException;
@@ -61,7 +67,13 @@
             ClassLoader loader = DatabaseDescriptor.class.getClassLoader();
             url = loader.getResource(configUrl);
             if (url == null)
-                throw new ConfigurationException("Cannot locate " + configUrl);
+            {
+                String required = "file:" + File.separator + File.separator;
+                if (!configUrl.startsWith(required))
+                    throw new ConfigurationException("Expecting URI in variable: [cassandra.config].  Please prefix the file with " + required + File.separator +
+                            " for local files or " + required + "<server>" + File.separator + " for remote files.  Aborting.");
+                throw new ConfigurationException("Cannot locate " + configUrl + ".  If this is a local file, please confirm you've provided " + required + File.separator + " as a URI prefix.");
+            }
         }
 
         return url;
@@ -78,15 +90,19 @@
         try
         {
             logger.info("Loading settings from {}", url);
-            try
+            byte[] configBytes;
+            try (InputStream is = url.openStream())
             {
-                input = url.openStream();
+                configBytes = ByteStreams.toByteArray(is);
             }
             catch (IOException e)
             {
                 // getStorageConfigURL should have ruled this out
                 throw new AssertionError(e);
             }
+            
+            logConfig(configBytes);
+            
             org.yaml.snakeyaml.constructor.Constructor constructor = new org.yaml.snakeyaml.constructor.Constructor(Config.class);
             TypeDescription seedDesc = new TypeDescription(SeedProviderDef.class);
             seedDesc.putMapPropertyType("parameters", String.class, String.class);
@@ -94,7 +110,7 @@
             MissingPropertiesChecker propertiesChecker = new MissingPropertiesChecker();
             constructor.setPropertyUtils(propertiesChecker);
             Yaml yaml = new Yaml(constructor);
-            Config result = yaml.loadAs(input, Config.class);
+            Config result = yaml.loadAs(new ByteArrayInputStream(configBytes), Config.class);
             result.configHintedHandoff();
             propertiesChecker.check();
             return result;
@@ -103,10 +119,20 @@
         {
             throw new ConfigurationException("Invalid yaml", e);
         }
-        finally
+    }
+
+    private void logConfig(byte[] configBytes)
+    {
+        Map<Object, Object> configMap = new TreeMap<>((Map<?, ?>) new Yaml().load(new ByteArrayInputStream(configBytes)));
+        // these keys contain passwords, don't log them
+        for (String sensitiveKey : new String[] { "client_encryption_options", "server_encryption_options" })
         {
-            FileUtils.closeQuietly(input);
+            if (configMap.containsKey(sensitiveKey))
+            {
+                configMap.put(sensitiveKey, "<REDACTED>");
+            }
         }
+        logger.info("Node configuration:[" + Joiner.on("; ").join(configMap.entrySet()) + "]");
     }
     
     private static class MissingPropertiesChecker extends PropertyUtils 
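
The logConfig change above reads the YAML once into a byte array so the same bytes can be logged (with password-bearing sections redacted) and then parsed. A standalone sketch of that redaction step, assuming snakeyaml and Guava on the classpath; the class name and key list mirror the patch, but the sketch itself is illustrative only:

    import java.io.ByteArrayInputStream;
    import java.util.Map;
    import java.util.TreeMap;

    import com.google.common.base.Joiner;
    import org.yaml.snakeyaml.Yaml;

    final class ConfigLogSketch
    {
        static String describe(byte[] configBytes)
        {
            // Sort keys for stable output; the raw YAML loads as a map of settings.
            Map<Object, Object> configMap =
                new TreeMap<>((Map<?, ?>) new Yaml().load(new ByteArrayInputStream(configBytes)));

            // Replace sections that may contain passwords before rendering the map.
            for (String sensitiveKey : new String[]{ "client_encryption_options", "server_encryption_options" })
                if (configMap.containsKey(sensitiveKey))
                    configMap.put(sensitiveKey, "<REDACTED>");

            return "Node configuration:[" + Joiner.on("; ").join(configMap.entrySet()) + "]";
        }
    }
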
diff --git a/src/java/org/apache/cassandra/cql/AlterTableStatement.java b/src/java/org/apache/cassandra/cql/AlterTableStatement.java
index 48e64c8..5bc7011 100644
--- a/src/java/org/apache/cassandra/cql/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql/AlterTableStatement.java
@@ -17,15 +17,21 @@
  */
 package org.apache.cassandra.cql;
 
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.marshal.TypeParser;
-import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.io.compress.CompressionParameters;
-
 import java.nio.ByteBuffer;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.cassandra.cache.CachingOptions;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.io.compress.CompressionParameters;
+
 public class AlterTableStatement
 {
     public static enum OperationType
@@ -66,21 +72,21 @@
     public CFMetaData getCFMetaData(String keyspace) throws ConfigurationException, InvalidRequestException, SyntaxException
     {
         CFMetaData meta = Schema.instance.getCFMetaData(keyspace, columnFamily);
-        CFMetaData cfm = meta.clone();
+        CFMetaData cfm = meta.copy();
 
         ByteBuffer columnName = this.oType == OperationType.OPTS ? null
-                                                                 : meta.comparator.fromStringCQL2(this.columnName);
+                                                                 : meta.comparator.subtype(0).fromStringCQL2(this.columnName);
 
         switch (oType)
         {
             case ADD:
-                cfm.addColumnDefinition(ColumnDefinition.regularDef(columnName, TypeParser.parse(validator), null));
+                cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, columnName, TypeParser.parse(validator), null));
                 break;
 
             case ALTER:
                 // We only look for the first key alias which is ok for CQL2
                 ColumnDefinition partionKeyDef = cfm.partitionKeyColumns().get(0);
-                if (partionKeyDef.name.equals(columnName))
+                if (partionKeyDef.name.bytes.equals(columnName))
                 {
                     cfm.keyValidator(TypeParser.parse(validator));
                 }
@@ -90,7 +96,7 @@
 
                     for (ColumnDefinition columnDef : cfm.regularColumns())
                     {
-                        if (columnDef.name.equals(columnName))
+                        if (columnDef.name.bytes.equals(columnName))
                         {
                             toUpdate = columnDef;
                             break;
@@ -102,7 +108,7 @@
                                     this.columnName,
                                     columnFamily));
 
-                    toUpdate.setValidator(TypeParser.parse(validator));
+                    cfm.addOrReplaceColumnDefinition(toUpdate.withNewType(TypeParser.parse(validator)));
                 }
                 break;
 
@@ -111,7 +117,7 @@
 
                 for (ColumnDefinition columnDef : cfm.regularColumns())
                 {
-                    if (columnDef.name.equals(columnName))
+                    if (columnDef.name.bytes.equals(columnName))
                     {
                         toDelete = columnDef;
                     }
@@ -173,17 +179,15 @@
         cfm.readRepairChance(cfProps.getPropertyDouble(CFPropDefs.KW_READREPAIRCHANCE, cfm.getReadRepairChance()));
         cfm.dcLocalReadRepairChance(cfProps.getPropertyDouble(CFPropDefs.KW_DCLOCALREADREPAIRCHANCE, cfm.getDcLocalReadRepair()));
         cfm.gcGraceSeconds(cfProps.getPropertyInt(CFPropDefs.KW_GCGRACESECONDS, cfm.getGcGraceSeconds()));
-        cfm.replicateOnWrite(cfProps.getPropertyBoolean(CFPropDefs.KW_REPLICATEONWRITE, cfm.getReplicateOnWrite()));
         int minCompactionThreshold = cfProps.getPropertyInt(CFPropDefs.KW_MINCOMPACTIONTHRESHOLD, cfm.getMinCompactionThreshold());
         int maxCompactionThreshold = cfProps.getPropertyInt(CFPropDefs.KW_MAXCOMPACTIONTHRESHOLD, cfm.getMaxCompactionThreshold());
         if (minCompactionThreshold <= 0 || maxCompactionThreshold <= 0)
             throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been deprecated, set the compaction option 'enabled' to false instead.");
         cfm.minCompactionThreshold(minCompactionThreshold);
         cfm.maxCompactionThreshold(maxCompactionThreshold);
-        cfm.caching(CFMetaData.Caching.fromString(cfProps.getPropertyString(CFPropDefs.KW_CACHING, cfm.getCaching().toString())));
+        cfm.caching(CachingOptions.fromString(cfProps.getPropertyString(CFPropDefs.KW_CACHING, cfm.getCaching().toString())));
         cfm.defaultTimeToLive(cfProps.getPropertyInt(CFPropDefs.KW_DEFAULT_TIME_TO_LIVE, cfm.getDefaultTimeToLive()));
         cfm.speculativeRetry(CFMetaData.SpeculativeRetry.fromString(cfProps.getPropertyString(CFPropDefs.KW_SPECULATIVE_RETRY, cfm.getSpeculativeRetry().toString())));
-        cfm.populateIoCacheOnFlush(cfProps.getPropertyBoolean(CFPropDefs.KW_POPULATE_IO_CACHE_ON_FLUSH, cfm.populateIoCacheOnFlush()));
         cfm.bloomFilterFpChance(cfProps.getPropertyDouble(CFPropDefs.KW_BF_FP_CHANCE, cfm.getBloomFilterFpChance()));
         cfm.memtableFlushPeriod(cfProps.getPropertyInt(CFPropDefs.KW_MEMTABLE_FLUSH_PERIOD, cfm.getMemtableFlushPeriod()));
 
diff --git a/src/java/org/apache/cassandra/cql/CFPropDefs.java b/src/java/org/apache/cassandra/cql/CFPropDefs.java
index a7d3147..f65cb94 100644
--- a/src/java/org/apache/cassandra/cql/CFPropDefs.java
+++ b/src/java/org/apache/cassandra/cql/CFPropDefs.java
@@ -47,12 +47,11 @@
     public static final String KW_DEFAULTVALIDATION = "default_validation";
     public static final String KW_MINCOMPACTIONTHRESHOLD = "min_compaction_threshold";
     public static final String KW_MAXCOMPACTIONTHRESHOLD = "max_compaction_threshold";
-    public static final String KW_REPLICATEONWRITE = "replicate_on_write";
     public static final String KW_COMPACTION_STRATEGY_CLASS = "compaction_strategy_class";
     public static final String KW_CACHING = "caching";
+    public static final String KW_ROWS_PER_PARTITION_TO_CACHE = "rows_per_partition_to_cache";
     public static final String KW_DEFAULT_TIME_TO_LIVE = "default_time_to_live";
     public static final String KW_SPECULATIVE_RETRY = "speculative_retry";
-    public static final String KW_POPULATE_IO_CACHE_ON_FLUSH = "populate_io_cache_on_flush";
     public static final String KW_BF_FP_CHANCE = "bloom_filter_fp_chance";
     public static final String KW_MEMTABLE_FLUSH_PERIOD = "memtable_flush_period_in_ms";
 
@@ -90,12 +89,11 @@
         keywords.add(KW_DEFAULTVALIDATION);
         keywords.add(KW_MINCOMPACTIONTHRESHOLD);
         keywords.add(KW_MAXCOMPACTIONTHRESHOLD);
-        keywords.add(KW_REPLICATEONWRITE);
         keywords.add(KW_COMPACTION_STRATEGY_CLASS);
         keywords.add(KW_CACHING);
+        keywords.add(KW_ROWS_PER_PARTITION_TO_CACHE);
         keywords.add(KW_DEFAULT_TIME_TO_LIVE);
         keywords.add(KW_SPECULATIVE_RETRY);
-        keywords.add(KW_POPULATE_IO_CACHE_ON_FLUSH);
         keywords.add(KW_BF_FP_CHANCE);
         keywords.add(KW_MEMTABLE_FLUSH_PERIOD);
 
@@ -107,6 +105,8 @@
         obsoleteKeywords.add("memtable_operations_in_millions");
         obsoleteKeywords.add("memtable_flush_after_mins");
         obsoleteKeywords.add("row_cache_provider");
+        obsoleteKeywords.add("replicate_on_write");
+        obsoleteKeywords.add("populate_io_cache_on_flush");
 
         allowedKeywords.addAll(keywords);
         allowedKeywords.addAll(obsoleteKeywords);
diff --git a/src/java/org/apache/cassandra/cql/Cql.g b/src/java/org/apache/cassandra/cql/Cql.g
index 3c41f44..f76be27 100644
--- a/src/java/org/apache/cassandra/cql/Cql.g
+++ b/src/java/org/apache/cassandra/cql/Cql.g
@@ -86,7 +86,7 @@
     public Token nextToken() {
         super.nextToken();
         if (tokens.size() == 0)
-            return Token.EOF_TOKEN;
+            return new CommonToken(Token.EOF);
         return tokens.remove(0);
     }
     
diff --git a/src/java/org/apache/cassandra/cql/CreateColumnFamilyStatement.java b/src/java/org/apache/cassandra/cql/CreateColumnFamilyStatement.java
index dd56387..4cb9eba 100644
--- a/src/java/org/apache/cassandra/cql/CreateColumnFamilyStatement.java
+++ b/src/java/org/apache/cassandra/cql/CreateColumnFamilyStatement.java
@@ -24,8 +24,10 @@
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
 import org.apache.cassandra.db.ColumnFamilyType;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.TypeParser;
@@ -122,20 +124,20 @@
     }
 
     // Column definitions
-    private Map<ByteBuffer, ColumnDefinition> getColumns(AbstractType<?> comparator) throws InvalidRequestException
+    private List<ColumnDefinition> getColumns(CFMetaData cfm) throws InvalidRequestException
     {
-        Map<ByteBuffer, ColumnDefinition> columnDefs = new HashMap<ByteBuffer, ColumnDefinition>();
+        List<ColumnDefinition> columnDefs = new ArrayList<>(columns.size());
 
         for (Map.Entry<Term, String> col : columns.entrySet())
         {
             try
             {
-                ByteBuffer columnName = comparator.fromStringCQL2(col.getKey().getText());
+                ByteBuffer columnName = cfm.comparator.asAbstractType().fromStringCQL2(col.getKey().getText());
                 String validatorClassName = CFPropDefs.comparators.containsKey(col.getValue())
                                           ? CFPropDefs.comparators.get(col.getValue())
                                           : col.getValue();
                 AbstractType<?> validator = TypeParser.parse(validatorClassName);
-                columnDefs.put(columnName, ColumnDefinition.regularDef(columnName, validator, null));
+                columnDefs.add(ColumnDefinition.regularDef(cfm, columnName, validator, null));
             }
             catch (ConfigurationException e)
             {
@@ -174,8 +176,7 @@
             newCFMD = new CFMetaData(keyspace,
                                      name,
                                      ColumnFamilyType.Standard,
-                                     comparator,
-                                     null);
+                                     new SimpleDenseCellNameType(comparator));
 
             if (CFMetaData.DEFAULT_COMPRESSOR != null && cfProps.compressionParameters.isEmpty())
                 cfProps.compressionParameters.put(CompressionParameters.SSTABLE_COMPRESSION, CFMetaData.DEFAULT_COMPRESSOR);
@@ -184,29 +185,27 @@
             if (minCompactionThreshold <= 0 || maxCompactionThreshold <= 0)
                 throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been deprecated, set the compaction option 'enabled' to false instead.");
 
-            newCFMD.comment(cfProps.getProperty(CFPropDefs.KW_COMMENT))
+            newCFMD.addAllColumnDefinitions(getColumns(newCFMD))
+                   .comment(cfProps.getProperty(CFPropDefs.KW_COMMENT))
                    .readRepairChance(getPropertyDouble(CFPropDefs.KW_READREPAIRCHANCE, CFMetaData.DEFAULT_READ_REPAIR_CHANCE))
                    .dcLocalReadRepairChance(getPropertyDouble(CFPropDefs.KW_DCLOCALREADREPAIRCHANCE, CFMetaData.DEFAULT_DCLOCAL_READ_REPAIR_CHANCE))
-                   .replicateOnWrite(getPropertyBoolean(CFPropDefs.KW_REPLICATEONWRITE, CFMetaData.DEFAULT_REPLICATE_ON_WRITE))
                    .gcGraceSeconds(getPropertyInt(CFPropDefs.KW_GCGRACESECONDS, CFMetaData.DEFAULT_GC_GRACE_SECONDS))
                    .defaultValidator(cfProps.getValidator())
                    .minCompactionThreshold(minCompactionThreshold)
                    .maxCompactionThreshold(maxCompactionThreshold)
-                   .columnMetadata(getColumns(comparator))
                    .keyValidator(TypeParser.parse(CFPropDefs.comparators.get(getKeyType())))
                    .compactionStrategyClass(cfProps.compactionStrategyClass)
                    .compactionStrategyOptions(cfProps.compactionStrategyOptions)
                    .compressionParameters(CompressionParameters.create(cfProps.compressionParameters))
-                   .caching(CFMetaData.Caching.fromString(getPropertyString(CFPropDefs.KW_CACHING, CFMetaData.DEFAULT_CACHING_STRATEGY.toString())))
+                   .caching(CachingOptions.fromString(getPropertyString(CFPropDefs.KW_CACHING, CFMetaData.DEFAULT_CACHING_STRATEGY.toString())))
                    .speculativeRetry(CFMetaData.SpeculativeRetry.fromString(getPropertyString(CFPropDefs.KW_SPECULATIVE_RETRY, CFMetaData.DEFAULT_SPECULATIVE_RETRY.toString())))
                    .bloomFilterFpChance(getPropertyDouble(CFPropDefs.KW_BF_FP_CHANCE, null))
                    .memtableFlushPeriod(getPropertyInt(CFPropDefs.KW_MEMTABLE_FLUSH_PERIOD, 0))
-                   .defaultTimeToLive(getPropertyInt(CFPropDefs.KW_DEFAULT_TIME_TO_LIVE, CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE))
-                   .populateIoCacheOnFlush(getPropertyBoolean(CFPropDefs.KW_POPULATE_IO_CACHE_ON_FLUSH, CFMetaData.DEFAULT_POPULATE_IO_CACHE_ON_FLUSH));
+                   .defaultTimeToLive(getPropertyInt(CFPropDefs.KW_DEFAULT_TIME_TO_LIVE, CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE));
 
             // CQL2 can have null keyAliases
             if (keyAlias != null)
-                newCFMD.addColumnDefinition(ColumnDefinition.partitionKeyDef(keyAlias, newCFMD.getKeyValidator(), null));
+                newCFMD.addColumnDefinition(ColumnDefinition.partitionKeyDef(newCFMD, keyAlias, newCFMD.getKeyValidator(), null));
         }
         catch (ConfigurationException e)
         {
diff --git a/src/java/org/apache/cassandra/cql/DeleteStatement.java b/src/java/org/apache/cassandra/cql/DeleteStatement.java
index 0a1f90c..71942e4 100644
--- a/src/java/org/apache/cassandra/cql/DeleteStatement.java
+++ b/src/java/org/apache/cassandra/cql/DeleteStatement.java
@@ -24,8 +24,9 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.IMutation;
-import org.apache.cassandra.db.RowMutation;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
@@ -75,40 +76,39 @@
         clientState.hasColumnFamilyAccess(keyspace, columnFamily, Permission.MODIFY);
         AbstractType<?> keyType = Schema.instance.getCFMetaData(keyspace, columnFamily).getKeyValidator();
 
-        List<IMutation> rowMutations = new ArrayList<IMutation>(keys.size());
+        List<IMutation> mutations = new ArrayList<IMutation>(keys.size());
 
         for (Term key : keys)
-        {
-            rowMutations.add(mutationForKey(key.getByteBuffer(keyType, variables), keyspace, timestamp, clientState, variables, metadata));
-        }
+            mutations.add(mutationForKey(key.getByteBuffer(keyType, variables), keyspace, timestamp, clientState, variables, metadata));
 
-        return rowMutations;
+        return mutations;
     }
 
-    public RowMutation mutationForKey(ByteBuffer key, String keyspace, Long timestamp, ThriftClientState clientState, List<ByteBuffer> variables, CFMetaData metadata)
+    public Mutation mutationForKey(ByteBuffer key, String keyspace, Long timestamp, ThriftClientState clientState, List<ByteBuffer> variables, CFMetaData metadata)
     throws InvalidRequestException
     {
-        RowMutation rm = new RowMutation(keyspace, key);
+        Mutation mutation = new Mutation(keyspace, key);
 
         QueryProcessor.validateKeyAlias(metadata, keyName);
 
         if (columns.size() < 1)
         {
-            // No columns, delete the row
-            rm.delete(columnFamily, (timestamp == null) ? getTimestamp(clientState) : timestamp);
+            // No columns, delete the partition
+            mutation.delete(columnFamily, (timestamp == null) ? getTimestamp(clientState) : timestamp);
         }
         else
         {
             // Delete specific columns
+            AbstractType<?> at = metadata.comparator.asAbstractType();
             for (Term column : columns)
             {
-                ByteBuffer columnName = column.getByteBuffer(metadata.comparator, variables);
+                CellName columnName = metadata.comparator.cellFromByteBuffer(column.getByteBuffer(at, variables));
                 validateColumnName(columnName);
-                rm.delete(columnFamily, columnName, (timestamp == null) ? getTimestamp(clientState) : timestamp);
+                mutation.delete(columnFamily, columnName, (timestamp == null) ? getTimestamp(clientState) : timestamp);
             }
         }
 
-        return rm;
+        return mutation;
     }
 
     public String toString()
diff --git a/src/java/org/apache/cassandra/cql/DropIndexStatement.java b/src/java/org/apache/cassandra/cql/DropIndexStatement.java
index bc9bbbc..1a24b7e 100644
--- a/src/java/org/apache/cassandra/cql/DropIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql/DropIndexStatement.java
@@ -49,7 +49,7 @@
     {
         ColumnDefinition column = findIndexedColumn(cfm);
         assert column != null;
-        CFMetaData cloned = cfm.clone();
+        CFMetaData cloned = cfm.copy();
         ColumnDefinition toChange = cloned.getColumnDefinition(column.name);
         assert toChange.getIndexName() != null && toChange.getIndexName().equals(indexName);
         toChange.setIndexName(null);
diff --git a/src/java/org/apache/cassandra/cql/QueryProcessor.java b/src/java/org/apache/cassandra/cql/QueryProcessor.java
index f160374..3c1d555 100644
--- a/src/java/org/apache/cassandra/cql/QueryProcessor.java
+++ b/src/java/org/apache/cassandra/cql/QueryProcessor.java
@@ -33,8 +33,11 @@
 import org.apache.cassandra.cql.hooks.PostPreparationHook;
 import org.apache.cassandra.cql.hooks.PreExecutionHook;
 import org.apache.cassandra.cql.hooks.PreparationContext;
-import org.apache.cassandra.db.CounterColumn;
+import org.apache.cassandra.db.CounterCell;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
@@ -53,9 +56,6 @@
 import org.apache.cassandra.thrift.CqlResultType;
 import org.apache.cassandra.thrift.CqlRow;
 import org.apache.cassandra.thrift.CqlPreparedResult;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.thrift.ThriftClientState;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -106,7 +106,7 @@
         // ...of a list of column names
         if (!select.isColumnRange())
         {
-            SortedSet<ByteBuffer> columnNames = getColumnNames(select, metadata, variables);
+            SortedSet<CellName> columnNames = getColumnNames(select, metadata, variables);
             validateColumnNames(columnNames);
 
             for (Term rawKey: select.getKeys())
@@ -120,9 +120,9 @@
         // ...a range (slice) of column names
         else
         {
-            AbstractType<?> comparator = select.getComparator(metadata.ksName);
-            ByteBuffer start = select.getColumnStart().getByteBuffer(comparator,variables);
-            ByteBuffer finish = select.getColumnFinish().getByteBuffer(comparator,variables);
+            AbstractType<?> at = metadata.comparator.asAbstractType();
+            Composite start = metadata.comparator.fromByteBuffer(select.getColumnStart().getByteBuffer(at,variables));
+            Composite finish = metadata.comparator.fromByteBuffer(select.getColumnFinish().getByteBuffer(at,variables));
 
             for (Term rawKey : select.getKeys())
             {
@@ -141,17 +141,17 @@
         return StorageProxy.read(commands, select.getConsistencyLevel());
     }
 
-    private static SortedSet<ByteBuffer> getColumnNames(SelectStatement select, CFMetaData metadata, List<ByteBuffer> variables)
+    private static SortedSet<CellName> getColumnNames(SelectStatement select, CFMetaData metadata, List<ByteBuffer> variables)
     throws InvalidRequestException
     {
         String keyString = metadata.getCQL2KeyName();
         List<Term> selectColumnNames = select.getColumnNames();
-        SortedSet<ByteBuffer> columnNames = new TreeSet<ByteBuffer>(metadata.comparator);
+        SortedSet<CellName> columnNames = new TreeSet<>(metadata.comparator);
         for (Term column : selectColumnNames)
         {
             // skip the key for the slice op; we'll add it to the resultset in extractThriftColumns
             if (!column.getText().equalsIgnoreCase(keyString))
-                columnNames.add(column.getByteBuffer(metadata.comparator,variables));
+                columnNames.add(metadata.comparator.cellFromByteBuffer(column.getByteBuffer(metadata.comparator.asAbstractType(),variables)));
         }
         return columnNames;
     }
@@ -171,7 +171,7 @@
                                     ? select.getKeyFinish().getByteBuffer(keyType,variables)
                                     : null;
 
-        RowPosition startKey = RowPosition.forKey(startKeyBytes, p), finishKey = RowPosition.forKey(finishKeyBytes, p);
+        RowPosition startKey = RowPosition.ForKey.get(startKeyBytes, p), finishKey = RowPosition.ForKey.get(finishKeyBytes, p);
         if (startKey.compareTo(finishKey) > 0 && !finishKey.isMinimum(p))
         {
             if (p instanceof RandomPartitioner)
@@ -189,11 +189,11 @@
         for (Relation columnRelation : columnRelations)
         {
             // Left and right side of relational expression encoded according to comparator/validator.
-            ByteBuffer entity = columnRelation.getEntity().getByteBuffer(metadata.comparator, variables);
-            ByteBuffer value = columnRelation.getValue().getByteBuffer(metadata.getValueValidatorFromColumnName(entity), variables);
+            ByteBuffer entity = columnRelation.getEntity().getByteBuffer(metadata.comparator.asAbstractType(), variables);
+            ByteBuffer value = columnRelation.getValue().getByteBuffer(metadata.getValueValidator(metadata.comparator.cellFromByteBuffer(entity)), variables);
 
             expressions.add(new IndexExpression(entity,
-                                                IndexOperator.valueOf(columnRelation.operator().toString()),
+                                                IndexExpression.Operator.valueOf(columnRelation.operator().toString()),
                                                 value));
         }
 
@@ -213,7 +213,7 @@
         // if start key was set and relation was "greater than"
         if (select.getKeyStart() != null && !select.includeStartKey() && !rows.isEmpty())
         {
-            if (rows.get(0).key.key.equals(startKeyBytes))
+            if (rows.get(0).key.getKey().equals(startKeyBytes))
                 rows.remove(0);
         }
 
@@ -221,7 +221,7 @@
         if (select.getKeyFinish() != null && !select.includeFinishKey() && !rows.isEmpty())
         {
             int lastIndex = rows.size() - 1;
-            if (rows.get(lastIndex).key.key.equals(finishKeyBytes))
+            if (rows.get(lastIndex).key.getKey().equals(finishKeyBytes))
                 rows.remove(lastIndex);
         }
 
@@ -233,8 +233,9 @@
     {
         if (select.isColumnRange() || select.getColumnNames().size() == 0)
         {
-            return new SliceQueryFilter(select.getColumnStart().getByteBuffer(metadata.comparator, variables),
-                                        select.getColumnFinish().getByteBuffer(metadata.comparator, variables),
+            AbstractType<?> comparator = metadata.comparator.asAbstractType();
+            return new SliceQueryFilter(metadata.comparator.fromByteBuffer(select.getColumnStart().getByteBuffer(comparator, variables)),
+                                        metadata.comparator.fromByteBuffer(select.getColumnFinish().getByteBuffer(comparator, variables)),
                                         select.isColumnsReversed(),
                                         select.getColumnsLimit());
         }
@@ -267,12 +268,14 @@
 
         if (select.getColumnRelations().size() > 0)
         {
-            AbstractType<?> comparator = select.getComparator(keyspace);
-            SecondaryIndexManager idxManager = Keyspace.open(keyspace).getColumnFamilyStore(select.getColumnFamily()).indexManager;
+            ColumnFamilyStore cfstore = Keyspace.open(keyspace).getColumnFamilyStore(select.getColumnFamily());
+            CellNameType comparator = cfstore.metadata.comparator;
+            AbstractType<?> at = comparator.asAbstractType();
+            SecondaryIndexManager idxManager = cfstore.indexManager;
             for (Relation relation : select.getColumnRelations())
             {
-                ByteBuffer name = relation.getEntity().getByteBuffer(comparator, variables);
-                if ((relation.operator() == RelationType.EQ) && idxManager.indexes(name))
+                ByteBuffer name = relation.getEntity().getByteBuffer(at, variables);
+                if ((relation.operator() == RelationType.EQ) && idxManager.indexes(comparator.cellFromByteBuffer(name)))
                     return;
             }
             throw new InvalidRequestException("No indexed columns present in by-columns clause with \"equals\" operator");
@@ -302,31 +305,31 @@
             throw new InvalidRequestException(String.format("Expected key '%s' to be present in WHERE clause for '%s'", realKeyAlias, cfm.cfName));
     }
 
-    private static void validateColumnNames(Iterable<ByteBuffer> columns)
+    private static void validateColumnNames(Iterable<CellName> columns)
     throws InvalidRequestException
     {
-        for (ByteBuffer name : columns)
+        for (CellName name : columns)
         {
-            if (name.remaining() > org.apache.cassandra.db.Column.MAX_NAME_LENGTH)
+            if (name.dataSize() > org.apache.cassandra.db.Cell.MAX_NAME_LENGTH)
                 throw new InvalidRequestException(String.format("column name is too long (%s > %s)",
-                                                                name.remaining(),
-                                                                org.apache.cassandra.db.Column.MAX_NAME_LENGTH));
-            if (name.remaining() == 0)
+                                                                name.dataSize(),
+                                                                org.apache.cassandra.db.Cell.MAX_NAME_LENGTH));
+            if (name.isEmpty())
                 throw new InvalidRequestException("zero-length column name");
         }
     }
 
-    public static void validateColumnName(ByteBuffer column)
+    public static void validateColumnName(CellName column)
     throws InvalidRequestException
     {
         validateColumnNames(Arrays.asList(column));
     }
 
-    public static void validateColumn(CFMetaData metadata, ByteBuffer name, ByteBuffer value)
+    public static void validateColumn(CFMetaData metadata, CellName name, ByteBuffer value)
     throws InvalidRequestException
     {
         validateColumnName(name);
-        AbstractType<?> validator = metadata.getValueValidatorFromColumnName(name);
+        AbstractType<?> validator = metadata.getValueValidator(name);
 
         try
         {
@@ -336,7 +339,7 @@
         catch (MarshalException me)
         {
             throw new InvalidRequestException(String.format("Invalid column value for column (name=%s); %s",
-                                                            ByteBufferUtil.bytesToHex(name),
+                                                            ByteBufferUtil.bytesToHex(name.toByteBuffer()),
                                                             me.getMessage()));
         }
     }
@@ -356,12 +359,12 @@
         validateSliceFilter(metadata, range.start(), range.finish(), range.reversed);
     }
 
-    private static void validateSliceFilter(CFMetaData metadata, ByteBuffer start, ByteBuffer finish, boolean reversed)
+    private static void validateSliceFilter(CFMetaData metadata, Composite start, Composite finish, boolean reversed)
     throws InvalidRequestException
     {
-        AbstractType<?> comparator = metadata.comparator;
-        Comparator<ByteBuffer> orderedComparator = reversed ? comparator.reverseComparator: comparator;
-        if (start.remaining() > 0 && finish.remaining() > 0 && orderedComparator.compare(start, finish) > 0)
+        CellNameType comparator = metadata.comparator;
+        Comparator<Composite> orderedComparator = reversed ? comparator.reverseComparator(): comparator;
+        if (!start.isEmpty() && !finish.isEmpty() && orderedComparator.compare(start, finish) > 0)
             throw new InvalidRequestException("range finish must come after start in traversal order");
     }
 
@@ -444,7 +447,7 @@
                 // otherwise create resultset from query results
                 result.schema = new CqlMetadata(new HashMap<ByteBuffer, String>(),
                                                 new HashMap<ByteBuffer, String>(),
-                                                TypeParser.getShortName(metadata.comparator),
+                                                TypeParser.getShortName(metadata.comparator.asAbstractType()),
                                                 TypeParser.getShortName(metadata.getDefaultValidator()));
                 List<CqlRow> cqlRows = new ArrayList<CqlRow>(rows.size());
                 for (org.apache.cassandra.db.Row row : rows)
@@ -456,7 +459,7 @@
                         {
                             // prepend key
                             ByteBuffer keyName = ByteBufferUtil.bytes(metadata.getCQL2KeyName());
-                            thriftColumns.add(new Column(keyName).setValue(row.key.key).setTimestamp(-1));
+                            thriftColumns.add(new Column(keyName).setValue(row.key.getKey()).setTimestamp(-1));
                             result.schema.name_types.put(keyName, TypeParser.getShortName(AsciiType.instance));
                             result.schema.value_types.put(keyName, TypeParser.getShortName(metadata.getKeyValidator()));
                         }
@@ -464,14 +467,14 @@
                         // preserve comparator order
                         if (row.cf != null)
                         {
-                            for (org.apache.cassandra.db.Column c : row.cf.getSortedColumns())
+                            for (org.apache.cassandra.db.Cell c : row.cf.getSortedColumns())
                             {
-                                if (c.isMarkedForDelete(now))
+                                if (!c.isLive(now))
                                     continue;
 
-                                ColumnDefinition cd = metadata.getColumnDefinitionFromColumnName(c.name());
+                                ColumnDefinition cd = metadata.getColumnDefinition(c.name());
                                 if (cd != null)
-                                    result.schema.value_types.put(c.name(), TypeParser.getShortName(cd.getValidator()));
+                                    result.schema.value_types.put(c.name().toByteBuffer(), TypeParser.getShortName(cd.type));
 
                                 thriftColumns.add(thriftify(c));
                             }
@@ -488,7 +491,7 @@
                             {
                                 // preserve case of key as it was requested
                                 ByteBuffer requestedKey = ByteBufferUtil.bytes(term.getText());
-                                thriftColumns.add(new Column(requestedKey).setValue(row.key.key).setTimestamp(-1));
+                                thriftColumns.add(new Column(requestedKey).setValue(row.key.getKey()).setTimestamp(-1));
                                 result.schema.name_types.put(requestedKey, TypeParser.getShortName(AsciiType.instance));
                                 result.schema.value_types.put(requestedKey, TypeParser.getShortName(metadata.getKeyValidator()));
                                 continue;
@@ -497,22 +500,23 @@
                             if (row.cf == null)
                                 continue;
 
-                            ByteBuffer name;
+                            ByteBuffer nameBytes;
                             try
                             {
-                                name = term.getByteBuffer(metadata.comparator, variables);
+                                nameBytes = term.getByteBuffer(metadata.comparator.asAbstractType(), variables);
                             }
                             catch (InvalidRequestException e)
                             {
                                 throw new AssertionError(e);
                             }
 
-                            ColumnDefinition cd = metadata.getColumnDefinitionFromColumnName(name);
+                            CellName name = metadata.comparator.cellFromByteBuffer(nameBytes);
+                            ColumnDefinition cd = metadata.getColumnDefinition(name);
                             if (cd != null)
-                                result.schema.value_types.put(name, TypeParser.getShortName(cd.getValidator()));
-                            org.apache.cassandra.db.Column c = row.cf.getColumn(name);
-                            if (c == null || c.isMarkedForDelete(now))
-                                thriftColumns.add(new Column().setName(name));
+                                result.schema.value_types.put(nameBytes, TypeParser.getShortName(cd.type));
+                            org.apache.cassandra.db.Cell c = row.cf.getColumn(name);
+                            if (c == null || !c.isLive())
+                                thriftColumns.add(new Column().setName(nameBytes));
                             else
                                 thriftColumns.add(thriftify(c));
                         }
@@ -520,7 +524,7 @@
 
                     // Create a new row, add the columns to it, and then add it to the list of rows
                     CqlRow cqlRow = new CqlRow();
-                    cqlRow.key = row.key.key;
+                    cqlRow.key = row.key.getKey();
                     cqlRow.columns = thriftColumns;
                     if (select.isColumnsReversed())
                         Collections.reverse(cqlRow.columns);
@@ -677,15 +681,15 @@
                 boolean columnExists = false;
                 ByteBuffer columnName = createIdx.getColumnName().getByteBuffer();
                 // mutating oldCfm directly would be bad, but mutating a copy is fine.
-                CFMetaData cfm = oldCfm.clone();
+                CFMetaData cfm = oldCfm.copy();
                 for (ColumnDefinition cd : cfm.regularColumns())
                 {
-                    if (cd.name.equals(columnName))
+                    if (cd.name.bytes.equals(columnName))
                     {
                         if (cd.getIndexType() != null)
                             throw new InvalidRequestException("Index already exists");
                         if (logger.isDebugEnabled())
-                            logger.debug("Updating column {} definition for index {}", cfm.comparator.getString(columnName), createIdx.getIndexName());
+                            logger.debug("Updating column {} definition for index {}", cfm.comparator.getString(cfm.comparator.fromByteBuffer(columnName)), createIdx.getIndexName());
                         cd.setIndexType(IndexType.KEYS, Collections.<String, String>emptyMap());
                         cd.setIndexName(createIdx.getIndexName());
                         columnExists = true;
@@ -693,7 +697,7 @@
                     }
                 }
                 if (!columnExists)
-                    throw new InvalidRequestException("No column definition found for column " + oldCfm.comparator.getString(columnName));
+                    throw new InvalidRequestException("No column definition found for column " + oldCfm.comparator.getString(cfm.comparator.fromByteBuffer(columnName)));
 
                 try
                 {
@@ -850,12 +854,12 @@
         return cql.hashCode();
     }
 
-    private static Column thriftify(org.apache.cassandra.db.Column c)
+    private static Column thriftify(org.apache.cassandra.db.Cell c)
     {
-        ByteBuffer value = (c instanceof CounterColumn)
+        ByteBuffer value = (c instanceof CounterCell)
                            ? ByteBufferUtil.bytes(CounterContext.instance().total(c.value()))
                            : c.value();
-        return new Column(c.name()).setValue(value).setTimestamp(c.timestamp());
+        return new Column(c.name().toByteBuffer()).setValue(value).setTimestamp(c.timestamp());
     }
 
     private static CQLStatement getStatement(String queryStr) throws SyntaxException
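
Most of the QueryProcessor edits above apply the same translation between raw ByteBuffer column names (CQL2/Thrift) and the 2.1 CellName/Composite API. A condensed reference sketch of those conversions, assuming a CFMetaData metadata and raw name/bound buffers in scope (fragment for illustration only):

    // Old-style byte encoding/decoding goes through the comparator's AbstractType view.
    AbstractType<?> at = metadata.comparator.asAbstractType();

    // Raw bytes -> typed names: cells become CellName, slice bounds become Composite.
    CellName cell = metadata.comparator.cellFromByteBuffer(rawNameBytes);
    Composite bound = metadata.comparator.fromByteBuffer(rawBoundBytes);

    // Typed name -> raw bytes, e.g. for Thrift columns or error messages.
    ByteBuffer raw = cell.toByteBuffer();

    // Per-cell metadata lookups now take the typed name directly.
    ColumnDefinition cd = metadata.getColumnDefinition(cell);
    AbstractType<?> validator = metadata.getValueValidator(cell);
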
diff --git a/src/java/org/apache/cassandra/cql/SelectStatement.java b/src/java/org/apache/cassandra/cql/SelectStatement.java
index 7dd5592..b4ccd56 100644
--- a/src/java/org/apache/cassandra/cql/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql/SelectStatement.java
@@ -21,8 +21,6 @@
 import java.util.Set;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.ConsistencyLevel;
 
 /**
@@ -171,11 +169,6 @@
         clause.extractKeysFromColumns(cfm);
     }
 
-    public AbstractType<?> getComparator(String keyspace)
-    {
-        return Schema.instance.getComparator(keyspace, columnFamily);
-    }
-
     public List<Relation> getClauseRelations()
     {
         return clause.getClauseRelations();
diff --git a/src/java/org/apache/cassandra/cql/UpdateStatement.java b/src/java/org/apache/cassandra/cql/UpdateStatement.java
index f99f5c2..16a0d76 100644
--- a/src/java/org/apache/cassandra/cql/UpdateStatement.java
+++ b/src/java/org/apache/cassandra/cql/UpdateStatement.java
@@ -23,10 +23,9 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.CounterMutation;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.db.IMutation;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
@@ -149,18 +148,16 @@
 
         clientState.hasColumnFamilyAccess(keyspace, columnFamily, Permission.MODIFY);
 
-        List<IMutation> rowMutations = new LinkedList<IMutation>();
+        List<IMutation> mutations = new LinkedList<>();
 
         for (Term key: keys)
-        {
-            rowMutations.add(mutationForKey(keyspace, key.getByteBuffer(getKeyType(keyspace),variables), metadata, timestamp, clientState, variables));
-        }
+            mutations.add(mutationForKey(keyspace, key.getByteBuffer(getKeyType(keyspace),variables), metadata, timestamp, clientState, variables));
 
-        return rowMutations;
+        return mutations;
     }
 
     /**
-     * Compute a row mutation for a single key
+     * Compute a mutation for a single key
      *
      *
      * @param keyspace working keyspace
@@ -169,7 +166,7 @@
      * @param timestamp global timestamp to use for every key mutation
      *
      * @param clientState
-     * @return row mutation
+     * @return mutation
      *
      * @throws InvalidRequestException on the wrong request
      */
@@ -177,15 +174,16 @@
     throws InvalidRequestException
     {
         validateKey(key);
-        AbstractType<?> comparator = getComparator(keyspace);
+        CellNameType comparator = metadata.comparator;
+        AbstractType<?> at = comparator.asAbstractType();
 
-        // if true we need to wrap RowMutation into CounterMutation
+        // if true we need to wrap Mutation into CounterMutation
         boolean hasCounterColumn = false;
-        RowMutation rm = new RowMutation(keyspace, key);
+        Mutation mutation = new Mutation(keyspace, key);
 
         for (Map.Entry<Term, Operation> column : getColumns().entrySet())
         {
-            ByteBuffer colName = column.getKey().getByteBuffer(comparator, variables);
+            CellName colName = comparator.cellFromByteBuffer(column.getKey().getByteBuffer(at, variables));
             Operation op = column.getValue();
 
             if (op.isUnary())
@@ -193,14 +191,14 @@
                 if (hasCounterColumn)
                     throw new InvalidRequestException("Mix of commutative and non-commutative operations is not allowed.");
 
-                ByteBuffer colValue = op.a.getByteBuffer(metadata.getValueValidatorFromColumnName(colName),variables);
+                ByteBuffer colValue = op.a.getByteBuffer(metadata.getValueValidator(colName),variables);
 
                 validateColumn(metadata, colName, colValue);
-                rm.add(columnFamily,
-                       colName,
-                       colValue,
-                       (timestamp == null) ? getTimestamp(clientState) : timestamp,
-                       getTimeToLive());
+                mutation.add(columnFamily,
+                             colName,
+                             colValue,
+                             (timestamp == null) ? getTimestamp(clientState) : timestamp,
+                             getTimeToLive());
             }
             else
             {
@@ -221,11 +219,11 @@
                                                       op.b.getText()));
                 }
 
-                rm.addCounter(columnFamily, colName, value);
+                mutation.addCounter(columnFamily, colName, value);
             }
         }
 
-        return (hasCounterColumn) ? new CounterMutation(rm, getConsistencyLevel()) : rm;
+        return (hasCounterColumn) ? new CounterMutation(mutation, getConsistencyLevel()) : mutation;
     }
 
     public String getColumnFamily()
@@ -277,11 +275,6 @@
         return Schema.instance.getCFMetaData(keyspace, columnFamily).getKeyValidator();
     }
 
-    public AbstractType<?> getComparator(String keyspace)
-    {
-        return Schema.instance.getComparator(keyspace, columnFamily);
-    }
-
     public List<Term> getColumnNames()
     {
         return columnNames;
diff --git a/src/java/org/apache/cassandra/cql3/AbstractMarker.java b/src/java/org/apache/cassandra/cql3/AbstractMarker.java
index 4329ed9..10a4dff 100644
--- a/src/java/org/apache/cassandra/cql3/AbstractMarker.java
+++ b/src/java/org/apache/cassandra/cql3/AbstractMarker.java
@@ -57,7 +57,7 @@
             this.bindIndex = bindIndex;
         }
 
-        public AbstractMarker prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof CollectionType))
                 return new Constants.Marker(bindIndex, receiver);
@@ -71,7 +71,7 @@
             throw new AssertionError();
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             return true;
         }
@@ -103,11 +103,8 @@
         }
 
         @Override
-        public AbstractMarker prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            if (receiver.type instanceof CollectionType)
-                throw new InvalidRequestException("Collection columns do not support IN relations");
-
             return new Lists.Marker(bindIndex, makeInReceiver(receiver));
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/AssignementTestable.java b/src/java/org/apache/cassandra/cql3/AssignementTestable.java
index d707809..2253cf7 100644
--- a/src/java/org/apache/cassandra/cql3/AssignementTestable.java
+++ b/src/java/org/apache/cassandra/cql3/AssignementTestable.java
@@ -17,10 +17,12 @@
  */
 package org.apache.cassandra.cql3;
 
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
 public interface AssignementTestable
 {
     /**
      * @return whether this object can be assigned to the provided receiver
      */
-    public boolean isAssignableTo(ColumnSpecification receiver);
+    public boolean isAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException;
 }
diff --git a/src/java/org/apache/cassandra/cql3/Attributes.java b/src/java/org/apache/cassandra/cql3/Attributes.java
index a92cc80..435757b 100644
--- a/src/java/org/apache/cassandra/cql3/Attributes.java
+++ b/src/java/org/apache/cassandra/cql3/Attributes.java
@@ -20,7 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import org.apache.cassandra.db.ExpiringColumn;
+import org.apache.cassandra.db.ExpiringCell;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -56,12 +56,12 @@
         return timeToLive != null;
     }
 
-    public long getTimestamp(long now, List<ByteBuffer> variables) throws InvalidRequestException
+    public long getTimestamp(long now, QueryOptions options) throws InvalidRequestException
     {
         if (timestamp == null)
             return now;
 
-        ByteBuffer tval = timestamp.bindAndGet(variables);
+        ByteBuffer tval = timestamp.bindAndGet(options);
         if (tval == null)
             throw new InvalidRequestException("Invalid null value of timestamp");
 
@@ -77,12 +77,12 @@
         return LongType.instance.compose(tval);
     }
 
-    public int getTimeToLive(List<ByteBuffer> variables) throws InvalidRequestException
+    public int getTimeToLive(QueryOptions options) throws InvalidRequestException
     {
         if (timeToLive == null)
             return 0;
 
-        ByteBuffer tval = timeToLive.bindAndGet(variables);
+        ByteBuffer tval = timeToLive.bindAndGet(options);
         if (tval == null)
             throw new InvalidRequestException("Invalid null value of TTL");
 
@@ -99,8 +99,8 @@
         if (ttl < 0)
             throw new InvalidRequestException("A TTL must be greater or equal to 0");
 
-        if (ttl > ExpiringColumn.MAX_TTL)
-            throw new InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", ttl, ExpiringColumn.MAX_TTL));
+        if (ttl > ExpiringCell.MAX_TTL)
+            throw new InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", ttl, ExpiringCell.MAX_TTL));
 
         return ttl;
     }
@@ -120,8 +120,8 @@
 
         public Attributes prepare(String ksName, String cfName) throws InvalidRequestException
         {
-            Term ts = timestamp == null ? null : timestamp.prepare(timestampReceiver(ksName, cfName));
-            Term ttl = timeToLive == null ? null : timeToLive.prepare(timeToLiveReceiver(ksName, cfName));
+            Term ts = timestamp == null ? null : timestamp.prepare(ksName, timestampReceiver(ksName, cfName));
+            Term ttl = timeToLive == null ? null : timeToLive.prepare(ksName, timeToLiveReceiver(ksName, cfName));
             return new Attributes(ts, ttl);
         }
 
diff --git a/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java b/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
index cbf5e92..2bb8071 100644
--- a/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
@@ -18,38 +18,95 @@
 package org.apache.cassandra.cql3;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.service.pager.PagingState;
 
-/**
- * Options for a batch (at the protocol level) queries.
- */
-public class BatchQueryOptions
+public abstract class BatchQueryOptions
 {
-    private final ConsistencyLevel consistency;
-    private final List<List<ByteBuffer>> values;
+    public static BatchQueryOptions DEFAULT = withoutPerStatementVariables(QueryOptions.DEFAULT);
+
+    protected final QueryOptions wrapped;
     private final List<Object> queryOrIdList;
 
-    public BatchQueryOptions(ConsistencyLevel cl, List<List<ByteBuffer>> values, List<Object> queryOrIdList)
+    protected BatchQueryOptions(QueryOptions wrapped, List<Object> queryOrIdList)
     {
-        this.consistency = cl;
-        this.values = values;
+        this.wrapped = wrapped;
         this.queryOrIdList = queryOrIdList;
     }
 
-    public ConsistencyLevel getConsistency()
+    public static BatchQueryOptions withoutPerStatementVariables(QueryOptions options)
     {
-        return consistency;
+        return new WithoutPerStatementVariables(options, Collections.<Object>emptyList());
     }
 
-    public List<List<ByteBuffer>> getValues()
+    public static BatchQueryOptions withPerStatementVariables(QueryOptions options, List<List<ByteBuffer>> variables, List<Object> queryOrIdList)
     {
-        return values;
+        return new WithPerStatementVariables(options, variables, queryOrIdList);
+    }
+
+    public abstract QueryOptions forStatement(int i);
+
+    public ConsistencyLevel getConsistency()
+    {
+        return wrapped.getConsistency();
+    }
+
+    public ConsistencyLevel getSerialConsistency()
+    {
+        return wrapped.getSerialConsistency();
     }
 
     public List<Object> getQueryOrIdList()
     {
         return queryOrIdList;
     }
+
+    public long getTimestamp(QueryState state)
+    {
+        return wrapped.getTimestamp(state);
+    }
+
+    private static class WithoutPerStatementVariables extends BatchQueryOptions
+    {
+        private WithoutPerStatementVariables(QueryOptions wrapped, List<Object> queryOrIdList)
+        {
+            super(wrapped, queryOrIdList);
+        }
+
+        public QueryOptions forStatement(int i)
+        {
+            return wrapped;
+        }
+    }
+
+    private static class WithPerStatementVariables extends BatchQueryOptions
+    {
+        private final List<QueryOptions> perStatementOptions;
+
+        private WithPerStatementVariables(QueryOptions wrapped, List<List<ByteBuffer>> variables, List<Object> queryOrIdList)
+        {
+            super(wrapped, queryOrIdList);
+            this.perStatementOptions = new ArrayList<>(variables.size());
+            for (final List<ByteBuffer> vars : variables)
+            {
+                perStatementOptions.add(new QueryOptions.QueryOptionsWrapper(wrapped)
+                {
+                    public List<ByteBuffer> getValues()
+                    {
+                        return vars;
+                    }
+                });
+            }
+        }
+
+        public QueryOptions forStatement(int i)
+        {
+            return perStatementOptions.get(i);
+        }
+    }
 }
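A minimal usage sketch (not part of this patch) of the per-statement options introduced above, assuming only the factory methods and the QueryOptions.DEFAULT constant referenced in this file; the statement strings and byte values are placeholders.

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    import org.apache.cassandra.cql3.BatchQueryOptions;
    import org.apache.cassandra.cql3.QueryOptions;

    public class BatchQueryOptionsSketch
    {
        public static void main(String[] args)
        {
            // One list of bound values per statement in the batch (placeholder bytes).
            List<List<ByteBuffer>> values = Arrays.asList(
                Collections.singletonList(ByteBuffer.wrap(new byte[]{ 1 })),
                Collections.singletonList(ByteBuffer.wrap(new byte[]{ 2 })));
            List<Object> queryOrIdList = Arrays.<Object>asList("stmt-0", "stmt-1");

            BatchQueryOptions batch =
                BatchQueryOptions.withPerStatementVariables(QueryOptions.DEFAULT, values, queryOrIdList);

            for (int i = 0; i < queryOrIdList.size(); i++)
            {
                // forStatement(i) returns a view whose getValues() yields that statement's
                // own variables, while consistency and timestamp come from the shared options.
                List<ByteBuffer> perStatementVars = batch.forStatement(i).getValues();
            }
        }
    }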
diff --git a/src/java/org/apache/cassandra/cql3/CFDefinition.java b/src/java/org/apache/cassandra/cql3/CFDefinition.java
deleted file mode 100644
index 23bedaf..0000000
--- a/src/java/org/apache/cassandra/cql3/CFDefinition.java
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Objects;
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-/**
- * Holds metadata on a CF preprocessed for use by CQL queries.
- */
-public class CFDefinition implements Iterable<CFDefinition.Name>
-{
-    public static final AbstractType<?> definitionType = UTF8Type.instance;
-
-    // Keep static and regular columns lexicographically ordered so that wildcard expansion have a deterministic order
-    private static final Comparator<ColumnIdentifier> identifierComparator = new Comparator<ColumnIdentifier>()
-    {
-        public int compare(ColumnIdentifier id1, ColumnIdentifier id2)
-        {
-            return ByteBufferUtil.compareUnsigned(id1.key, id2.key);
-        }
-    };
-
-    public final CFMetaData cfm;
-    // LinkedHashMap because the order does matter (it is the order in the composite type)
-    private final LinkedHashMap<ColumnIdentifier, Name> partitionKeys = new LinkedHashMap<ColumnIdentifier, Name>();
-    private final LinkedHashMap<ColumnIdentifier, Name> clusteringColumns = new LinkedHashMap<ColumnIdentifier, Name>();
-    private final Name compactValue;
-    // Keep metadata lexicographically ordered so that wildcard expansion have a deterministic order
-    private final Map<ColumnIdentifier, Name> staticColumns = new TreeMap<ColumnIdentifier, Name>(identifierComparator);
-    private final Map<ColumnIdentifier, Name> regularColumns = new TreeMap<ColumnIdentifier, Name>(identifierComparator);
-
-    public final boolean isComposite;
-    public final boolean hasCompositeKey;
-    // Note that isCompact means here that no componet of the comparator correspond to the column names
-    // defined in the CREATE TABLE QUERY. This is not exactly equivalent to the 'WITH COMPACT STORAGE'
-    // option when creating a table in that "static CF" without a composite type will have isCompact == false
-    // even though one must use 'WITH COMPACT STORAGE' to declare them.
-    public final boolean isCompact;
-    public final boolean hasCollections;
-
-    public CFDefinition(CFMetaData cfm)
-    {
-        this.cfm = cfm;
-
-        this.hasCompositeKey = cfm.getKeyValidator() instanceof CompositeType;
-        for (int i = 0; i < cfm.partitionKeyColumns().size(); ++i)
-        {
-            ColumnIdentifier id = new ColumnIdentifier(cfm.partitionKeyColumns().get(i).name, definitionType);
-            this.partitionKeys.put(id, new Name(cfm.ksName, cfm.cfName, id, Name.Kind.KEY_ALIAS, i, cfm.getKeyValidator().getComponents().get(i)));
-        }
-
-        this.isComposite = cfm.comparator instanceof CompositeType;
-        this.hasCollections = cfm.comparator.getComponents().get(cfm.comparator.componentsCount() - 1) instanceof ColumnToCollectionType;
-        this.isCompact = cfm.clusteringKeyColumns().size() == cfm.comparator.componentsCount();
-        for (int i = 0; i < cfm.clusteringKeyColumns().size(); ++i)
-        {
-            ColumnIdentifier id = new ColumnIdentifier(cfm.clusteringKeyColumns().get(i).name, definitionType);
-            this.clusteringColumns.put(id, new Name(cfm.ksName, cfm.cfName, id, Name.Kind.COLUMN_ALIAS, i, cfm.comparator.getComponents().get(i)));
-        }
-
-        if (isCompact)
-        {
-            this.compactValue = createValue(cfm);
-        }
-        else
-        {
-            this.compactValue = null;
-            for (ColumnDefinition def : cfm.regularColumns())
-            {
-                ColumnIdentifier id = new ColumnIdentifier(def.name, cfm.getColumnDefinitionComparator(def));
-                this.regularColumns.put(id, new Name(cfm.ksName, cfm.cfName, id, Name.Kind.COLUMN_METADATA, def.getValidator()));
-            }
-            for (ColumnDefinition def : cfm.staticColumns())
-            {
-                ColumnIdentifier id = new ColumnIdentifier(def.name, cfm.getColumnDefinitionComparator(def));
-                this.staticColumns.put(id, new Name(cfm.ksName, cfm.cfName, id, Name.Kind.STATIC, def.getValidator()));
-            }
-        }
-    }
-
-    public ColumnToCollectionType getCollectionType()
-    {
-        if (!hasCollections)
-            return null;
-
-        CompositeType composite = (CompositeType)cfm.comparator;
-        return (ColumnToCollectionType)composite.types.get(composite.types.size() - 1);
-    }
-
-    private static Name createValue(CFMetaData cfm)
-    {
-        ColumnIdentifier alias = new ColumnIdentifier(cfm.compactValueColumn().name, definitionType);
-        // That's how we distinguish between 'no value alias because coming from thrift' and 'I explicitely did not
-        // define a value' (see CreateTableStatement)
-        return alias.key.equals(ByteBufferUtil.EMPTY_BYTE_BUFFER)
-               ? null
-               : new Name(cfm.ksName, cfm.cfName, alias, Name.Kind.VALUE_ALIAS, cfm.getDefaultValidator());
-    }
-
-    public int partitionKeyCount()
-    {
-        return partitionKeys.size();
-    }
-
-    public Collection<Name> partitionKeys()
-    {
-        return partitionKeys.values();
-    }
-
-    public int clusteringColumnsCount()
-    {
-        return clusteringColumns.size();
-    }
-
-    public Collection<Name> clusteringColumns()
-    {
-        return clusteringColumns.values();
-    }
-
-    public Collection<Name> regularColumns()
-    {
-        return regularColumns.values();
-    }
-
-    public Collection<Name> staticColumns()
-    {
-        return staticColumns.values();
-    }
-
-    public Name compactValue()
-    {
-        return compactValue;
-    }
-
-    public Name get(ColumnIdentifier name)
-    {
-        CFDefinition.Name def = partitionKeys.get(name);
-        if (def != null)
-            return def;
-        if (compactValue != null && name.equals(compactValue.name))
-            return compactValue;
-        def = clusteringColumns.get(name);
-        if (def != null)
-            return def;
-        def = regularColumns.get(name);
-        if (def != null)
-            return def;
-        return staticColumns.get(name);
-    }
-
-    public Iterator<Name> iterator()
-    {
-        return new AbstractIterator<Name>()
-        {
-            private final Iterator<Name> keyIter = partitionKeys.values().iterator();
-            private final Iterator<Name> clusteringIter = clusteringColumns.values().iterator();
-            private boolean valueDone;
-            private final Iterator<Name> staticIter = staticColumns.values().iterator();
-            private final Iterator<Name> regularIter = regularColumns.values().iterator();
-
-            protected Name computeNext()
-            {
-                if (keyIter.hasNext())
-                    return keyIter.next();
-
-                if (clusteringIter.hasNext())
-                    return clusteringIter.next();
-
-                if (compactValue != null && !valueDone)
-                {
-                    valueDone = true;
-                    return compactValue;
-                }
-
-                if (staticIter.hasNext())
-                    return staticIter.next();
-
-                if (regularIter.hasNext())
-                    return regularIter.next();
-
-                return endOfData();
-            }
-        };
-    }
-
-    public ColumnNameBuilder getKeyNameBuilder()
-    {
-        return hasCompositeKey
-             ? new CompositeType.Builder((CompositeType)cfm.getKeyValidator())
-             : new NonCompositeBuilder(cfm.getKeyValidator());
-    }
-
-    public ColumnNameBuilder getColumnNameBuilder()
-    {
-        return isComposite
-             ? new CompositeType.Builder((CompositeType)cfm.comparator)
-             : new NonCompositeBuilder(cfm.comparator);
-    }
-
-    public static class Name extends ColumnSpecification
-    {
-        public static enum Kind
-        {
-            KEY_ALIAS, COLUMN_ALIAS, VALUE_ALIAS, COLUMN_METADATA, STATIC
-        }
-
-        private Name(String ksName, String cfName, ColumnIdentifier name, Kind kind, AbstractType<?> type)
-        {
-            this(ksName, cfName, name, kind, -1, type);
-        }
-
-        private Name(String ksName, String cfName, ColumnIdentifier name, Kind kind, int position, AbstractType<?> type)
-        {
-            super(ksName, cfName, name, type);
-            this.kind = kind;
-            this.position = position;
-        }
-
-        public final Kind kind;
-        public final int position; // only make sense for KEY_ALIAS and COLUMN_ALIAS
-
-        @Override
-        public boolean equals(Object o)
-        {
-            if(!(o instanceof Name))
-                return false;
-            Name that = (Name)o;
-            return Objects.equal(ksName, that.ksName)
-                && Objects.equal(cfName, that.cfName)
-                && Objects.equal(name, that.name)
-                && Objects.equal(type, that.type)
-                && kind == that.kind
-                && position == that.position;
-        }
-
-        @Override
-        public final int hashCode()
-        {
-            return Objects.hashCode(ksName, cfName, name, type, kind, position);
-        }
-
-        public boolean isPrimaryKeyColumn()
-        {
-            return kind == Kind.KEY_ALIAS || kind == Kind.COLUMN_ALIAS;
-        }
-    }
-
-    @Override
-    public String toString()
-    {
-        StringBuilder sb = new StringBuilder();
-        sb.append(Joiner.on(", ").join(partitionKeys.values()));
-        if (!clusteringColumns.isEmpty())
-            sb.append(", ").append(Joiner.on(", ").join(clusteringColumns.values()));
-        sb.append(" => ");
-        if (compactValue != null)
-            sb.append(compactValue.name);
-        sb.append("{");
-        sb.append(Joiner.on(", ").join(staticColumns.values()));
-        if (!staticColumns.isEmpty())
-            sb.append(", ");
-        sb.append(Joiner.on(", ").join(regularColumns.values()));
-        sb.append("}");
-        return sb.toString();
-    }
-
-    private static class NonCompositeBuilder implements ColumnNameBuilder
-    {
-        private final AbstractType<?> type;
-        private ByteBuffer columnName;
-
-        private NonCompositeBuilder(AbstractType<?> type)
-        {
-            this.type = type;
-        }
-
-        public NonCompositeBuilder add(ByteBuffer bb)
-        {
-            if (columnName != null)
-                throw new IllegalStateException("Column name is already constructed");
-
-            columnName = bb;
-            return this;
-        }
-
-        public int componentCount()
-        {
-            return columnName == null ? 0 : 1;
-        }
-
-        public int remainingCount()
-        {
-            return columnName == null ? 1 : 0;
-        }
-
-        public ByteBuffer get(int i)
-        {
-            if (i < 0 || i >= (columnName == null ? 0 : 1))
-                throw new IllegalArgumentException();
-
-            return columnName;
-        }
-
-        public ByteBuffer build()
-        {
-            return columnName == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : columnName;
-        }
-
-        public ByteBuffer buildAsEndOfRange()
-        {
-            return build();
-        }
-
-        public ByteBuffer buildForRelation(Relation.Type op)
-        {
-            return build();
-        }
-
-        public NonCompositeBuilder copy()
-        {
-            NonCompositeBuilder newBuilder = new NonCompositeBuilder(type);
-            newBuilder.columnName = columnName;
-            return newBuilder;
-        }
-
-        public ByteBuffer getComponent(int i)
-        {
-            if (i != 0 || columnName == null)
-                throw new IllegalArgumentException();
-
-            return columnName;
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/utils/Allocator.java b/src/java/org/apache/cassandra/cql3/CQL3Row.java
similarity index 62%
copy from src/java/org/apache/cassandra/utils/Allocator.java
copy to src/java/org/apache/cassandra/cql3/CQL3Row.java
index 7134353..6fa2b64 100644
--- a/src/java/org/apache/cassandra/utils/Allocator.java
+++ b/src/java/org/apache/cassandra/cql3/CQL3Row.java
@@ -15,27 +15,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.cql3;
 
 import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.List;
 
-public abstract class Allocator
+import org.apache.cassandra.db.Cell;
+
+public interface CQL3Row
 {
-    /**
-     * Allocate a slice of the given length.
-     */
-    public ByteBuffer clone(ByteBuffer buffer)
-    {
-        assert buffer != null;
-        ByteBuffer cloned = allocate(buffer.remaining());
+    public ByteBuffer getClusteringColumn(int i);
+    public Cell getColumn(ColumnIdentifier name);
+    public List<Cell> getCollection(ColumnIdentifier name);
 
-        cloned.mark();
-        cloned.put(buffer.duplicate());
-        cloned.reset();
-        return cloned;
+    public interface Builder
+    {
+        public RowIterator group(Iterator<Cell> cells);
     }
 
-    public abstract ByteBuffer allocate(int size);
-
-    public abstract long getMinimumSize();
+    public interface RowIterator extends Iterator<CQL3Row>
+    {
+        public CQL3Row getStaticRow();
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/CQL3Type.java b/src/java/org/apache/cassandra/cql3/CQL3Type.java
index 9a6336e..6d55285 100644
--- a/src/java/org/apache/cassandra/cql3/CQL3Type.java
+++ b/src/java/org/apache/cassandra/cql3/CQL3Type.java
@@ -17,6 +17,11 @@
  */
 package org.apache.cassandra.cql3;
 
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -25,7 +30,6 @@
 public interface CQL3Type
 {
     public boolean isCollection();
-    public boolean isCounter();
     public AbstractType<?> getType();
 
     public enum Native implements CQL3Type
@@ -64,11 +68,6 @@
             return type;
         }
 
-        public boolean isCounter()
-        {
-            return this == COUNTER;
-        }
-
         @Override
         public String toString()
         {
@@ -100,11 +99,6 @@
             return type;
         }
 
-        public boolean isCounter()
-        {
-            return false;
-        }
-
         @Override
         public final boolean equals(Object o)
         {
@@ -130,56 +124,21 @@
 
     public static class Collection implements CQL3Type
     {
-        CollectionType type;
+        private final CollectionType type;
 
         public Collection(CollectionType type)
         {
             this.type = type;
         }
 
-        public static Collection map(CQL3Type t1, CQL3Type t2) throws InvalidRequestException
-        {
-            if (t1.isCollection() || t2.isCollection())
-                throw new InvalidRequestException("map type cannot contain another collection");
-            if (t1.isCounter() || t2.isCounter())
-                throw new InvalidRequestException("counters are not allowed inside a collection");
-
-            return new Collection(MapType.getInstance(t1.getType(), t2.getType()));
-        }
-
-        public static Collection list(CQL3Type t) throws InvalidRequestException
-        {
-            if (t.isCollection())
-                throw new InvalidRequestException("list type cannot contain another collection");
-            if (t.isCounter())
-                throw new InvalidRequestException("counters are not allowed inside a collection");
-
-            return new Collection(ListType.getInstance(t.getType()));
-        }
-
-        public static Collection set(CQL3Type t) throws InvalidRequestException
-        {
-            if (t.isCollection())
-                throw new InvalidRequestException("set type cannot contain another collection");
-            if (t.isCounter())
-                throw new InvalidRequestException("counters are not allowed inside a collection");
-
-            return new Collection(SetType.getInstance(t.getType()));
-        }
-
-        public boolean isCollection()
-        {
-            return true;
-        }
-
         public AbstractType<?> getType()
         {
             return type;
         }
 
-        public boolean isCounter()
+        public boolean isCollection()
         {
-            return false;
+            return true;
         }
 
         @Override
@@ -214,4 +173,370 @@
             throw new AssertionError();
         }
     }
+
+    public static class UserDefined implements CQL3Type
+    {
+        // Keeping this separately from type just to simplify toString()
+        private final String name;
+        private final UserType type;
+
+        private UserDefined(String name, UserType type)
+        {
+            this.name = name;
+            this.type = type;
+        }
+
+        public static UserDefined create(UserType type)
+        {
+            return new UserDefined(UTF8Type.instance.compose(type.name), type);
+        }
+
+        public boolean isCollection()
+        {
+            return false;
+        }
+
+        public AbstractType<?> getType()
+        {
+            return type;
+        }
+
+        @Override
+        public final boolean equals(Object o)
+        {
+            if(!(o instanceof UserDefined))
+                return false;
+
+            UserDefined that = (UserDefined)o;
+            return type.equals(that.type);
+        }
+
+        @Override
+        public final int hashCode()
+        {
+            return type.hashCode();
+        }
+
+        @Override
+        public String toString()
+        {
+            return name;
+        }
+    }
+
+    public static class Tuple implements CQL3Type
+    {
+        private final TupleType type;
+
+        private Tuple(TupleType type)
+        {
+            this.type = type;
+        }
+
+        public static Tuple create(TupleType type)
+        {
+            return new Tuple(type);
+        }
+
+        public boolean isCollection()
+        {
+            return false;
+        }
+
+        public AbstractType<?> getType()
+        {
+            return type;
+        }
+
+        @Override
+        public final boolean equals(Object o)
+        {
+            if(!(o instanceof Tuple))
+                return false;
+
+            Tuple that = (Tuple)o;
+            return type.equals(that.type);
+        }
+
+        @Override
+        public final int hashCode()
+        {
+            return type.hashCode();
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append("tuple<");
+            for (int i = 0; i < type.size(); i++)
+            {
+                if (i > 0)
+                    sb.append(", ");
+                sb.append(type.type(i).asCQL3Type());
+            }
+            sb.append(">");
+            return sb.toString();
+        }
+    }
+
+    // For UserTypes, we need to know the current keyspace to resolve the
+    // actual type used, so Raw is a "not yet prepared" CQL3Type.
+    public abstract class Raw
+    {
+        protected boolean frozen;
+
+        public boolean isCollection()
+        {
+            return false;
+        }
+
+        public boolean isCounter()
+        {
+            return false;
+        }
+
+        public Raw freeze()
+        {
+            frozen = true;
+            return this;
+        }
+
+        public abstract CQL3Type prepare(String keyspace) throws InvalidRequestException;
+
+        public static Raw from(CQL3Type type)
+        {
+            return new RawType(type);
+        }
+
+        public static Raw userType(UTName name)
+        {
+            return new RawUT(name);
+        }
+
+        public static Raw map(CQL3Type.Raw t1, CQL3Type.Raw t2) throws InvalidRequestException
+        {
+            if (t1.isCollection() || t2.isCollection())
+                throw new InvalidRequestException("map type cannot contain another collection");
+            if (t1.isCounter() || t2.isCounter())
+                throw new InvalidRequestException("counters are not allowed inside a collection");
+
+            return new RawCollection(CollectionType.Kind.MAP, t1, t2);
+        }
+
+        public static Raw list(CQL3Type.Raw t) throws InvalidRequestException
+        {
+            if (t.isCollection())
+                throw new InvalidRequestException("list type cannot contain another collection");
+            if (t.isCounter())
+                throw new InvalidRequestException("counters are not allowed inside a collection");
+
+            return new RawCollection(CollectionType.Kind.LIST, null, t);
+        }
+
+        public static Raw set(CQL3Type.Raw t) throws InvalidRequestException
+        {
+            if (t.isCollection())
+                throw new InvalidRequestException("set type cannot contain another collection");
+            if (t.isCounter())
+                throw new InvalidRequestException("counters are not allowed inside a collection");
+
+            return new RawCollection(CollectionType.Kind.SET, null, t);
+        }
+
+        public static Raw tuple(List<CQL3Type.Raw> ts) throws InvalidRequestException
+        {
+            for (int i = 0; i < ts.size(); i++)
+                if (ts.get(i) != null && ts.get(i).isCounter())
+                    throw new InvalidRequestException("counters are not allowed inside tuples");
+
+            return new RawTuple(ts);
+        }
+
+        public static Raw frozen(CQL3Type.Raw t) throws InvalidRequestException
+        {
+            if (t instanceof RawUT)
+                return ((RawUT)t).freeze();
+            if (t instanceof RawTuple)
+                return ((RawTuple)t).freeze();
+
+            throw new InvalidRequestException("frozen<> is only currently only allowed on User-Defined and tuple types");
+        }
+
+        private static class RawType extends Raw
+        {
+            private CQL3Type type;
+
+            private RawType(CQL3Type type)
+            {
+                this.type = type;
+            }
+
+            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            {
+                return type;
+            }
+
+            public boolean isCounter()
+            {
+                return type == Native.COUNTER;
+            }
+
+            @Override
+            public String toString()
+            {
+                return type.toString();
+            }
+        }
+
+        private static class RawCollection extends Raw
+        {
+            private final CollectionType.Kind kind;
+            private final CQL3Type.Raw keys;
+            private final CQL3Type.Raw values;
+
+            private RawCollection(CollectionType.Kind kind, CQL3Type.Raw keys, CQL3Type.Raw values)
+            {
+                this.kind = kind;
+                this.keys = keys;
+                this.values = values;
+            }
+
+            public Raw freeze()
+            {
+                if (keys != null)
+                    keys.freeze();
+                values.freeze();
+                return super.freeze();
+            }
+
+            public boolean isCollection()
+            {
+                return true;
+            }
+
+            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            {
+                switch (kind)
+                {
+                    case LIST: return new Collection(ListType.getInstance(values.prepare(keyspace).getType()));
+                    case SET:  return new Collection(SetType.getInstance(values.prepare(keyspace).getType()));
+                    case MAP:  return new Collection(MapType.getInstance(keys.prepare(keyspace).getType(), values.prepare(keyspace).getType()));
+                }
+                throw new AssertionError();
+            }
+
+            @Override
+            public String toString()
+            {
+                switch (kind)
+                {
+                    case LIST: return "list<" + values + ">";
+                    case SET:  return "set<" + values + ">";
+                    case MAP:  return "map<" + keys + ", " + values + ">";
+                }
+                throw new AssertionError();
+            }
+        }
+
+        private static class RawUT extends Raw
+        {
+            private final UTName name;
+
+            private RawUT(UTName name)
+            {
+                this.name = name;
+            }
+
+            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            {
+                if (name.hasKeyspace())
+                {
+                    // The provided keyspace is that of the statement this type is part of. If it's different from the keyspace of
+                    // the UTName, we reject since we want to limit user types to their own keyspace (see #6643)
+                    if (!keyspace.equals(name.getKeyspace()))
+                        throw new InvalidRequestException(String.format("Statement on keyspace %s cannot refer to a user type in keyspace %s; "
+                                                                        + "user types can only be used in the keyspace they are defined in",
+                                                                        keyspace, name.getKeyspace()));
+                }
+                else
+                {
+                    name.setKeyspace(keyspace);
+                }
+
+                KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+                if (ksm == null)
+                    throw new InvalidRequestException("Unknown keyspace " + name.getKeyspace());
+                UserType type = ksm.userTypes.getType(name.getUserTypeName());
+                if (type == null)
+                    throw new InvalidRequestException("Unknown type " + name);
+
+                if (!frozen)
+                    throw new InvalidRequestException("Non-frozen User-Defined types are not supported, please use frozen<>");
+
+                return new UserDefined(name.toString(), type);
+            }
+
+            public boolean isUDT()
+            {
+                return true;
+            }
+
+            @Override
+            public String toString()
+            {
+                return name.toString();
+            }
+        }
+
+        private static class RawTuple extends Raw
+        {
+            private final List<CQL3Type.Raw> types;
+
+            private RawTuple(List<CQL3Type.Raw> types)
+            {
+                this.types = types;
+            }
+
+            public Raw freeze()
+            {
+                for (CQL3Type.Raw t : types)
+                    if (t != null)
+                        t.freeze();
+                return super.freeze();
+            }
+
+            public boolean isCollection()
+            {
+                return false;
+            }
+
+            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            {
+                List<AbstractType<?>> ts = new ArrayList<>(types.size());
+                for (CQL3Type.Raw t : types)
+                    ts.add(t.prepare(keyspace).getType());
+
+                if (!frozen)
+                    throw new InvalidRequestException("Non-frozen tuples are not supported, please use frozen<>");
+
+                return new Tuple(new TupleType(ts));
+            }
+
+            @Override
+            public String toString()
+            {
+                StringBuilder sb = new StringBuilder();
+                sb.append("tuple<");
+                for (int i = 0; i < types.size(); i++)
+                {
+                    if (i > 0)
+                        sb.append(", ");
+                    sb.append(types.get(i));
+                }
+                sb.append(">");
+                return sb.toString();
+            }
+        }
+    }
 }
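A sketch (not part of this patch) of the two-phase Raw -> prepare(keyspace) flow added above, using only the factory methods defined in this file; the keyspace name is a placeholder, and Native.INT / Native.TEXT are assumed to be members of the existing Native enum.

    import java.util.Arrays;

    import org.apache.cassandra.cql3.CQL3Type;
    import org.apache.cassandra.exceptions.InvalidRequestException;

    public class CQL3TypeRawSketch
    {
        static void resolve() throws InvalidRequestException
        {
            // Collections of native types resolve without any schema lookup.
            CQL3Type listOfText =
                CQL3Type.Raw.list(CQL3Type.Raw.from(CQL3Type.Native.TEXT)).prepare("some_ks");

            // Tuples (like user types) must be frozen, otherwise prepare() rejects them.
            CQL3Type frozenTuple =
                CQL3Type.Raw.frozen(CQL3Type.Raw.tuple(Arrays.asList(
                    CQL3Type.Raw.from(CQL3Type.Native.INT),
                    CQL3Type.Raw.from(CQL3Type.Native.TEXT)))).prepare("some_ks");
        }
    }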
diff --git a/src/java/org/apache/cassandra/cql3/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/ColumnCondition.java
index adc8e3a..703ac5b 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnCondition.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnCondition.java
@@ -23,41 +23,77 @@
 import com.google.common.base.Objects;
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterators;
+import static com.google.common.collect.Lists.newArrayList;
 
-import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * A CQL3 condition.
  */
 public class ColumnCondition
 {
-    public final CFDefinition.Name column;
+    private static final Logger logger = LoggerFactory.getLogger(ColumnCondition.class);
+
+    public final ColumnDefinition column;
 
     // For collection, when testing the equality of a specific element, null otherwise.
     private final Term collectionElement;
 
-    private final Term value;
+    private final Term value;  // a single value or a marker for a list of IN values
+    private final List<Term> inValues;
 
-    private ColumnCondition(CFDefinition.Name column, Term collectionElement, Term value)
+    public final Relation.Type operator;
+
+    private ColumnCondition(ColumnDefinition column, Term collectionElement, Term value, List<Term> inValues, Relation.Type op)
     {
         this.column = column;
         this.collectionElement = collectionElement;
         this.value = value;
+        this.inValues = inValues;
+        this.operator = op;
+
+        if (!operator.equals(Relation.Type.IN))
+            assert this.inValues == null;
     }
 
-    public static ColumnCondition equal(CFDefinition.Name column, Term value)
+    public static ColumnCondition condition(ColumnDefinition column, Term value, Relation.Type op)
     {
-        return new ColumnCondition(column, null, value);
+        return new ColumnCondition(column, null, value, null, op);
     }
 
-    public static ColumnCondition equal(CFDefinition.Name column, Term collectionElement, Term value)
+    public static ColumnCondition condition(ColumnDefinition column, Term collectionElement, Term value, Relation.Type op)
     {
-        return new ColumnCondition(column, collectionElement, value);
+        return new ColumnCondition(column, collectionElement, value, null, op);
+    }
+
+    public static ColumnCondition inCondition(ColumnDefinition column, List<Term> inValues)
+    {
+        return new ColumnCondition(column, null, null, inValues, Relation.Type.IN);
+    }
+
+    public static ColumnCondition inCondition(ColumnDefinition column, Term collectionElement, List<Term> inValues)
+    {
+        return new ColumnCondition(column, collectionElement, null, inValues, Relation.Type.IN);
+    }
+
+    public static ColumnCondition inCondition(ColumnDefinition column, Term inMarker)
+    {
+        return new ColumnCondition(column, null, inMarker, null, Relation.Type.IN);
+    }
+
+    public static ColumnCondition inCondition(ColumnDefinition column, Term collectionElement, Term inMarker)
+    {
+        return new ColumnCondition(column, collectionElement, inMarker, null, Relation.Type.IN);
     }
 
     /**
@@ -70,55 +106,107 @@
     {
         if (collectionElement != null)
             collectionElement.collectMarkerSpecification(boundNames);
-        value.collectMarkerSpecification(boundNames);
+
+        if (operator.equals(Relation.Type.IN) && inValues != null)
+        {
+            for (Term value : inValues)
+                value.collectMarkerSpecification(boundNames);
+        }
+        else
+        {
+            value.collectMarkerSpecification(boundNames);
+        }
     }
 
-    public ColumnCondition.Bound bind(List<ByteBuffer> variables) throws InvalidRequestException
+    public ColumnCondition.Bound bind(QueryOptions options) throws InvalidRequestException
     {
-        return column.type instanceof CollectionType
-             ? (collectionElement == null ? new CollectionBound(this, variables) : new ElementAccessBound(this, variables))
-             : new SimpleBound(this, variables);
+        boolean isInCondition = operator.equals(Relation.Type.IN);
+        if (column.type instanceof CollectionType)
+        {
+            if (collectionElement == null)
+                return isInCondition ? new CollectionInBound(this, options) : new CollectionBound(this, options);
+            else
+                return isInCondition ? new ElementAccessInBound(this, options) : new ElementAccessBound(this, options);
+        }
+        return isInCondition ? new SimpleInBound(this, options) : new SimpleBound(this, options);
     }
 
     public static abstract class Bound
     {
-        public final CFDefinition.Name column;
+        public final ColumnDefinition column;
+        public final Relation.Type operator;
 
-        protected Bound(CFDefinition.Name column)
+        protected Bound(ColumnDefinition column, Relation.Type operator)
         {
             this.column = column;
+            this.operator = operator;
         }
 
         /**
          * Validates whether this condition applies to {@code current}.
          */
-        public abstract boolean appliesTo(ColumnNameBuilder rowPrefix, ColumnFamily current, long now) throws InvalidRequestException;
+        public abstract boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException;
 
         public ByteBuffer getCollectionElementValue()
         {
             return null;
         }
 
-        protected ColumnNameBuilder copyOrUpdatePrefix(CFMetaData cfm, ColumnNameBuilder rowPrefix)
+        protected boolean isSatisfiedByValue(ByteBuffer value, Cell c, AbstractType<?> type, Relation.Type operator, long now) throws InvalidRequestException
         {
-            return column.kind == CFDefinition.Name.Kind.STATIC ? cfm.getStaticColumnNameBuilder() : rowPrefix.copy();
+            ByteBuffer columnValue = (c == null || !c.isLive(now)) ? null : c.value();
+            return compareWithOperator(operator, type, value, columnValue);
         }
 
-        protected boolean equalsValue(ByteBuffer value, Column c, AbstractType<?> type, long now)
+        /** Returns true if the operator is satisfied (i.e. "value operator otherValue == true"), false otherwise. */
+        protected boolean compareWithOperator(Relation.Type operator, AbstractType<?> type, ByteBuffer value, ByteBuffer otherValue) throws InvalidRequestException
         {
-            return value == null
-                 ? c == null || !c.isLive(now)
-                 : c != null && c.isLive(now) && type.compare(c.value(), value) == 0;
+            if (value == null)
+            {
+                switch (operator)
+                {
+                    case EQ:
+                        return otherValue == null;
+                    case NEQ:
+                        return otherValue != null;
+                    default:
+                        throw new InvalidRequestException(String.format("Invalid comparison with null for operator \"%s\"", operator));
+                }
+            }
+            else if (otherValue == null)
+            {
+                // the condition value is not null, so only NEQ can return true
+                return operator.equals(Relation.Type.NEQ);
+            }
+            int comparison = type.compare(otherValue, value);
+            switch (operator)
+            {
+                case EQ:
+                    return comparison == 0;
+                case LT:
+                    return comparison < 0;
+                case LTE:
+                    return comparison <= 0;
+                case GT:
+                    return comparison > 0;
+                case GTE:
+                    return comparison >= 0;
+                case NEQ:
+                    return comparison != 0;
+                default:
+                    // we shouldn't get IN, CONTAINS, or CONTAINS KEY here
+                    throw new AssertionError();
+            }
         }
 
-        protected Iterator<Column> collectionColumns(ColumnNameBuilder collectionPrefix, ColumnFamily cf, final long now)
+        protected Iterator<Cell> collectionColumns(CellName collection, ColumnFamily cf, final long now)
         {
             // We are testing for collection equality, so we need to have the expected values *and* only those.
-            ColumnSlice[] collectionSlice = new ColumnSlice[]{ new ColumnSlice(collectionPrefix.build(), collectionPrefix.buildAsEndOfRange()) };
+            ColumnSlice[] collectionSlice = new ColumnSlice[]{ collection.slice() };
             // Filter live columns, this makes things simpler afterwards
-            return Iterators.filter(cf.iterator(collectionSlice), new Predicate<Column>()
+            return Iterators.filter(cf.iterator(collectionSlice), new Predicate<Cell>()
             {
-                public boolean apply(Column c)
+                public boolean apply(Cell c)
                 {
                     // we only care about live columns
                     return c.isLive(now);
@@ -127,87 +215,123 @@
         }
     }
 
-    private static class SimpleBound extends Bound
+    /**
+     * A condition on a single non-collection column. This does not support IN operators (see SimpleInBound).
+     */
+    static class SimpleBound extends Bound
     {
         public final ByteBuffer value;
 
-        private SimpleBound(ColumnCondition condition, List<ByteBuffer> variables) throws InvalidRequestException
+        private SimpleBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
         {
-            super(condition.column);
+            super(condition.column, condition.operator);
             assert !(column.type instanceof CollectionType) && condition.collectionElement == null;
-            this.value = condition.value.bindAndGet(variables);
+            assert !condition.operator.equals(Relation.Type.IN);
+            this.value = condition.value.bindAndGet(options);
         }
 
-        public boolean appliesTo(ColumnNameBuilder rowPrefix, ColumnFamily current, long now) throws InvalidRequestException
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException
         {
-            ColumnNameBuilder prefix = copyOrUpdatePrefix(current.metadata(), rowPrefix);
-            ByteBuffer columnName = column.kind == CFDefinition.Name.Kind.VALUE_ALIAS
-                                  ? prefix.build()
-                                  : prefix.add(column.name.key).build();
-
-            return equalsValue(value, current.getColumn(columnName), column.type, now);
-        }
-
-        @Override
-        public boolean equals(Object o)
-        {
-            if (!(o instanceof SimpleBound))
-                return false;
-
-            SimpleBound that = (SimpleBound)o;
-            if (!column.equals(that.column))
-                return false;
-
-            return value == null || that.value == null
-                 ? value == null && that.value == null
-                 : column.type.compare(value, that.value) == 0;
+            CellName name = current.metadata().comparator.create(rowPrefix, column);
+            return isSatisfiedByValue(value, current.getColumn(name), column.type, operator, now);
         }
 
         @Override
         public int hashCode()
         {
-            return Objects.hashCode(column, value);
+            return Objects.hashCode(column, value, operator);
         }
     }
 
-    private static class ElementAccessBound extends Bound
+    /**
+     * An IN condition on a single non-collection column.
+     */
+    static class SimpleInBound extends Bound
+    {
+        public final List<ByteBuffer> inValues;
+
+        private SimpleInBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
+        {
+            super(condition.column, condition.operator);
+            assert !(column.type instanceof CollectionType) && condition.collectionElement == null;
+            assert condition.operator.equals(Relation.Type.IN);
+            if (condition.inValues == null)
+                this.inValues = ((Lists.Marker) condition.value).bind(options).getElements();
+            else
+            {
+                this.inValues = new ArrayList<>(condition.inValues.size());
+                for (Term value : condition.inValues)
+                    this.inValues.add(value.bindAndGet(options));
+            }
+        }
+
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException
+        {
+            CellName name = current.metadata().comparator.create(rowPrefix, column);
+            for (ByteBuffer value : inValues)
+            {
+                if (isSatisfiedByValue(value, current.getColumn(name), column.type, Relation.Type.EQ, now))
+                    return true;
+            }
+            return false;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(column, inValues, operator);
+        }
+    }
+
+    /** A condition on an element of a collection column. IN operators are not supported here, see ElementAccessInBound. */
+    static class ElementAccessBound extends Bound
     {
         public final ByteBuffer collectionElement;
         public final ByteBuffer value;
 
-        private ElementAccessBound(ColumnCondition condition, List<ByteBuffer> variables) throws InvalidRequestException
+        private ElementAccessBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
         {
-            super(condition.column);
+            super(condition.column, condition.operator);
             assert column.type instanceof CollectionType && condition.collectionElement != null;
-            this.collectionElement = condition.collectionElement.bindAndGet(variables);
-            this.value = condition.value.bindAndGet(variables);
+            assert !condition.operator.equals(Relation.Type.IN);
+            this.collectionElement = condition.collectionElement.bindAndGet(options);
+            this.value = condition.value.bindAndGet(options);
         }
 
-        public boolean appliesTo(ColumnNameBuilder rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
         {
             if (collectionElement == null)
                 throw new InvalidRequestException("Invalid null value for " + (column.type instanceof MapType ? "map" : "list") + " element access");
 
-            ColumnNameBuilder collectionPrefix = copyOrUpdatePrefix(current.metadata(), rowPrefix).add(column.name.key);
             if (column.type instanceof MapType)
-                return equalsValue(value, current.getColumn(collectionPrefix.add(collectionElement).build()), ((MapType)column.type).values, now);
+            {
+                Cell cell = current.getColumn(current.metadata().comparator.create(rowPrefix, column, collectionElement));
+                return isSatisfiedByValue(value, cell, ((MapType) column.type).values, operator, now);
+            }
 
+            // sets don't have element access, so it's a list
             assert column.type instanceof ListType;
+            ByteBuffer columnValue = getListItem(
+                    collectionColumns(current.metadata().comparator.create(rowPrefix, column), current, now),
+                    getListIndex(collectionElement));
+            return compareWithOperator(operator, ((ListType)column.type).elements, value, columnValue);
+        }
+
+        static int getListIndex(ByteBuffer collectionElement) throws InvalidRequestException
+        {
             int idx = ByteBufferUtil.toInt(collectionElement);
             if (idx < 0)
                 throw new InvalidRequestException(String.format("Invalid negative list index %d", idx));
+            return idx;
+        }
 
-            Iterator<Column> iter = collectionColumns(collectionPrefix, current, now);
-            int adv = Iterators.advance(iter, idx);
-            if (adv != idx || !iter.hasNext())
-                throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, adv));
-
-            // We don't support null values inside collections, so a condition like 'IF l[3] = null' can only
-            // be false. We do special case though, as the compare below might mind getting a null.
-            if (value == null)
-                return false;
-
-            return ((ListType)column.type).elements.compare(iter.next().value(), value) == 0;
+        static ByteBuffer getListItem(Iterator<Cell> iter, int index) throws InvalidRequestException
+        {
+            int adv = Iterators.advance(iter, index);
+            if (adv == index && iter.hasNext())
+                return iter.next().value();
+            else
+                return null;
         }
 
         public ByteBuffer getCollectionElementValue()
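A standalone illustration (not in this patch, names hypothetical) of the comparison rules that compareWithOperator above applies to IF conditions: null is only meaningful for = and !=, a missing cell only satisfies !=, and the remaining operators compare the stored value against the condition value.

    import java.nio.ByteBuffer;
    import java.util.Comparator;

    public class ConditionComparisonSketch
    {
        enum Op { EQ, NEQ, LT, LTE, GT, GTE }

        // expected = value from the IF clause; actual = the live cell value, or null if absent
        static boolean satisfied(Comparator<ByteBuffer> cmp, Op op, ByteBuffer expected, ByteBuffer actual)
        {
            if (expected == null)
            {
                if (op == Op.EQ)  return actual == null;
                if (op == Op.NEQ) return actual != null;
                throw new IllegalArgumentException("null can only be compared with = or !=");
            }
            if (actual == null)
                return op == Op.NEQ; // a non-null value was expected but no live cell exists

            int c = cmp.compare(actual, expected);
            switch (op)
            {
                case EQ:  return c == 0;
                case NEQ: return c != 0;
                case LT:  return c < 0;
                case LTE: return c <= 0;
                case GT:  return c > 0;
                case GTE: return c >= 0;
                default:  throw new AssertionError(op);
            }
        }
    }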
@@ -216,189 +340,436 @@
         }
 
         @Override
-        public boolean equals(Object o)
+        public int hashCode()
         {
-            if (!(o instanceof ElementAccessBound))
-                return false;
+            return Objects.hashCode(column, collectionElement, value, operator);
+        }
+    }
 
-            ElementAccessBound that = (ElementAccessBound)o;
-            if (!column.equals(that.column))
-                return false;
+    static class ElementAccessInBound extends Bound
+    {
+        public final ByteBuffer collectionElement;
+        public final List<ByteBuffer> inValues;
 
-            if ((collectionElement == null) != (that.collectionElement == null))
-                return false;
+        private ElementAccessInBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
+        {
+            super(condition.column, condition.operator);
+            assert column.type instanceof CollectionType && condition.collectionElement != null;
+            this.collectionElement = condition.collectionElement.bindAndGet(options);
 
-            if (collectionElement != null)
+            if (condition.inValues == null)
+                this.inValues = ((Lists.Marker) condition.value).bind(options).getElements();
+            else
             {
-                assert column.type instanceof ListType || column.type instanceof MapType;
-                AbstractType<?> comparator = column.type instanceof ListType
-                                           ? Int32Type.instance
-                                           : ((MapType)column.type).keys;
+                this.inValues = new ArrayList<>(condition.inValues.size());
+                for (Term value : condition.inValues)
+                    this.inValues.add(value.bindAndGet(options));
+            }
+        }
 
-                if (comparator.compare(collectionElement, that.collectionElement) != 0)
-                    return false;
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        {
+            if (collectionElement == null)
+                throw new InvalidRequestException("Invalid null value for " + (column.type instanceof MapType ? "map" : "list") + " element access");
+
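+            // An IN condition holds as soon as any one of the IN values matches the element's value.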
+            CellNameType nameType = current.metadata().comparator;
+            if (column.type instanceof MapType)
+            {
+                CellName name = nameType.create(rowPrefix, column, collectionElement);
+                Cell item = current.getColumn(name);
+                AbstractType<?> valueType = ((MapType) column.type).values;
+                for (ByteBuffer value : inValues)
+                {
+                    if (isSatisfiedByValue(value, item, valueType, Relation.Type.EQ, now))
+                        return true;
+                }
+                return false;
             }
 
-            return column.type.compare(value, that.value) == 0;
+            assert column.type instanceof ListType;
+            ByteBuffer columnValue = ElementAccessBound.getListItem(
+                    collectionColumns(nameType.create(rowPrefix, column), current, now),
+                    ElementAccessBound.getListIndex(collectionElement));
+
+            AbstractType<?> valueType = ((ListType) column.type).elements;
+            for (ByteBuffer value : inValues)
+            {
+                if (compareWithOperator(Relation.Type.EQ, valueType, value, columnValue))
+                    return true;
+            }
+            return false;
         }
 
         @Override
         public int hashCode()
         {
-            return Objects.hashCode(column, collectionElement, value);
+            return Objects.hashCode(column, collectionElement, inValues, operator);
         }
     }
 
-    private static class CollectionBound extends Bound
+    /** A condition on an entire collection column. IN operators are not supported here, see CollectionInBound. */
+    static class CollectionBound extends Bound
     {
         public final Term.Terminal value;
 
-        private CollectionBound(ColumnCondition condition, List<ByteBuffer> variables) throws InvalidRequestException
+        private CollectionBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
         {
-            super(condition.column);
+            super(condition.column, condition.operator);
             assert column.type instanceof CollectionType && condition.collectionElement == null;
-            this.value = condition.value.bind(variables);
+            assert !condition.operator.equals(Relation.Type.IN);
+            this.value = condition.value.bind(options);
         }
 
-        public boolean appliesTo(ColumnNameBuilder rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
         {
             CollectionType type = (CollectionType)column.type;
-            CFMetaData cfm = current.metadata();
 
-            ColumnNameBuilder collectionPrefix = copyOrUpdatePrefix(cfm, rowPrefix).add(column.name.key);
+            Iterator<Cell> iter = collectionColumns(current.metadata().comparator.create(rowPrefix, column), current, now);
+            if (value == null)
+            {
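+                // A null condition value only supports = and !=; the column compares equal to
+                // null when the collection has no cells.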
+                if (operator.equals(Relation.Type.EQ))
+                    return !iter.hasNext();
+                else if (operator.equals(Relation.Type.NEQ))
+                    return iter.hasNext();
+                else
+                    throw new InvalidRequestException(String.format("Invalid comparison with null for operator \"%s\"", operator));
+            }
 
-            Iterator<Column> iter = collectionColumns(collectionPrefix, current, now);
+            return valueAppliesTo(type, iter, value, operator);
+        }
+
+        static boolean valueAppliesTo(CollectionType type, Iterator<Cell> iter, Term.Terminal value, Relation.Type operator)
+        {
             if (value == null)
                 return !iter.hasNext();
 
             switch (type.kind)
             {
-                case LIST: return listAppliesTo((ListType)type, cfm, iter, ((Lists.Value)value).elements);
-                case SET: return setAppliesTo((SetType)type, cfm, iter, ((Sets.Value)value).elements);
-                case MAP: return mapAppliesTo((MapType)type, cfm, iter, ((Maps.Value)value).map);
+                case LIST: return listAppliesTo((ListType)type, iter, ((Lists.Value)value).elements, operator);
+                case SET: return setAppliesTo((SetType)type, iter, ((Sets.Value)value).elements, operator);
+                case MAP: return mapAppliesTo((MapType)type, iter, ((Maps.Value)value).map, operator);
             }
             throw new AssertionError();
         }
 
-        private ByteBuffer collectionKey(CFMetaData cfm, Column c)
+        private static boolean setOrListAppliesTo(AbstractType<?> type, Iterator<Cell> iter, Iterator<ByteBuffer> conditionIter, Relation.Type operator, boolean isSet)
         {
-            ByteBuffer[] bbs = ((CompositeType)cfm.comparator).split(c.name());
-            return bbs[bbs.length - 1];
-        }
-
-        private boolean listAppliesTo(ListType type, CFMetaData cfm, Iterator<Column> iter, List<ByteBuffer> elements)
-        {
-            for (ByteBuffer e : elements)
-                if (!iter.hasNext() || type.elements.compare(iter.next().value(), e) != 0)
-                    return false;
-            // We must not have more elements than expected
-            return !iter.hasNext();
-        }
-
-        private boolean setAppliesTo(SetType type, CFMetaData cfm, Iterator<Column> iter, Set<ByteBuffer> elements)
-        {
-            Set<ByteBuffer> remaining = new TreeSet<>(type.elements);
-            remaining.addAll(elements);
-            while (iter.hasNext())
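+            // Compare elements pairwise: the first differing element decides the relation, and if
+            // one side is a prefix of the other, the shorter collection compares as smaller.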
+            while(iter.hasNext())
             {
-                if (remaining.isEmpty())
-                    return false;
+                if (!conditionIter.hasNext())
+                    return operator.equals(Relation.Type.GT) || operator.equals(Relation.Type.GTE) || operator.equals(Relation.Type.NEQ);
 
-                if (!remaining.remove(collectionKey(cfm, iter.next())))
-                    return false;
+                // for lists we use the cell value; for sets we use the cell name
+                ByteBuffer cellValue = isSet ? iter.next().name().collectionElement() : iter.next().value();
+                int comparison = type.compare(cellValue, conditionIter.next());
+                if (comparison != 0)
+                    return evaluateComparisonWithOperator(comparison, operator);
             }
-            return remaining.isEmpty();
+
+            if (conditionIter.hasNext())
+                return operator.equals(Relation.Type.LT) || operator.equals(Relation.Type.LTE) || operator.equals(Relation.Type.NEQ);
+
+            // they're equal
+            return operator == Relation.Type.EQ || operator == Relation.Type.LTE || operator == Relation.Type.GTE;
         }
 
-        private boolean mapAppliesTo(MapType type, CFMetaData cfm, Iterator<Column> iter, Map<ByteBuffer, ByteBuffer> elements)
+        private static boolean evaluateComparisonWithOperator(int comparison, Relation.Type operator)
         {
-            Map<ByteBuffer, ByteBuffer> remaining = new TreeMap<>(type.keys);
-            remaining.putAll(elements);
-            while (iter.hasNext())
+            // called when comparison != 0
+            switch (operator)
             {
-                if (remaining.isEmpty())
+                case EQ:
                     return false;
-
-                Column c = iter.next();
-                ByteBuffer previous = remaining.remove(collectionKey(cfm, c));
-                if (previous == null || type.values.compare(previous, c.value()) != 0)
-                    return false;
+                case LT:
+                case LTE:
+                    return comparison < 0;
+                case GT:
+                case GTE:
+                    return comparison > 0;
+                case NEQ:
+                    return true;
+                default:
+                    throw new AssertionError();
             }
-            return remaining.isEmpty();
         }
 
-        @Override
-        public boolean equals(Object o)
+        static boolean listAppliesTo(ListType type, Iterator<Cell> iter, List<ByteBuffer> elements, Relation.Type operator)
         {
-            if (!(o instanceof CollectionBound))
-                return false;
+            return setOrListAppliesTo(type.elements, iter, elements.iterator(), operator, false);
+        }
 
-            CollectionBound that = (CollectionBound)o;
-            if (!column.equals(that.column))
-                return false;
+        static boolean setAppliesTo(SetType type, Iterator<Cell> iter, Set<ByteBuffer> elements, Relation.Type operator)
+        {
+            ArrayList<ByteBuffer> sortedElements = new ArrayList<>(elements.size());
+            sortedElements.addAll(elements);
+            Collections.sort(sortedElements, type.elements);
+            return setOrListAppliesTo(type.elements, iter, sortedElements.iterator(), operator, true);
+        }
 
-            // Slightly inefficient because it serialize the collection just for the sake of comparison.
-            // We could improve by adding an equals() method to Lists.Value, Sets.Value and Maps.Value but
-            // this method is only called when there is 2 conditions on the same collection to make sure
-            // both are not incompatible, so overall it's probably not worth the effort.
-            ByteBuffer thisVal = value.get();
-            ByteBuffer thatVal = that.value.get();
-            return thisVal == null || thatVal == null
-                 ? thisVal == null && thatVal == null
-                 : column.type.compare(thisVal, thatVal) == 0;
+        static boolean mapAppliesTo(MapType type, Iterator<Cell> iter, Map<ByteBuffer, ByteBuffer> elements, Relation.Type operator)
+        {
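+            // Same pairwise scheme as setOrListAppliesTo, comparing keys first and then values for
+            // each entry; the first difference or a length mismatch decides the relation.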
+            Iterator<Map.Entry<ByteBuffer, ByteBuffer>> conditionIter = elements.entrySet().iterator();
+            while(iter.hasNext())
+            {
+                if (!conditionIter.hasNext())
+                    return operator.equals(Relation.Type.GT) || operator.equals(Relation.Type.GTE) || operator.equals(Relation.Type.NEQ);
+
+                Map.Entry<ByteBuffer, ByteBuffer> conditionEntry = conditionIter.next();
+                Cell c = iter.next();
+
+                // compare the keys
+                int comparison = type.keys.compare(c.name().collectionElement(), conditionEntry.getKey());
+                if (comparison != 0)
+                    return evaluateComparisonWithOperator(comparison, operator);
+
+                // compare the values
+                comparison = type.values.compare(c.value(), conditionEntry.getValue());
+                if (comparison != 0)
+                    return evaluateComparisonWithOperator(comparison, operator);
+            }
+
+            if (conditionIter.hasNext())
+                return operator.equals(Relation.Type.LT) || operator.equals(Relation.Type.LTE) || operator.equals(Relation.Type.NEQ);
+
+            // they're equal
+            return operator == Relation.Type.EQ || operator == Relation.Type.LTE || operator == Relation.Type.GTE;
         }
 
         @Override
         public int hashCode()
         {
-            return Objects.hashCode(column, value.get());
+            Object val = null;
+            if (value != null)
+            {
+                switch (((CollectionType)column.type).kind)
+                {
+                    case LIST:
+                        val = ((Lists.Value)value).elements.hashCode();
+                        break;
+                    case SET:
+                        val = ((Sets.Value)value).elements.hashCode();
+                        break;
+                    case MAP:
+                        val = ((Maps.Value)value).map.hashCode();
+                        break;
+                }
+            }
+            return Objects.hashCode(column, val);
+        }
+    }
+
+    public static class CollectionInBound extends Bound
+    {
+        public final List<Term.Terminal> inValues;
+
+        private CollectionInBound(ColumnCondition condition, QueryOptions options) throws InvalidRequestException
+        {
+            super(condition.column, condition.operator);
+            assert column.type instanceof CollectionType && condition.collectionElement == null;
+            assert condition.operator.equals(Relation.Type.IN);
+            inValues = new ArrayList<>();
+            if (condition.inValues == null)
+            {
+                // We have a list of serialized collections that need to be deserialized for later comparisons
+                CollectionType collectionType = (CollectionType) column.type;
+                Lists.Marker inValuesMarker = (Lists.Marker) condition.value;
+                if (column.type instanceof ListType)
+                {
+                    ListType deserializer = ListType.getInstance(collectionType.valueComparator());
+                    for (ByteBuffer buffer : inValuesMarker.bind(options).elements)
+                    {
+                        if (buffer == null)
+                            this.inValues.add(null);
+                        else
+                            this.inValues.add(Lists.Value.fromSerialized(buffer, deserializer, options.getProtocolVersion()));
+                    }
+                }
+                else if (column.type instanceof MapType)
+                {
+                    MapType deserializer = MapType.getInstance(collectionType.nameComparator(), collectionType.valueComparator());
+                    for (ByteBuffer buffer : inValuesMarker.bind(options).elements)
+                    {
+                        if (buffer == null)
+                            this.inValues.add(null);
+                        else
+                            this.inValues.add(Maps.Value.fromSerialized(buffer, deserializer, options.getProtocolVersion()));
+                    }
+                }
+                else if (column.type instanceof SetType)
+                {
+                    SetType deserializer = SetType.getInstance(collectionType.valueComparator());
+                    for (ByteBuffer buffer : inValuesMarker.bind(options).elements)
+                    {
+                        if (buffer == null)
+                            this.inValues.add(null);
+                        else
+                            this.inValues.add(Sets.Value.fromSerialized(buffer, deserializer, options.getProtocolVersion()));
+                    }
+                }
+            }
+            else
+            {
+                for (Term value : condition.inValues)
+                    this.inValues.add(value.bind(options));
+            }
+        }
+
+        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        {
+            CollectionType type = (CollectionType)column.type;
+            CellName name = current.metadata().comparator.create(rowPrefix, column);
+
+            // copy iterator contents so that we can properly reuse them for each comparison with an IN value
+            List<Cell> cells = newArrayList(collectionColumns(name, current, now));
+            for (Term.Terminal value : inValues)
+            {
+                if (CollectionBound.valueAppliesTo(type, cells.iterator(), value, Relation.Type.EQ))
+                    return true;
+            }
+            return false;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            List<Collection<ByteBuffer>> inValueBuffers = new ArrayList<>(inValues.size());
+            switch (((CollectionType)column.type).kind)
+            {
+                case LIST:
+                    for (Term.Terminal term : inValues)
+                        inValueBuffers.add(term == null ? null : ((Lists.Value)term).elements);
+                    break;
+                case SET:
+                    for (Term.Terminal term : inValues)
+                        inValueBuffers.add(term == null ? null : ((Sets.Value)term).elements);
+                    break;
+                case MAP:
+                    for (Term.Terminal term : inValues)
+                    {
+                        if (term != null)
+                        {
+                            inValueBuffers.add(((Maps.Value)term).map.keySet());
+                            inValueBuffers.add(((Maps.Value)term).map.values());
+                        }
+                        else
+                            inValueBuffers.add(null);
+                    }
+                    break;
+            }
+            return Objects.hashCode(column, inValueBuffers, operator);
         }
     }
 
     public static class Raw
     {
         private final Term.Raw value;
+        private final List<Term.Raw> inValues;
+        private final AbstractMarker.INRaw inMarker;
 
         // Can be null, only used with the syntax "IF m[e] = ..." (in which case it's 'e')
         private final Term.Raw collectionElement;
 
-        private Raw(Term.Raw value, Term.Raw collectionElement)
+        private final Relation.Type operator;
+
+        private Raw(Term.Raw value, List<Term.Raw> inValues, AbstractMarker.INRaw inMarker, Term.Raw collectionElement, Relation.Type op)
         {
             this.value = value;
+            this.inValues = inValues;
+            this.inMarker = inMarker;
             this.collectionElement = collectionElement;
+            this.operator = op;
         }
 
-        public static Raw simpleEqual(Term.Raw value)
+        /** A condition on a column. For example: "IF col = 'foo'" */
+        public static Raw simpleCondition(Term.Raw value, Relation.Type op)
         {
-            return new Raw(value, null);
+            return new Raw(value, null, null, null, op);
         }
 
-        public static Raw collectionEqual(Term.Raw value, Term.Raw collectionElement)
+        /** An IN condition on a column. For example: "IF col IN ('foo', 'bar', ...)" */
+        public static Raw simpleInCondition(List<Term.Raw> inValues)
         {
-            return new Raw(value, collectionElement);
+            return new Raw(null, inValues, null, null, Relation.Type.IN);
         }
 
-        public ColumnCondition prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        /** An IN condition on a column with a single marker. For example: "IF col IN ?" */
+        public static Raw simpleInCondition(AbstractMarker.INRaw inMarker)
+        {
+            return new Raw(null, null, inMarker, null, Relation.Type.IN);
+        }
+
+        /** A condition on a collection element. For example: "IF col['key'] = 'foo'" */
+        public static Raw collectionCondition(Term.Raw value, Term.Raw collectionElement, Relation.Type op)
+        {
+            return new Raw(value, null, null, collectionElement, op);
+        }
+
+        /** An IN condition on a collection element. For example: "IF col['key'] IN ('foo', 'bar', ...)" */
+        public static Raw collectionInCondition(Term.Raw collectionElement, List<Term.Raw> inValues)
+        {
+            return new Raw(null, inValues, null, collectionElement, Relation.Type.IN);
+        }
+
+        /** An IN condition on a collection element with a single marker. For example: "IF col['key'] IN ?" */
+        public static Raw collectionInCondition(Term.Raw collectionElement, AbstractMarker.INRaw inMarker)
+        {
+            return new Raw(null, null, inMarker, collectionElement, Relation.Type.IN);
+        }
+
+        public ColumnCondition prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
             if (receiver.type instanceof CounterColumnType)
-                throw new InvalidRequestException("Condtions on counters are not supported");
+                throw new InvalidRequestException("Conditions on counters are not supported");
 
             if (collectionElement == null)
-                return ColumnCondition.equal(receiver, value.prepare(receiver));
+            {
+                if (operator.equals(Relation.Type.IN))
+                {
+                    if (inValues == null)
+                        return ColumnCondition.inCondition(receiver, inMarker.prepare(keyspace, receiver));
+                    List<Term> terms = new ArrayList<>(inValues.size());
+                    for (Term.Raw value : inValues)
+                        terms.add(value.prepare(keyspace, receiver));
+                    return ColumnCondition.inCondition(receiver, terms);
+                }
+                else
+                {
+                    return ColumnCondition.condition(receiver, value.prepare(keyspace, receiver), operator);
+                }
+            }
 
             if (!(receiver.type.isCollection()))
                 throw new InvalidRequestException(String.format("Invalid element access syntax for non-collection column %s", receiver.name));
 
-            switch (((CollectionType)receiver.type).kind)
+            ColumnSpecification elementSpec, valueSpec;
+            switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    return ColumnCondition.equal(receiver, collectionElement.prepare(Lists.indexSpecOf(receiver)), value.prepare(Lists.valueSpecOf(receiver)));
+                    elementSpec = Lists.indexSpecOf(receiver);
+                    valueSpec = Lists.valueSpecOf(receiver);
+                    break;
+                case MAP:
+                    elementSpec = Maps.keySpecOf(receiver);
+                    valueSpec = Maps.valueSpecOf(receiver);
+                    break;
                 case SET:
                     throw new InvalidRequestException(String.format("Invalid element access syntax for set column %s", receiver.name));
-                case MAP:
-                    return ColumnCondition.equal(receiver, collectionElement.prepare(Maps.keySpecOf(receiver)), value.prepare(Maps.valueSpecOf(receiver)));
+                default:
+                    throw new AssertionError();
             }
-            throw new AssertionError();
+            if (operator.equals(Relation.Type.IN))
+            {
+                if (inValues == null)
+                    return ColumnCondition.inCondition(receiver, collectionElement.prepare(keyspace, elementSpec), inMarker.prepare(keyspace, valueSpec));
+                List<Term> terms = new ArrayList<>(inValues.size());
+                for (Term.Raw value : inValues)
+                    terms.add(value.prepare(keyspace, valueSpec));
+                return ColumnCondition.inCondition(receiver, collectionElement.prepare(keyspace, elementSpec), terms);
+            }
+            else
+            {
+                return ColumnCondition.condition(receiver, collectionElement.prepare(keyspace, elementSpec), value.prepare(keyspace, valueSpec), operator);
+            }
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java b/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java
index a8a25cf..fa151d2 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java
@@ -20,43 +20,60 @@
 import java.util.Locale;
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.cql3.statements.Selectable;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
 
 /**
  * Represents an identifier for a CQL column definition.
+ * TODO : should support light-weight mode without text representation for when not interned
  */
-public class ColumnIdentifier implements Selectable
+public class ColumnIdentifier implements Selectable, IMeasurableMemory
 {
-    public final ByteBuffer key;
+    public final ByteBuffer bytes;
     private final String text;
 
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new ColumnIdentifier("", true));
+
     public ColumnIdentifier(String rawText, boolean keepCase)
     {
         this.text = keepCase ? rawText : rawText.toLowerCase(Locale.US);
-        this.key = ByteBufferUtil.bytes(this.text);
+        this.bytes = ByteBufferUtil.bytes(this.text);
     }
 
-    public ColumnIdentifier(ByteBuffer key, AbstractType<?> type)
+    public ColumnIdentifier(ByteBuffer bytes, AbstractType<?> type)
     {
-        this.key = key;
-        this.text = type.getString(key);
+        this.bytes = bytes;
+        this.text = type.getString(bytes);
+    }
+
+    public ColumnIdentifier(ByteBuffer bytes, String text)
+    {
+        this.bytes = bytes;
+        this.text = text;
     }
 
     @Override
     public final int hashCode()
     {
-        return key.hashCode();
+        return bytes.hashCode();
     }
 
     @Override
     public final boolean equals(Object o)
     {
+        // Note: it's worth checking for reference equality since we intern those
+        // in SparseCellNameType
+        if (this == o)
+            return true;
+
         if(!(o instanceof ColumnIdentifier))
             return false;
         ColumnIdentifier that = (ColumnIdentifier)o;
-        return key.equals(that.key);
+        return bytes.equals(that.bytes);
     }
 
     @Override
@@ -64,4 +81,24 @@
     {
         return text;
     }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE
+             + ObjectSizes.sizeOnHeapOf(bytes)
+             + ObjectSizes.sizeOf(text);
+    }
+
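+    // Same as unsharedHeapSize(), but does not count the bytes referenced by the name's ByteBuffer.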
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE
+             + ObjectSizes.sizeOnHeapExcludingData(bytes)
+             + ObjectSizes.sizeOf(text);
+    }
+
+    public ColumnIdentifier clone(AbstractAllocator allocator)
+    {
+        return new ColumnIdentifier(allocator.clone(bytes), text);
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/cql3/ColumnNameBuilder.java b/src/java/org/apache/cassandra/cql3/ColumnNameBuilder.java
deleted file mode 100644
index 3d5eff6..0000000
--- a/src/java/org/apache/cassandra/cql3/ColumnNameBuilder.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import java.nio.ByteBuffer;
-
-/**
- * Build a potentially composite column name.
- */
-public interface ColumnNameBuilder
-{
-    /**
-     * Add a new ByteBuffer as the next component for this name.
-     * @param bb the ByteBuffer to add
-     * @throws IllegalStateException if the builder if full, i.e. if enough component has been added.
-     * @return this builder
-     */
-    public ColumnNameBuilder add(ByteBuffer bb);
-
-    /**
-     * Returns the number of component already added to this builder.
-     * @return the number of component in this Builder
-     */
-    public int componentCount();
-
-    /**
-     * @return the maximum number of component that can still be added to this Builder
-     */
-    public int remainingCount();
-
-    /**
-     * @return the ith component in this builder.
-     */
-    public ByteBuffer get(int idx);
-
-    /**
-     * Build the column name.
-     * @return the built column name
-     */
-    public ByteBuffer build();
-
-    /**
-     * Build the column name so that the result sorts at the end of the range
-     * represented by this (uncomplete) column name.
-     * @throws IllegalStateException if the builder is empty or full.
-     */
-    public ByteBuffer buildAsEndOfRange();
-
-    public ByteBuffer buildForRelation(Relation.Type op);
-
-    /**
-     * Clone this builder.
-     * @return the cloned builder.
-     */
-    public ColumnNameBuilder copy();
-
-    /**
-     * Returns the ith component added to this builder.
-     *
-     * @param i the component to return
-     * @return the ith component added to this builder.
-     * @throws IllegalArgumentException if i >= componentCount().
-     */
-    public ByteBuffer getComponent(int i);
-
-}
diff --git a/src/java/org/apache/cassandra/cql3/ColumnSpecification.java b/src/java/org/apache/cassandra/cql3/ColumnSpecification.java
index 4dae701..d2e08f9 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnSpecification.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnSpecification.java
@@ -33,11 +33,4 @@
         this.name = name;
         this.type = type;
     }
-
-    @Override
-    public String toString()
-    {
-        // Not fully conventional, but convenient (for error message to users in particular)
-        return name.toString();
-    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/Constants.java b/src/java/org/apache/cassandra/cql3/Constants.java
index 5189517..a8f0120 100644
--- a/src/java/org/apache/cassandra/cql3/Constants.java
+++ b/src/java/org/apache/cassandra/cql3/Constants.java
@@ -20,18 +20,20 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import org.apache.cassandra.serializers.MarshalException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.db.marshal.CounterColumnType;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.marshal.ReversedType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /**
@@ -51,22 +53,28 @@
         private final Term.Terminal NULL_VALUE = new Value(null)
         {
             @Override
-            public Terminal bind(List<ByteBuffer> values)
+            public Terminal bind(QueryOptions options)
             {
                 // We return null because that makes life easier for collections
                 return null;
             }
+
+            @Override
+            public String toString()
+            {
+                return "null";
+            }
         };
 
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            if (!isAssignableTo(receiver))
+            if (!isAssignableTo(keyspace, receiver))
                 throw new InvalidRequestException("Invalid null value for counter increment/decrement");
 
             return NULL_VALUE;
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             return !(receiver.type instanceof CounterColumnType);
         }
@@ -74,7 +82,7 @@
         @Override
         public String toString()
         {
-            return null;
+            return "null";
         }
     };
 
@@ -120,10 +128,10 @@
             return new Literal(Type.HEX, text);
         }
 
-        public Value prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Value prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            if (!isAssignableTo(receiver))
-                throw new InvalidRequestException(String.format("Invalid %s constant (%s) for %s of type %s", type, text, receiver, receiver.type.asCQL3Type()));
+            if (!isAssignableTo(keyspace, receiver))
+                throw new InvalidRequestException(String.format("Invalid %s constant (%s) for \"%s\" of type %s", type, text, receiver.name, receiver.type.asCQL3Type()));
 
             return new Value(parsedValue(receiver.type));
         }
@@ -152,7 +160,7 @@
             return text;
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             CQL3Type receiverType = receiver.type.asCQL3Type();
             if (receiverType.isCollection())
@@ -244,13 +252,13 @@
             this.bytes = bytes;
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
             return bytes;
         }
 
         @Override
-        public ByteBuffer bindAndGet(List<ByteBuffer> values)
+        public ByteBuffer bindAndGet(QueryOptions options)
         {
             return bytes;
         }
@@ -267,15 +275,15 @@
         protected Marker(int bindIndex, ColumnSpecification receiver)
         {
             super(bindIndex, receiver);
-            assert !(receiver.type instanceof CollectionType);
+            assert !receiver.type.isCollection();
         }
 
         @Override
-        public ByteBuffer bindAndGet(List<ByteBuffer> values) throws InvalidRequestException
+        public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
         {
             try
             {
-                ByteBuffer value = values.get(bindIndex);
+                ByteBuffer value = options.getValues().get(bindIndex);
                 if (value != null)
                     receiver.type.validate(value);
                 return value;
@@ -286,58 +294,56 @@
             }
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer bytes = bindAndGet(values);
+            ByteBuffer bytes = bindAndGet(options);
             return bytes == null ? null : new Constants.Value(bytes);
         }
     }
 
     public static class Setter extends Operation
     {
-        public Setter(ColumnIdentifier column, Term t)
+        public Setter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            prefix = maybeUpdatePrefix(cf.metadata(), prefix);
-            ByteBuffer cname = columnName == null ? prefix.build() : prefix.add(columnName.key).build();
-            ByteBuffer value = t.bindAndGet(params.variables);
+            CellName cname = cf.getComparator().create(prefix, column);
+            ByteBuffer value = t.bindAndGet(params.options);
             cf.addColumn(value == null ? params.makeTombstone(cname) : params.makeColumn(cname, value));
         }
     }
 
     public static class Adder extends Operation
     {
-        public Adder(ColumnIdentifier column, Term t)
+        public Adder(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            ByteBuffer bytes = t.bindAndGet(params.variables);
+            ByteBuffer bytes = t.bindAndGet(params.options);
             if (bytes == null)
                 throw new InvalidRequestException("Invalid null value for counter increment");
             long increment = ByteBufferUtil.toLong(bytes);
-            prefix = maybeUpdatePrefix(cf.metadata(), prefix);
-            ByteBuffer cname = columnName == null ? prefix.build() : prefix.add(columnName.key).build();
+            CellName cname = cf.getComparator().create(prefix, column);
             cf.addColumn(params.makeCounter(cname, increment));
         }
     }
 
     public static class Substracter extends Operation
     {
-        public Substracter(ColumnIdentifier column, Term t)
+        public Substracter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            ByteBuffer bytes = t.bindAndGet(params.variables);
+            ByteBuffer bytes = t.bindAndGet(params.options);
             if (bytes == null)
                 throw new InvalidRequestException("Invalid null value for counter increment");
 
@@ -345,8 +351,7 @@
             if (increment == Long.MIN_VALUE)
                 throw new InvalidRequestException("The negation of " + increment + " overflows supported counter precision (signed 8 bytes integer)");
 
-            prefix = maybeUpdatePrefix(cf.metadata(), prefix);
-            ByteBuffer cname = columnName == null ? prefix.build() : prefix.add(columnName.key).build();
+            CellName cname = cf.getComparator().create(prefix, column);
             cf.addColumn(params.makeCounter(cname, -increment));
         }
     }
@@ -355,22 +360,18 @@
     // duplicating this further
     public static class Deleter extends Operation
     {
-        private final boolean isCollection;
-
-        public Deleter(ColumnIdentifier column, boolean isCollection)
+        public Deleter(ColumnDefinition column)
         {
             super(column, null);
-            this.isCollection = isCollection;
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
-
-            if (isCollection)
-                cf.addAtom(params.makeRangeTombstone(column.build(), column.buildAsEndOfRange()));
+            CellName cname = cf.getComparator().create(prefix, column);
+            if (column.type.isCollection())
+                cf.addAtom(params.makeRangeTombstone(cname.slice()));
             else
-                cf.addColumn(params.makeTombstone(column.build()));
+                cf.addColumn(params.makeTombstone(cname));
         }
     };
 }
diff --git a/src/java/org/apache/cassandra/cql3/Cql.g b/src/java/org/apache/cassandra/cql3/Cql.g
index 06dbc3a..2c90c18 100644
--- a/src/java/org/apache/cassandra/cql3/Cql.g
+++ b/src/java/org/apache/cassandra/cql3/Cql.g
@@ -30,6 +30,7 @@
     import java.util.Arrays;
     import java.util.Collections;
     import java.util.EnumSet;
+    import java.util.HashSet;
     import java.util.HashMap;
     import java.util.LinkedHashMap;
     import java.util.List;
@@ -50,9 +51,21 @@
 }
 
 @members {
-    private final List<String> recognitionErrors = new ArrayList<String>();
+    private final List<ErrorListener> listeners = new ArrayList<ErrorListener>();
     private final List<ColumnIdentifier> bindVariables = new ArrayList<ColumnIdentifier>();
 
+    public static final Set<String> reservedTypeNames = new HashSet<String>()
+    {{
+        add("byte");
+        add("smallint");
+        add("complex");
+        add("enum");
+        add("date");
+        add("interval");
+        add("macaddr");
+        add("bitstring");
+    }};
+
     public AbstractMarker.Raw newBindVariables(ColumnIdentifier name)
     {
         AbstractMarker.Raw marker = new AbstractMarker.Raw(bindVariables.size());
@@ -81,27 +94,26 @@
         return marker;
     }
 
+    public void addErrorListener(ErrorListener listener)
+    {
+        this.listeners.add(listener);
+    }
+
+    public void removeErrorListener(ErrorListener listener)
+    {
+        this.listeners.remove(listener);
+    }
+
     public void displayRecognitionError(String[] tokenNames, RecognitionException e)
     {
-        String hdr = getErrorHeader(e);
-        String msg = getErrorMessage(e, tokenNames);
-        recognitionErrors.add(hdr + " " + msg);
+        for (int i = 0, m = listeners.size(); i < m; i++)
+            listeners.get(i).syntaxError(this, tokenNames, e);
     }
 
-    public void addRecognitionError(String msg)
+    private void addRecognitionError(String msg)
     {
-        recognitionErrors.add(msg);
-    }
-
-    public List<String> getRecognitionErrors()
-    {
-        return recognitionErrors;
-    }
-
-    public void throwLastRecognitionError() throws SyntaxException
-    {
-        if (recognitionErrors.size() > 0)
-            throw new SyntaxException(recognitionErrors.get((recognitionErrors.size()-1)));
+        for (int i = 0, m = listeners.size(); i < m; i++)
+            listeners.get(i).syntaxError(this, msg);
     }
 
     public Map<String, String> convertPropertyMap(Maps.Literal map)
@@ -172,28 +184,26 @@
     {
         super.nextToken();
         if (tokens.size() == 0)
-            return Token.EOF_TOKEN;
+            return new CommonToken(Token.EOF);
         return tokens.remove(0);
     }
 
-    private List<String> recognitionErrors = new ArrayList<String>();
+    private final List<ErrorListener> listeners = new ArrayList<ErrorListener>();
+
+    public void addErrorListener(ErrorListener listener)
+    {
+        this.listeners.add(listener);
+    }
+
+    public void removeErrorListener(ErrorListener listener)
+    {
+        this.listeners.remove(listener);
+    }
 
     public void displayRecognitionError(String[] tokenNames, RecognitionException e)
     {
-        String hdr = getErrorHeader(e);
-        String msg = getErrorMessage(e, tokenNames);
-        recognitionErrors.add(hdr + " " + msg);
-    }
-
-    public List<String> getRecognitionErrors()
-    {
-        return recognitionErrors;
-    }
-
-    public void throwLastRecognitionError() throws SyntaxException
-    {
-        if (recognitionErrors.size() > 0)
-            throw new SyntaxException(recognitionErrors.get((recognitionErrors.size()-1)));
+        for (int i = 0, m = listeners.size(); i < m; i++)
+            listeners.get(i).syntaxError(this, tokenNames, e);
     }
 }
 
@@ -229,6 +239,9 @@
     | st22=listUsersStatement          { $stmt = st22; }
     | st23=createTriggerStatement      { $stmt = st23; }
     | st24=dropTriggerStatement        { $stmt = st24; }
+    | st25=createTypeStatement         { $stmt = st25; }
+    | st26=alterTypeStatement          { $stmt = st26; }
+    | st27=dropTypeStatement           { $stmt = st27; }
     ;
 
 /*
@@ -281,17 +294,19 @@
     ;
 
 unaliasedSelector returns [Selectable s]
-    : c=cident                                  { $s = c; }
-    | K_WRITETIME '(' c=cident ')'              { $s = new Selectable.WritetimeOrTTL(c, true); }
-    | K_TTL       '(' c=cident ')'              { $s = new Selectable.WritetimeOrTTL(c, false); }
-    | f=functionName args=selectionFunctionArgs { $s = new Selectable.WithFunction(f, args); }
+    @init { Selectable tmp = null; }
+    :  ( c=cident                                  { tmp = c; }
+       | K_WRITETIME '(' c=cident ')'              { tmp = new Selectable.WritetimeOrTTL(c, true); }
+       | K_TTL       '(' c=cident ')'              { tmp = new Selectable.WritetimeOrTTL(c, false); }
+       | f=functionName args=selectionFunctionArgs { tmp = new Selectable.WithFunction(f, args); }
+       ) ( '.' fi=cident { tmp = new Selectable.WithFieldSelection(tmp, fi); } )* { $s = tmp; }
     ;
 
 selectionFunctionArgs returns [List<Selectable> a]
     : '(' ')' { $a = Collections.emptyList(); }
     | '(' s1=unaliasedSelector { List<Selectable> args = new ArrayList<Selectable>(); args.add(s1); }
           ( ',' sn=unaliasedSelector { args.add(sn); } )*
-       ')' { $a = args; }
+      ')' { $a = args; }
     ;
 
 selectCountClause returns [List<RawSelector> expr]
@@ -523,6 +538,26 @@
     : k=cident (K_ASC | K_DESC { reversed=true;} ) { $expr.setOrdering(k, reversed); }
     ;
 
+
+/**
+ * CREATE TYPE foo (
+ *    <name1> <type1>,
+ *    <name2> <type2>,
+ *    ....
+ * )
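+ *
+ * e.g. CREATE TYPE address (street text, city text, zip int)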
+ */
+createTypeStatement returns [CreateTypeStatement expr]
+    @init { boolean ifNotExists = false; }
+    : K_CREATE K_TYPE (K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
+         tn=userTypeName { $expr = new CreateTypeStatement(tn, ifNotExists); }
+         '(' typeColumns[expr] ( ',' typeColumns[expr]? )* ')'
+    ;
+
+typeColumns[CreateTypeStatement expr]
+    : k=cident v=comparatorType { $expr.addDefinition(k, v); }
+    ;
+
+
 /**
  * CREATE INDEX [IF NOT EXISTS] [indexName] ON <columnFamily> (<columnName>);
  * CREATE CUSTOM INDEX [IF NOT EXISTS] [indexName] ON <columnFamily> (<columnName>) USING <indexClass>;
@@ -533,26 +568,37 @@
         boolean ifNotExists = false;
     }
     : K_CREATE (K_CUSTOM { props.isCustom = true; })? K_INDEX (K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
-        (idxName=IDENT)? K_ON cf=columnFamilyName '(' id=cident ')'
+        (idxName=IDENT)? K_ON cf=columnFamilyName '(' id=indexIdent ')'
         (K_USING cls=STRING_LITERAL { props.customClass = $cls.text; })?
         (K_WITH properties[props])?
       { $expr = new CreateIndexStatement(cf, $idxName.text, id, props, ifNotExists); }
     ;
 
+indexIdent returns [IndexTarget id]
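+    // either a plain column, or KEYS(column) to index the keys of a map column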
+    : c=cident                { $id = IndexTarget.of(c); }
+    | K_KEYS '(' c=cident ')' { $id = IndexTarget.keysOf(c); }
+    ;
+
+
 /**
  * CREATE TRIGGER triggerName ON columnFamily USING 'triggerClass';
  */
 createTriggerStatement returns [CreateTriggerStatement expr]
-    : K_CREATE K_TRIGGER (name=IDENT) K_ON cf=columnFamilyName K_USING cls=STRING_LITERAL
-      { $expr = new CreateTriggerStatement(cf, $name.text, $cls.text); }
+    @init {
+        boolean ifNotExists = false;
+    }
+    : K_CREATE K_TRIGGER (K_IF K_NOT K_EXISTS { ifNotExists = true; } )? (name=cident)
+        K_ON cf=columnFamilyName K_USING cls=STRING_LITERAL
+      { $expr = new CreateTriggerStatement(cf, name.toString(), $cls.text, ifNotExists); }
     ;
 
 /**
- * DROP TRIGGER triggerName ON columnFamily;
+ * DROP TRIGGER [IF EXISTS] triggerName ON columnFamily;
  */
 dropTriggerStatement returns [DropTriggerStatement expr]
-    : K_DROP K_TRIGGER (name=IDENT) K_ON cf=columnFamilyName
-      { $expr = new DropTriggerStatement(cf, $name.text); }
+     @init { boolean ifExists = false; }
+    : K_DROP K_TRIGGER (K_IF K_EXISTS { ifExists = true; } )? (name=cident) K_ON cf=columnFamilyName
+      { $expr = new DropTriggerStatement(cf, name.toString(), ifExists); }
     ;
 
 /**
@@ -594,6 +640,24 @@
     ;
 
 /**
+ * ALTER TYPE <name> ALTER <field> TYPE <newtype>;
+ * ALTER TYPE <name> ADD <field> <newtype>;
+ * ALTER TYPE <name> RENAME <field> TO <new_name> AND ...;
+ */
+alterTypeStatement returns [AlterTypeStatement expr]
+    : K_ALTER K_TYPE name=userTypeName
+          ( K_ALTER f=cident K_TYPE v=comparatorType { $expr = AlterTypeStatement.alter(name, f, v); }
+          | K_ADD   f=cident v=comparatorType        { $expr = AlterTypeStatement.addition(name, f, v); }
+          | K_RENAME
+               { Map<ColumnIdentifier, ColumnIdentifier> renames = new HashMap<ColumnIdentifier, ColumnIdentifier>(); }
+                 id1=cident K_TO toId1=cident { renames.put(id1, toId1); }
+                 ( K_AND idn=cident K_TO toIdn=cident { renames.put(idn, toIdn); } )*
+               { $expr = AlterTypeStatement.renames(name, renames); }
+          )
+    ;
+
+
+/**
  * DROP KEYSPACE [IF EXISTS] <KSP>;
  */
 dropKeyspaceStatement returns [DropKeyspaceStatement ksp]
@@ -610,12 +674,20 @@
     ;
 
 /**
+ * DROP TYPE <name>;
+ */
+dropTypeStatement returns [DropTypeStatement stmt]
+    @init { boolean ifExists = false; }
+    : K_DROP K_TYPE (K_IF K_EXISTS { ifExists = true; } )? name=userTypeName { $stmt = new DropTypeStatement(name, ifExists); }
+    ;
+
+/**
  * DROP INDEX [IF EXISTS] <INDEX_NAME>
  */
 dropIndexStatement returns [DropIndexStatement expr]
     @init { boolean ifExists = false; }
-    : K_DROP K_INDEX (K_IF K_EXISTS { ifExists = true; } )? index=IDENT
-      { $expr = new DropIndexStatement($index.text, ifExists); }
+    : K_DROP K_INDEX (K_IF K_EXISTS { ifExists = true; } )? index=indexName
+      { $expr = new DropIndexStatement(index, ifExists); }
     ;
 
 /**
@@ -753,11 +825,26 @@
     : cfOrKsName[name, true] { $id = name.getKeyspace(); }
     ;
 
+indexName returns [IndexName name]
+    @init { $name = new IndexName(); }
+    : (idxOrKsName[name, true] '.')? idxOrKsName[name, false]
+    ;
+
+idxOrKsName[IndexName name, boolean isKs]
+    : t=IDENT              { if (isKs) $name.setKeyspace($t.text, false); else $name.setIndex($t.text, false); }
+    | t=QUOTED_NAME        { if (isKs) $name.setKeyspace($t.text, true); else $name.setIndex($t.text, true); }
+    | k=unreserved_keyword { if (isKs) $name.setKeyspace(k, false); else $name.setIndex(k, false); }
+    ;
+
 columnFamilyName returns [CFName name]
     @init { $name = new CFName(); }
     : (cfOrKsName[name, true] '.')? cfOrKsName[name, false]
     ;
 
+userTypeName returns [UTName name]
+    : (ks=cident '.')? ut=non_type_ident { return new UTName(ks, ut); }
+    ;
+
 cfOrKsName[CFName name, boolean isKs]
     : t=IDENT              { if (isKs) $name.setKeyspace($t.text, false); else $name.setColumnFamily($t.text, false); }
     | t=QUOTED_NAME        { if (isKs) $name.setKeyspace($t.text, true); else $name.setColumnFamily($t.text, true); }
@@ -774,13 +861,13 @@
     | { String sign=""; } ('-' {sign = "-"; } )? t=(K_NAN | K_INFINITY) { $constant = Constants.Literal.floatingPoint(sign + $t.text); }
     ;
 
-map_literal returns [Maps.Literal map]
+mapLiteral returns [Maps.Literal map]
     : '{' { List<Pair<Term.Raw, Term.Raw>> m = new ArrayList<Pair<Term.Raw, Term.Raw>>(); }
           ( k1=term ':' v1=term { m.add(Pair.create(k1, v1)); } ( ',' kn=term ':' vn=term { m.add(Pair.create(kn, vn)); } )* )?
       '}' { $map = new Maps.Literal(m); }
     ;
 
-set_or_map[Term.Raw t] returns [Term.Raw value]
+setOrMapLiteral[Term.Raw t] returns [Term.Raw value]
     : ':' v=term { List<Pair<Term.Raw, Term.Raw>> m = new ArrayList<Pair<Term.Raw, Term.Raw>>(); m.add(Pair.create(t, v)); }
           ( ',' kn=term ':' vn=term { m.add(Pair.create(kn, vn)); } )*
       { $value = new Maps.Literal(m); }
@@ -789,19 +876,34 @@
       { $value = new Sets.Literal(s); }
     ;
 
-collection_literal returns [Term.Raw value]
+collectionLiteral returns [Term.Raw value]
     : '[' { List<Term.Raw> l = new ArrayList<Term.Raw>(); }
           ( t1=term { l.add(t1); } ( ',' tn=term { l.add(tn); } )* )?
       ']' { $value = new Lists.Literal(l); }
-    | '{' t=term v=set_or_map[t] { $value = v; } '}'
+    | '{' t=term v=setOrMapLiteral[t] { $value = v; } '}'
     // Note that we have an ambiguity between maps and set for "{}". So we force it to a set literal,
     // and deal with it later based on the type of the column (SetLiteral.java).
     | '{' '}' { $value = new Sets.Literal(Collections.<Term.Raw>emptyList()); }
     ;
 
+usertypeLiteral returns [UserTypes.Literal ut]
+    @init{ Map<ColumnIdentifier, Term.Raw> m = new HashMap<ColumnIdentifier, Term.Raw>(); }
+    @after{ $ut = new UserTypes.Literal(m); }
+    // We don't allow empty literals because that conflicts with sets/maps and is currently useless since we don't allow empty user types
+    : '{' k1=cident ':' v1=term { m.put(k1, v1); } ( ',' kn=cident ':' vn=term { m.put(kn, vn); } )* '}'
+    ;
+
+tupleLiteral returns [Tuples.Literal tt]
+    @init{ List<Term.Raw> l = new ArrayList<Term.Raw>(); }
+    @after{ $tt = new Tuples.Literal(l); }
+    : '(' t1=term { l.add(t1); } ( ',' tn=term { l.add(tn); } )* ')'
+    ;
+
 value returns [Term.Raw value]
     : c=constant           { $value = c; }
-    | l=collection_literal { $value = l; }
+    | l=collectionLiteral  { $value = l; }
+    | u=usertypeLiteral    { $value = u; }
+    | t=tupleLiteral       { $value = t; }
     | K_NULL               { $value = Constants.NULL_LITERAL; }
     | ':' id=cident        { $value = newBindVariables(id); }
     | QMARK                { $value = newBindVariables(null); }
@@ -834,7 +936,16 @@
     ;
 
 columnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations]
-    : key=cident '=' t=term ('+' c=cident )?
+    : key=cident columnOperationDifferentiator[operations, key]
+    ;
+    
+columnOperationDifferentiator[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key]
+    : '=' normalColumnOperation[operations, key]
+    | '[' k=term ']' specializedColumnOperation[operations, key, k]
+    ;
+    
+normalColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key]
+    : t=term ('+' c=cident )?
       {
           if (c == null)
           {
@@ -847,13 +958,13 @@
               addRawUpdate(operations, key, new Operation.Prepend(t));
           }
       }
-    | key=cident '=' c=cident sig=('+' | '-') t=term
+    | c=cident sig=('+' | '-') t=term
       {
           if (!key.equals(c))
               addRecognitionError("Only expressions of the form X = X " + $sig.text + "<value> are supported.");
           addRawUpdate(operations, key, $sig.text.equals("+") ? new Operation.Addition(t) : new Operation.Substraction(t));
       }
-    | key=cident '=' c=cident i=INTEGER
+    | c=cident i=INTEGER
       {
           // Note that this production *is* necessary because X = X - 3 will in fact be lexed as [ X, '=', X, INTEGER].
           if (!key.equals(c))
@@ -861,7 +972,10 @@
               addRecognitionError("Only expressions of the form X = X " + ($i.text.charAt(0) == '-' ? '-' : '+') + " <value> are supported.");
           addRawUpdate(operations, key, new Operation.Addition(Constants.Literal.integer($i.text)));
       }
-    | key=cident '[' k=term ']' '=' t=term
+    ;
+      
+specializedColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key, Term.Raw k]
+    : '=' t=term
       {
           addRawUpdate(operations, key, new Operation.SetElement(k, t));
       }
@@ -869,8 +983,20 @@
 
 columnCondition[List<Pair<ColumnIdentifier, ColumnCondition.Raw>> conditions]
     // Note: we'll reject duplicates later
-    : key=cident '=' t=term { conditions.add(Pair.create(key, ColumnCondition.Raw.simpleEqual(t))); }
-    | key=cident '[' element=term ']' '=' t=term { conditions.add(Pair.create(key, ColumnCondition.Raw.collectionEqual(t, element))); } 
+    : key=cident
+        ( op=relationType t=term { conditions.add(Pair.create(key, ColumnCondition.Raw.simpleCondition(t, op))); }
+        | K_IN
+            ( values=singleColumnInValues { conditions.add(Pair.create(key, ColumnCondition.Raw.simpleInCondition(values))); }
+            | marker=inMarker { conditions.add(Pair.create(key, ColumnCondition.Raw.simpleInCondition(marker))); }
+            )
+        | '[' element=term ']'
+            ( op=relationType t=term { conditions.add(Pair.create(key, ColumnCondition.Raw.collectionCondition(t, element, op))); }
+            | K_IN
+                ( values=singleColumnInValues { conditions.add(Pair.create(key, ColumnCondition.Raw.collectionInCondition(element, values))); }
+                | marker=inMarker { conditions.add(Pair.create(key, ColumnCondition.Raw.collectionInCondition(element, marker))); }
+                )
+            )
+        )
     ;
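
The reworked columnCondition rule extends lightweight-transaction conditions beyond simple equality: any relation operator (including the new !=), IN with a list of values or a bind marker, and conditions on individual collection elements are all parsed now. Illustrative examples, assuming a hypothetical table t with an int column v and a map column m:

    UPDATE t SET v = 10 WHERE k = 0 IF v > 2;
    UPDATE t SET v = 10 WHERE k = 0 IF v IN (1, 2, 3);
    UPDATE t SET v = 10 WHERE k = 0 IF m['state'] != 'done';
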
 
 properties[PropertyDefinitions props]
@@ -879,7 +1005,7 @@
 
 property[PropertyDefinitions props]
     : k=cident '=' (simple=propertyValue { try { $props.addProperty(k.toString(), simple); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } }
-                   |   map=map_literal   { try { $props.addProperty(k.toString(), convertPropertyMap(map)); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } })
+                   |   map=mapLiteral    { try { $props.addProperty(k.toString(), convertPropertyMap(map)); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } })
     ;
 
 propertyValue returns [String str]
@@ -893,6 +1019,7 @@
     | '<=' { $op = Relation.Type.LTE; }
     | '>'  { $op = Relation.Type.GT; }
     | '>=' { $op = Relation.Type.GTE; }
+    | '!=' { $op = Relation.Type.NEQ; }
     ;
 
 relation[List<Relation> clauses]
@@ -906,6 +1033,8 @@
         { $clauses.add(new SingleColumnRelation(name, Relation.Type.IN, marker)); }
     | name=cident K_IN inValues=singleColumnInValues
         { $clauses.add(SingleColumnRelation.createInRelation($name.id, inValues)); }
+    | name=cident K_CONTAINS { Relation.Type rt = Relation.Type.CONTAINS; } (K_KEY { rt = Relation.Type.CONTAINS_KEY; })?
+        t=term { $clauses.add(new SingleColumnRelation(name, rt, t)); }
     | ids=tupleOfIdentifiers
       ( K_IN
           ( '(' ')'
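
The new CONTAINS / CONTAINS KEY alternative of the relation rule lets WHERE clauses filter on collection contents. A hypothetical sketch (column names are illustrative; such queries normally need an index on the collection or ALLOW FILTERING):

    SELECT * FROM posts WHERE tags CONTAINS 'cql' ALLOW FILTERING;
    SELECT * FROM posts WHERE attrs CONTAINS KEY 'lang' ALLOW FILTERING;
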
@@ -944,11 +1073,6 @@
     : '(' ( t1 = term { $terms.add(t1); } (',' ti=term { $terms.add(ti); })* )? ')'
     ;
 
-tupleLiteral returns [Tuples.Literal literal]
-    @init { List<Term.Raw> terms = new ArrayList<>(); }
-    : '(' t1=term { terms.add(t1); } (',' ti=term { terms.add(ti); })* ')' { $literal = new Tuples.Literal(terms); }
-    ;
-
 tupleOfTupleLiterals returns [List<Tuples.Literal> literals]
     @init { $literals = new ArrayList<>(); }
     : '(' t1=tupleLiteral { $literals.add(t1); } (',' ti=tupleLiteral { $literals.add(ti); })* ')'
@@ -969,13 +1093,23 @@
     | ':' name=cident { $marker = newTupleINBindVariables(name); }
     ;
 
-comparatorType returns [CQL3Type t]
-    : c=native_type     { $t = c; }
+comparatorType returns [CQL3Type.Raw t]
+    : n=native_type     { $t = CQL3Type.Raw.from(n); }
     | c=collection_type { $t = c; }
+    | tt=tuple_type     { $t = tt; }
+    | id=userTypeName   { $t = CQL3Type.Raw.userType(id); }
+    | K_FROZEN '<' f=comparatorType '>'
+      {
+        try {
+            $t = CQL3Type.Raw.frozen(f);
+        } catch (InvalidRequestException e) {
+            addRecognitionError(e.getMessage());
+        }
+      }
     | s=STRING_LITERAL
       {
         try {
-            $t = new CQL3Type.Custom($s.text);
+            $t = CQL3Type.Raw.from(new CQL3Type.Custom($s.text));
         } catch (SyntaxException e) {
             addRecognitionError("Cannot parse type " + $s.text + ": " + e.getMessage());
         } catch (ConfigurationException e) {
@@ -1003,17 +1137,23 @@
     | K_TIMEUUID  { $t = CQL3Type.Native.TIMEUUID; }
     ;
 
-collection_type returns [CQL3Type pt]
+collection_type returns [CQL3Type.Raw pt]
     : K_MAP  '<' t1=comparatorType ',' t2=comparatorType '>'
         { try {
             // if we can't parse either t1 or t2, antlr will "recover" and we may have t1 or t2 null.
             if (t1 != null && t2 != null)
-                $pt = CQL3Type.Collection.map(t1, t2);
+                $pt = CQL3Type.Raw.map(t1, t2);
           } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); } }
     | K_LIST '<' t=comparatorType '>'
-        { try { if (t != null) $pt = CQL3Type.Collection.list(t); } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); } }
+        { try { if (t != null) $pt = CQL3Type.Raw.list(t); } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); } }
     | K_SET  '<' t=comparatorType '>'
-        { try { if (t != null) $pt = CQL3Type.Collection.set(t); } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); } }
+        { try { if (t != null) $pt = CQL3Type.Raw.set(t); } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); } }
+    ;
+
+tuple_type returns [CQL3Type.Raw t]
+    : K_TUPLE '<' { List<CQL3Type.Raw> types = new ArrayList<>(); }
+         t1=comparatorType { types.add(t1); } (',' tn=comparatorType { types.add(tn); })*
+      '>' { try { $t = CQL3Type.Raw.tuple(types); } catch (InvalidRequestException e) { addRecognitionError(e.getMessage()); }}
     ;
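
Since comparatorType now yields CQL3Type.Raw and accepts user type names, tuple<...> and frozen<...>, richer column types can be declared. A hypothetical example of the syntax this enables (type and table names are illustrative):

    CREATE TYPE address (street text, city text);
    CREATE TABLE users (
        id int PRIMARY KEY,
        addr frozen<address>,
        pos tuple<double, double>
    );
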
 
 username
@@ -1021,13 +1161,27 @@
     | STRING_LITERAL
     ;
 
+// Basically the same as cident, but we need to exclude existing CQL3 types
+// (which for some reason are not reserved otherwise)
+non_type_ident returns [ColumnIdentifier id]
+    : t=IDENT                    { if (reservedTypeNames.contains($t.text)) addRecognitionError("Invalid (reserved) user type name " + $t.text); $id = new ColumnIdentifier($t.text, false); }
+    | t=QUOTED_NAME              { $id = new ColumnIdentifier($t.text, true); }
+    | k=basic_unreserved_keyword { $id = new ColumnIdentifier(k, false); }
+    | kk=K_KEY                   { $id = new ColumnIdentifier($kk.text, false); }
+    ;
+
 unreserved_keyword returns [String str]
     : u=unreserved_function_keyword     { $str = u; }
-    | k=(K_TTL | K_COUNT | K_WRITETIME) { $str = $k.text; }
+    | k=(K_TTL | K_COUNT | K_WRITETIME | K_KEY) { $str = $k.text; }
     ;
 
 unreserved_function_keyword returns [String str]
-    : k=( K_KEY
+    : u=basic_unreserved_keyword { $str = u; }
+    | t=native_type              { $str = t.toString(); }
+    ;
+
+basic_unreserved_keyword returns [String str]
+    : k=( K_KEYS
         | K_AS
         | K_CLUSTERING
         | K_COMPACT
@@ -1050,9 +1204,9 @@
         | K_CUSTOM
         | K_TRIGGER
         | K_DISTINCT
+        | K_CONTAINS
         | K_STATIC
         ) { $str = $k.text; }
-    | t=native_type { $str = t.toString(); }
     ;
 
 
@@ -1063,6 +1217,7 @@
 K_WHERE:       W H E R E;
 K_AND:         A N D;
 K_KEY:         K E Y;
+K_KEYS:        K E Y S;
 K_INSERT:      I N S E R T;
 K_UPDATE:      U P D A T E;
 K_WITH:        W I T H;
@@ -1108,6 +1263,7 @@
 K_ALLOW:       A L L O W;
 K_FILTERING:   F I L T E R I N G;
 K_IF:          I F;
+K_CONTAINS:    C O N T A I N S;
 
 K_GRANT:       G R A N T;
 K_ALL:         A L L;
@@ -1152,9 +1308,11 @@
 K_LIST:        L I S T;
 K_NAN:         N A N;
 K_INFINITY:    I N F I N I T Y;
+K_TUPLE:       T U P L E;
 
 K_TRIGGER:     T R I G G E R;
 K_STATIC:      S T A T I C;
+K_FROZEN:      F R O Z E N;
 
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
diff --git a/src/java/org/apache/cassandra/cql3/ErrorCollector.java b/src/java/org/apache/cassandra/cql3/ErrorCollector.java
new file mode 100644
index 0000000..cd628b8
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/ErrorCollector.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.LinkedList;
+
+import org.antlr.runtime.BaseRecognizer;
+import org.antlr.runtime.Parser;
+import org.antlr.runtime.RecognitionException;
+import org.antlr.runtime.Token;
+import org.antlr.runtime.TokenStream;
+import org.apache.cassandra.exceptions.SyntaxException;
+
+/**
+ * <code>ErrorListener</code> that collects and enhances the errors sent by the CQL lexer and parser.
+ */
+public final class ErrorCollector implements ErrorListener
+{
+    /**
+     * The offset of the first token of the snippet.
+     */
+    private static final int FIRST_TOKEN_OFFSET = 10;
+
+    /**
+     * The offset of the last token of the snippet.
+     */
+    private static final int LAST_TOKEN_OFFSET = 2;
+
+    /**
+     * The CQL query.
+     */
+    private final String query;
+
+    /**
+     * The error messages.
+     */
+    private final LinkedList<String> errorMsgs = new LinkedList<>();
+
+    /**
+     * Creates a new <code>ErrorCollector</code> instance to collect the syntax errors associated with the specified CQL
+     * query.
+     *
+     * @param query the CQL query that will be parsed
+     */
+    public ErrorCollector(String query)
+    {
+        this.query = query;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void syntaxError(BaseRecognizer recognizer, String[] tokenNames, RecognitionException e)
+    {
+        String hdr = recognizer.getErrorHeader(e);
+        String msg = recognizer.getErrorMessage(e, tokenNames);
+
+        StringBuilder builder = new StringBuilder().append(hdr)
+                .append(' ')
+                .append(msg);
+
+        if (recognizer instanceof Parser)
+            appendQuerySnippet((Parser) recognizer, builder);
+
+        errorMsgs.add(builder.toString());
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void syntaxError(BaseRecognizer recognizer, String errorMsg)
+    {
+        errorMsgs.add(errorMsg);
+    }
+
+    /**
+     * Throws the last syntax error found by the lexer or the parser if it exists.
+     *
+     * @throws SyntaxException the syntax error.
+     */
+    public void throwLastSyntaxError() throws SyntaxException
+    {
+        if (!errorMsgs.isEmpty())
+            throw new SyntaxException(errorMsgs.getLast());
+    }
+
+    /**
+     * Appends a query snippet to the message to help the user to understand the problem.
+     *
+     * @param parser the parser used to parse the query
+     * @param builder the <code>StringBuilder</code> used to build the error message
+     */
+    private void appendQuerySnippet(Parser parser, StringBuilder builder)
+    {
+        TokenStream tokenStream = parser.getTokenStream();
+        int index = tokenStream.index();
+        int size = tokenStream.size();
+
+        Token from = tokenStream.get(getSnippetFirstTokenIndex(index));
+        Token to = tokenStream.get(getSnippetLastTokenIndex(index, size));
+        Token offending = tokenStream.get(getOffendingTokenIndex(index, size));
+
+        appendSnippet(builder, from, to, offending);
+    }
+
+    /**
+     * Appends a query snippet to the message to help the user to understand the problem.
+     *
+     * @param builder the <code>StringBuilder</code> used to build the error message
+     * @param from the first token to include within the snippet
+     * @param to the last token to include within the snippet
+     * @param offending the token which is responsible for the error
+     */
+    final void appendSnippet(StringBuilder builder,
+                             Token from,
+                             Token to,
+                             Token offending)
+    {
+        String[] lines = query.split("\n");
+
+        boolean includeQueryStart = (from.getLine() == 1) && (from.getCharPositionInLine() == 0);
+        boolean includeQueryEnd = (to.getLine() == lines.length)
+                && (getLastCharPositionInLine(to) == lines[lines.length - 1].length());
+
+        builder.append(" (");
+
+        if (!includeQueryStart)
+            builder.append("...");
+
+        lines[lineIndex(to)] = lines[lineIndex(to)].substring(0, getLastCharPositionInLine(to));
+        lines[lineIndex(offending)] = highlightToken(lines[lineIndex(offending)], offending);
+        lines[lineIndex(from)] = lines[lineIndex(from)].substring(from.getCharPositionInLine());
+
+        for (int i = lineIndex(from), m = lineIndex(to); i <= m; i++)
+            builder.append(lines[i]);
+
+        if (!includeQueryEnd)
+            builder.append("...");
+
+        builder.append(")");
+    }
+
+    /**
+     * Returns the index of the offending token. <p>In the case where the offending token is an extra
+     * character at the end, the index returned by the <code>TokenStream</code> might be after the last token.
+     * To avoid that problem we need to make sure that the index of the offending token is a valid index
+     * (one for which a token exists).</p>
+     *
+     * @param index the token index returned by the <code>TokenStream</code>
+     * @param size the <code>TokenStream</code> size
+     * @return the valid index of the offending token
+     */
+    private static int getOffendingTokenIndex(int index, int size)
+    {
+        return Math.min(index, size - 1);
+    }
+
+    /**
+     * Puts the specified token within square brackets.
+     *
+     * @param line the line containing the token
+     * @param token the token to put within square brackets
+     */
+    private static String highlightToken(String line, Token token)
+    {
+        String newLine = insertChar(line, getLastCharPositionInLine(token), ']');
+        return insertChar(newLine, token.getCharPositionInLine(), '[');
+    }
+
+    /**
+     * Returns the index of the last character relative to the beginning of the line 0..n-1
+     *
+     * @param token the token
+     * @return the index of the last character relative to the beginning of the line 0..n-1
+     */
+    private static int getLastCharPositionInLine(Token token)
+    {
+        return token.getCharPositionInLine() + getLength(token);
+    }
+
+    /**
+     * Return the token length.
+     *
+     * @param token the token
+     * @return the token length
+     */
+    private static int getLength(Token token)
+    {
+        return token.getText().length();
+    }
+
+    /**
+     * Inserts a character at a given position within a <code>String</code>.
+     *
+     * @param s the <code>String</code> in which the character must be inserted
+     * @param index the position where the character must be inserted
+     * @param c the character to insert
+     * @return the modified <code>String</code>
+     */
+    private static String insertChar(String s, int index, char c)
+    {
+        return new StringBuilder().append(s.substring(0, index))
+                .append(c)
+                .append(s.substring(index))
+                .toString();
+    }
+
+    /**
+     * Returns the index of the line number on which this token was matched; index=0..n-1
+     *
+     * @param token the token
+     * @return the index of the line number on which this token was matched; index=0..n-1
+     */
+    private static int lineIndex(Token token)
+    {
+        return token.getLine() - 1;
+    }
+
+    /**
+     * Returns the index of the last token which is part of the snippet.
+     *
+     * @param index the index of the token causing the error
+     * @param size the total number of tokens
+     * @return the index of the last token which is part of the snippet.
+     */
+    private static int getSnippetLastTokenIndex(int index, int size)
+    {
+        return Math.min(size - 1, index + LAST_TOKEN_OFFSET);
+    }
+
+    /**
+     * Returns the index of the first token which is part of the snippet.
+     *
+     * @param index the index of the token causing the error
+     * @return the index of the first token which is part of the snippet.
+     */
+    private static int getSnippetFirstTokenIndex(int index)
+    {
+        return Math.max(0, index - FIRST_TOKEN_OFFSET);
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/ErrorListener.java b/src/java/org/apache/cassandra/cql3/ErrorListener.java
new file mode 100644
index 0000000..0bf891a
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/ErrorListener.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.antlr.runtime.BaseRecognizer;
+import org.antlr.runtime.RecognitionException;
+
+/**
+ * Listener used to collect the syntax errors emitted by the Lexer and Parser.
+ */
+public interface ErrorListener
+{
+    /**
+     * Invoked when a syntax error occurs.
+     *
+     * @param recognizer the parser or lexer that emitted the error
+     * @param tokenNames the token names
+     * @param e the exception
+     */
+    void syntaxError(BaseRecognizer recognizer, String[] tokenNames, RecognitionException e);
+
+    /**
+     * Invoked when a syntax error with a specified message occurs.
+     *
+     * @param recognizer the parser or lexer that emitted the error
+     * @param errorMsg the error message
+     */
+    void syntaxError(BaseRecognizer recognizer, String errorMsg);
+}
diff --git a/src/java/org/apache/cassandra/cql3/IndexName.java b/src/java/org/apache/cassandra/cql3/IndexName.java
new file mode 100644
index 0000000..ded86e4
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/IndexName.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.Locale;
+
+public class IndexName 
+{
+    private String ksName;
+    private String idxName;
+
+    public void setKeyspace(String ks, boolean keepCase)
+    {
+        ksName = keepCase ? ks : ks.toLowerCase(Locale.US);
+    }
+
+    public void setIndex(String idx, boolean keepCase)
+    {
+        idxName = keepCase ? idx : idx.toLowerCase(Locale.US);
+    }
+
+    public boolean hasKeyspace()
+    {
+        return ksName != null;
+    }
+
+    public String getKeyspace()
+    {
+        return ksName;
+    }
+
+    public String getIdx()
+    {
+        return idxName;
+    }
+
+    public CFName getCfName()
+    {
+        CFName cfName = new CFName();
+        if (hasKeyspace())
+            cfName.setKeyspace(ksName, true);
+        return cfName;
+    }
+
+    @Override
+    public String toString()
+    {
+        return (hasKeyspace() ? (ksName + ".") : "") + idxName;
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/Lists.java b/src/java/org/apache/cassandra/cql3/Lists.java
index d483dd5..9d22364 100644
--- a/src/java/org/apache/cassandra/cql3/Lists.java
+++ b/src/java/org/apache/cassandra/cql3/Lists.java
@@ -20,26 +20,33 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicReference;
 
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.ListType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.UUIDGen;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Static helper methods and classes for lists.
  */
 public abstract class Lists
 {
+    private static final Logger logger = LoggerFactory.getLogger(Lists.class);
+
     private Lists() {}
 
     public static ColumnSpecification indexSpecOf(ColumnSpecification column)
@@ -61,19 +68,19 @@
             this.elements = elements;
         }
 
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            validateAssignableTo(receiver);
+            validateAssignableTo(keyspace, receiver);
 
             ColumnSpecification valueSpec = Lists.valueSpecOf(receiver);
             List<Term> values = new ArrayList<Term>(elements.size());
             boolean allTerminal = true;
             for (Term.Raw rt : elements)
             {
-                Term t = rt.prepare(valueSpec);
+                Term t = rt.prepare(keyspace, valueSpec);
 
                 if (t.containsBindMarker())
-                    throw new InvalidRequestException(String.format("Invalid list literal for %s: bind variables are not supported inside collection literals", receiver));
+                    throw new InvalidRequestException(String.format("Invalid list literal for %s: bind variables are not supported inside collection literals", receiver.name));
 
                 if (t instanceof Term.NonTerminal)
                     allTerminal = false;
@@ -81,27 +88,27 @@
                 values.add(t);
             }
             DelayedValue value = new DelayedValue(values);
-            return allTerminal ? value.bind(Collections.<ByteBuffer>emptyList()) : value;
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
         }
 
-        private void validateAssignableTo(ColumnSpecification receiver) throws InvalidRequestException
+        private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof ListType))
-                throw new InvalidRequestException(String.format("Invalid list literal for %s of type %s", receiver, receiver.type.asCQL3Type()));
+                throw new InvalidRequestException(String.format("Invalid list literal for %s of type %s", receiver.name, receiver.type.asCQL3Type()));
 
             ColumnSpecification valueSpec = Lists.valueSpecOf(receiver);
             for (Term.Raw rt : elements)
             {
-                if (!rt.isAssignableTo(valueSpec))
-                    throw new InvalidRequestException(String.format("Invalid list literal for %s: value %s is not of type %s", receiver, rt, valueSpec.type.asCQL3Type()));
+                if (!rt.isAssignableTo(keyspace, valueSpec))
+                    throw new InvalidRequestException(String.format("Invalid list literal for %s: value %s is not of type %s", receiver.name, rt, valueSpec.type.asCQL3Type()));
             }
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             try
             {
-                validateAssignableTo(receiver);
+                validateAssignableTo(keyspace, receiver);
                 return true;
             }
             catch (InvalidRequestException e)
@@ -126,16 +133,17 @@
             this.elements = elements;
         }
 
-        public static Value fromSerialized(ByteBuffer value, ListType type) throws InvalidRequestException
+        public static Value fromSerialized(ByteBuffer value, ListType type, int version) throws InvalidRequestException
         {
             try
             {
                 // Collections have this small hack that validate cannot be called on a serialized object,
                 // but compose does the validation (so we're fine).
-                List<?> l = (List<?>)type.compose(value);
+                List<?> l = (List<?>)type.getSerializer().deserializeForNativeProtocol(value, version);
                 List<ByteBuffer> elements = new ArrayList<ByteBuffer>(l.size());
                 for (Object element : l)
-                    elements.add(type.elements.decompose(element));
+                    // elements can be null in lists that represent a set of IN values
+                    elements.add(element == null ? null : type.elements.decompose(element));
                 return new Value(elements);
             }
             catch (MarshalException e)
@@ -144,9 +152,21 @@
             }
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
-            return CollectionType.pack(elements, elements.size());
+            return CollectionSerializer.pack(elements, elements.size(), options.getProtocolVersion());
+        }
+
+        public boolean equals(ListType lt, Value v)
+        {
+            if (elements.size() != v.elements.size())
+                return false;
+
+            for (int i = 0; i < elements.size(); i++)
+                if (lt.elements.compare(elements.get(i), v.elements.get(i)) != 0)
+                    return false;
+
+            return true;
         }
 
         public List<ByteBuffer> getElements()
@@ -183,12 +203,12 @@
         {
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
             List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(elements.size());
             for (Term t : elements)
             {
-                ByteBuffer bytes = t.bindAndGet(values);
+                ByteBuffer bytes = t.bindAndGet(options);
 
                 if (bytes == null)
                     throw new InvalidRequestException("null is not supported inside collections");
@@ -216,10 +236,10 @@
             assert receiver.type instanceof ListType;
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer value = values.get(bindIndex);
-            return value == null ? null : Value.fromSerialized(value, (ListType)receiver.type);
+            ByteBuffer value = options.getValues().get(bindIndex);
+            return value == null ? null : Value.fromSerialized(value, (ListType)receiver.type, options.getProtocolVersion());
         }
     }
 
@@ -265,17 +285,17 @@
 
     public static class Setter extends Operation
     {
-        public Setter(ColumnIdentifier column, Term t)
+        public Setter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
             // delete + append
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
-            cf.addAtom(params.makeTombstoneForOverwrite(column.build(), column.buildAsEndOfRange()));
-            Appender.doAppend(t, cf, column, params);
+            CellName name = cf.getComparator().create(prefix, column);
+            cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
+            Appender.doAppend(t, cf, prefix, column, params);
         }
     }
 
@@ -283,7 +303,7 @@
     {
         private final Term idx;
 
-        public SetterByIndex(ColumnIdentifier column, Term idx, Term t)
+        public SetterByIndex(ColumnDefinition column, Term idx, Term t)
         {
             super(column, t);
             this.idx = idx;
@@ -302,22 +322,20 @@
             idx.collectMarkerSpecification(boundNames);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            ByteBuffer index = idx.bindAndGet(params.variables);
-            ByteBuffer value = t.bindAndGet(params.variables);
+            ByteBuffer index = idx.bindAndGet(params.options);
+            ByteBuffer value = t.bindAndGet(params.options);
 
             if (index == null)
                 throw new InvalidRequestException("Invalid null value for list index");
 
-            List<Pair<ByteBuffer, Column>> existingList = params.getPrefetchedList(rowKey, columnName.key);
+            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name);
             int idx = ByteBufferUtil.toInt(index);
             if (idx < 0 || idx >= existingList.size())
                 throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingList.size()));
 
-            ByteBuffer elementName = existingList.get(idx).right.name();
-            // Since we reuse the name we're read, if it's a static column, the static marker will already be set
-
+            CellName elementName = existingList.get(idx).name();
             if (value == null)
             {
                 cf.addColumn(params.makeTombstone(elementName));
@@ -337,19 +355,19 @@
 
     public static class Appender extends Operation
     {
-        public Appender(ColumnIdentifier column, Term t)
+        public Appender(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            doAppend(t, cf, maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key), params);
+            doAppend(t, cf, prefix, column, params);
         }
 
-        static void doAppend(Term t, ColumnFamily cf, ColumnNameBuilder columnName, UpdateParameters params) throws InvalidRequestException
+        static void doAppend(Term t, ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal value = t.bind(params.variables);
+            Term.Terminal value = t.bind(params.options);
             // If we append null, do nothing. Note that for Setter, we've
             // already removed the previous value so we're good here too
             if (value == null)
@@ -359,24 +377,22 @@
             List<ByteBuffer> toAdd = ((Lists.Value)value).elements;
             for (int i = 0; i < toAdd.size(); i++)
             {
-                ColumnNameBuilder b = i == toAdd.size() - 1 ? columnName : columnName.copy();
                 ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes());
-                ByteBuffer cellName = b.add(uuid).build();
-                cf.addColumn(params.makeColumn(cellName, toAdd.get(i)));
+                cf.addColumn(params.makeColumn(cf.getComparator().create(prefix, column, uuid), toAdd.get(i)));
             }
         }
     }
 
     public static class Prepender extends Operation
     {
-        public Prepender(ColumnIdentifier column, Term t)
+        public Prepender(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal value = t.bind(params.variables);
+            Term.Terminal value = t.bind(params.options);
             if (value == null)
                 return;
 
@@ -384,21 +400,18 @@
             long time = PrecisionTime.REFERENCE_TIME - (System.currentTimeMillis() - PrecisionTime.REFERENCE_TIME);
 
             List<ByteBuffer> toAdd = ((Lists.Value)value).elements;
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
             for (int i = 0; i < toAdd.size(); i++)
             {
-                ColumnNameBuilder b = i == toAdd.size() - 1 ? column : column.copy();
                 PrecisionTime pt = PrecisionTime.getNext(time);
                 ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes(pt.millis, pt.nanos));
-                ByteBuffer cellName = b.add(uuid).build();
-                cf.addColumn(params.makeColumn(cellName, toAdd.get(i)));
+                cf.addColumn(params.makeColumn(cf.getComparator().create(prefix, column, uuid), toAdd.get(i)));
             }
         }
     }
 
     public static class Discarder extends Operation
     {
-        public Discarder(ColumnIdentifier column, Term t)
+        public Discarder(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
@@ -409,13 +422,15 @@
             return true;
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            List<Pair<ByteBuffer, Column>> existingList = params.getPrefetchedList(rowKey, columnName.key);
+            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name);
+            // We want to call bind before possibly returning to reject queries where the value provided is not a list.
+            Term.Terminal value = t.bind(params.options);
+
             if (existingList.isEmpty())
                 return;
 
-            Term.Terminal value = t.bind(params.variables);
             if (value == null)
                 return;
 
@@ -426,18 +441,17 @@
             // the read-before-write this operation requires limits its usefulness on big lists, so in practice
             // toDiscard will be small and keeping a list will be more efficient.
             List<ByteBuffer> toDiscard = ((Lists.Value)value).elements;
-            for (Pair<ByteBuffer, Column> p : existingList)
+            for (Cell cell : existingList)
             {
-                Column element = p.right;
-                if (toDiscard.contains(element.value()))
-                    cf.addColumn(params.makeTombstone(element.name()));
+                if (toDiscard.contains(cell.value()))
+                    cf.addColumn(params.makeTombstone(cell.name()));
             }
         }
     }
 
     public static class DiscarderByIndex extends Operation
     {
-        public DiscarderByIndex(ColumnIdentifier column, Term idx)
+        public DiscarderByIndex(ColumnDefinition column, Term idx)
         {
             super(column, idx);
         }
@@ -448,20 +462,20 @@
             return true;
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal index = t.bind(params.variables);
+            Term.Terminal index = t.bind(params.options);
             if (index == null)
                 throw new InvalidRequestException("Invalid null value for list index");
 
             assert index instanceof Constants.Value;
 
-            List<Pair<ByteBuffer, Column>> existingList = params.getPrefetchedList(rowKey, columnName.key);
+            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name);
             int idx = ByteBufferUtil.toInt(((Constants.Value)index).bytes);
             if (idx < 0 || idx >= existingList.size())
                 throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingList.size()));
 
-            ByteBuffer elementName = existingList.get(idx).right.name();
+            CellName elementName = existingList.get(idx).name();
             cf.addColumn(params.makeTombstone(elementName));
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/Maps.java b/src/java/org/apache/cassandra/cql3/Maps.java
index c332999..ce0ba2a 100644
--- a/src/java/org/apache/cassandra/cql3/Maps.java
+++ b/src/java/org/apache/cassandra/cql3/Maps.java
@@ -22,15 +22,19 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
@@ -61,9 +65,9 @@
             this.entries = entries;
         }
 
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            validateAssignableTo(receiver);
+            validateAssignableTo(keyspace, receiver);
 
             ColumnSpecification keySpec = Maps.keySpecOf(receiver);
             ColumnSpecification valueSpec = Maps.valueSpecOf(receiver);
@@ -71,11 +75,11 @@
             boolean allTerminal = true;
             for (Pair<Term.Raw, Term.Raw> entry : entries)
             {
-                Term k = entry.left.prepare(keySpec);
-                Term v = entry.right.prepare(valueSpec);
+                Term k = entry.left.prepare(keyspace, keySpec);
+                Term v = entry.right.prepare(keyspace, valueSpec);
 
                 if (k.containsBindMarker() || v.containsBindMarker())
-                    throw new InvalidRequestException(String.format("Invalid map literal for %s: bind variables are not supported inside collection literals", receiver));
+                    throw new InvalidRequestException(String.format("Invalid map literal for %s: bind variables are not supported inside collection literals", receiver.name));
 
                 if (k instanceof Term.NonTerminal || v instanceof Term.NonTerminal)
                     allTerminal = false;
@@ -83,30 +87,30 @@
                 values.put(k, v);
             }
             DelayedValue value = new DelayedValue(((MapType)receiver.type).keys, values);
-            return allTerminal ? value.bind(Collections.<ByteBuffer>emptyList()) : value;
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
         }
 
-        private void validateAssignableTo(ColumnSpecification receiver) throws InvalidRequestException
+        private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof MapType))
-                throw new InvalidRequestException(String.format("Invalid map literal for %s of type %s", receiver, receiver.type.asCQL3Type()));
+                throw new InvalidRequestException(String.format("Invalid map literal for %s of type %s", receiver.name, receiver.type.asCQL3Type()));
 
             ColumnSpecification keySpec = Maps.keySpecOf(receiver);
             ColumnSpecification valueSpec = Maps.valueSpecOf(receiver);
             for (Pair<Term.Raw, Term.Raw> entry : entries)
             {
-                if (!entry.left.isAssignableTo(keySpec))
-                    throw new InvalidRequestException(String.format("Invalid map literal for %s: key %s is not of type %s", receiver, entry.left, keySpec.type.asCQL3Type()));
-                if (!entry.right.isAssignableTo(valueSpec))
-                    throw new InvalidRequestException(String.format("Invalid map literal for %s: value %s is not of type %s", receiver, entry.right, valueSpec.type.asCQL3Type()));
+                if (!entry.left.isAssignableTo(keyspace, keySpec))
+                    throw new InvalidRequestException(String.format("Invalid map literal for %s: key %s is not of type %s", receiver.name, entry.left, keySpec.type.asCQL3Type()));
+                if (!entry.right.isAssignableTo(keyspace, valueSpec))
+                    throw new InvalidRequestException(String.format("Invalid map literal for %s: value %s is not of type %s", receiver.name, entry.right, valueSpec.type.asCQL3Type()));
             }
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             try
             {
-                validateAssignableTo(receiver);
+                validateAssignableTo(keyspace, receiver);
                 return true;
             }
             catch (InvalidRequestException e)
@@ -139,13 +143,13 @@
             this.map = map;
         }
 
-        public static Value fromSerialized(ByteBuffer value, MapType type) throws InvalidRequestException
+        public static Value fromSerialized(ByteBuffer value, MapType type, int version) throws InvalidRequestException
         {
             try
             {
                 // Collections have this small hack that validate cannot be called on a serialized object,
                 // but compose does the validation (so we're fine).
-                Map<?, ?> m = (Map<?, ?>)type.compose(value);
+                Map<?, ?> m = (Map<?, ?>)type.getSerializer().deserializeForNativeProtocol(value, version);
                 Map<ByteBuffer, ByteBuffer> map = new LinkedHashMap<ByteBuffer, ByteBuffer>(m.size());
                 for (Map.Entry<?, ?> entry : m.entrySet())
                     map.put(type.keys.decompose(entry.getKey()), type.values.decompose(entry.getValue()));
@@ -157,7 +161,7 @@
             }
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
             List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(2 * map.size());
             for (Map.Entry<ByteBuffer, ByteBuffer> entry : map.entrySet())
@@ -165,7 +169,26 @@
                 buffers.add(entry.getKey());
                 buffers.add(entry.getValue());
             }
-            return CollectionType.pack(buffers, map.size());
+            return CollectionSerializer.pack(buffers, map.size(), options.getProtocolVersion());
+        }
+
+        public boolean equals(MapType mt, Value v)
+        {
+            if (map.size() != v.map.size())
+                return false;
+
+            // We use the fact that both maps iterate in comparator order
+            Iterator<Map.Entry<ByteBuffer, ByteBuffer>> thisIter = map.entrySet().iterator();
+            Iterator<Map.Entry<ByteBuffer, ByteBuffer>> thatIter = v.map.entrySet().iterator();
+            while (thisIter.hasNext())
+            {
+                Map.Entry<ByteBuffer, ByteBuffer> thisEntry = thisIter.next();
+                Map.Entry<ByteBuffer, ByteBuffer> thatEntry = thatIter.next();
+                if (mt.keys.compare(thisEntry.getKey(), thatEntry.getKey()) != 0 || mt.values.compare(thisEntry.getValue(), thatEntry.getValue()) != 0)
+                    return false;
+            }
+
+            return true;
         }
     }
 
@@ -191,13 +214,13 @@
         {
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
             Map<ByteBuffer, ByteBuffer> buffers = new TreeMap<ByteBuffer, ByteBuffer>(comparator);
             for (Map.Entry<Term, Term> entry : elements.entrySet())
             {
                 // We don't support values > 64K because the serialization format encode the length as an unsigned short.
-                ByteBuffer keyBytes = entry.getKey().bindAndGet(values);
+                ByteBuffer keyBytes = entry.getKey().bindAndGet(options);
                 if (keyBytes == null)
                     throw new InvalidRequestException("null is not supported inside collections");
                 if (keyBytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
@@ -205,7 +228,7 @@
                                                                     FBUtilities.MAX_UNSIGNED_SHORT,
                                                                     keyBytes.remaining()));
 
-                ByteBuffer valueBytes = entry.getValue().bindAndGet(values);
+                ByteBuffer valueBytes = entry.getValue().bindAndGet(options);
                 if (valueBytes == null)
                     throw new InvalidRequestException("null is not supported inside collections");
                 if (valueBytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
@@ -227,26 +250,26 @@
             assert receiver.type instanceof MapType;
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer value = values.get(bindIndex);
-            return value == null ? null : Value.fromSerialized(value, (MapType)receiver.type);
+            ByteBuffer value = options.getValues().get(bindIndex);
+            return value == null ? null : Value.fromSerialized(value, (MapType)receiver.type, options.getProtocolVersion());
         }
     }
 
     public static class Setter extends Operation
     {
-        public Setter(ColumnIdentifier column, Term t)
+        public Setter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
             // delete + put
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
-            cf.addAtom(params.makeTombstoneForOverwrite(column.build(), column.buildAsEndOfRange()));
-            Putter.doPut(t, cf, column, params);
+            CellName name = cf.getComparator().create(prefix, column);
+            cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
+            Putter.doPut(t, cf, prefix, column, params);
         }
     }
 
@@ -254,7 +277,7 @@
     {
         private final Term k;
 
-        public SetterByKey(ColumnIdentifier column, Term k, Term t)
+        public SetterByKey(ColumnDefinition column, Term k, Term t)
         {
             super(column, t);
             this.k = k;
@@ -267,14 +290,14 @@
             k.collectMarkerSpecification(boundNames);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            ByteBuffer key = k.bindAndGet(params.variables);
-            ByteBuffer value = t.bindAndGet(params.variables);
+            ByteBuffer key = k.bindAndGet(params.options);
+            ByteBuffer value = t.bindAndGet(params.options);
             if (key == null)
                 throw new InvalidRequestException("Invalid null map key");
 
-            ByteBuffer cellName = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key).add(key).build();
+            CellName cellName = cf.getComparator().create(prefix, column, key);
 
             if (value == null)
             {
@@ -295,19 +318,19 @@
 
     public static class Putter extends Operation
     {
-        public Putter(ColumnIdentifier column, Term t)
+        public Putter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            doPut(t, cf, maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key), params);
+            doPut(t, cf, prefix, column, params);
         }
 
-        static void doPut(Term t, ColumnFamily cf, ColumnNameBuilder columnName, UpdateParameters params) throws InvalidRequestException
+        static void doPut(Term t, ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal value = t.bind(params.variables);
+            Term.Terminal value = t.bind(params.options);
             if (value == null)
                 return;
             assert value instanceof Maps.Value;
@@ -315,7 +338,7 @@
             Map<ByteBuffer, ByteBuffer> toAdd = ((Maps.Value)value).map;
             for (Map.Entry<ByteBuffer, ByteBuffer> entry : toAdd.entrySet())
             {
-                ByteBuffer cellName = columnName.copy().add(entry.getKey()).build();
+                CellName cellName = cf.getComparator().create(prefix, column, entry.getKey());
                 cf.addColumn(params.makeColumn(cellName, entry.getValue()));
             }
         }
@@ -323,19 +346,19 @@
 
     public static class DiscarderByKey extends Operation
     {
-        public DiscarderByKey(ColumnIdentifier column, Term k)
+        public DiscarderByKey(ColumnDefinition column, Term k)
         {
             super(column, k);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal key = t.bind(params.variables);
+            Term.Terminal key = t.bind(params.options);
             if (key == null)
                 throw new InvalidRequestException("Invalid null map key");
             assert key instanceof Constants.Value;
 
-            ByteBuffer cellName = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key).add(((Constants.Value)key).bytes).build();
+            CellName cellName = cf.getComparator().create(prefix, column, ((Constants.Value)key).bytes);
             cf.addColumn(params.makeTombstone(cellName));
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/Operation.java b/src/java/org/apache/cassandra/cql3/Operation.java
index 6bf46b5..ebcb30b 100644
--- a/src/java/org/apache/cassandra/cql3/Operation.java
+++ b/src/java/org/apache/cassandra/cql3/Operation.java
@@ -19,12 +19,10 @@
 
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
@@ -43,36 +41,20 @@
  */
 public abstract class Operation
 {
-    // Name of the column the operation applies to
-    public final ColumnIdentifier columnName;
+    // the column the operation applies to
+    public final ColumnDefinition column;
 
     // Term involved in the operation. In theory this should not be here since some operation
     // may require none or more than one term, but most need one, so it simplifies things a bit.
     protected final Term t;
 
-    protected Operation(ColumnIdentifier columnName, Term t)
+    protected Operation(ColumnDefinition column, Term t)
     {
-        this.columnName = columnName;
+        assert column != null;
+        this.column = column;
         this.t = t;
     }
 
-    // Whether the colum operated on is a static column (on trunk, Operation stores the ColumnDefinition directly,
-    // not just the column name, so we'll be able to remove that lookup and check ColumnDefinition.isStatic field
-    // directly. But for 2.0, it's simpler that way).
-    public boolean isStatic(CFMetaData cfm)
-    {
-        if (columnName == null)
-            return false;
-
-        ColumnDefinition def = cfm.getColumnDefinition(columnName.key);
-        return def != null && def.type == ColumnDefinition.Type.STATIC;
-    }
-
-    protected ColumnNameBuilder maybeUpdatePrefix(CFMetaData cfm, ColumnNameBuilder prefix)
-    {
-        return isStatic(cfm) ? cfm.getStaticColumnNameBuilder() : prefix;
-    }
-
     /**
      * @return whether the operation requires a read of the previous value to be executed
      * (only lists setterByIdx, discard and discardByIdx requires that).
@@ -99,11 +81,10 @@
      *
      * @param rowKey row key for the update.
      * @param cf the column family to which to add the updates generated by this operation.
-     * @param namePrefix the prefix that identify the CQL3 row this operation applies to (callers should not reuse
-     * the ColumnNameBuilder they pass here).
+     * @param prefix the prefix that identifies the CQL3 row this operation applies to.
      * @param params parameters of the update.
      */
-    public abstract void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder namePrefix, UpdateParameters params) throws InvalidRequestException;
+    public abstract void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException;
 
     /**
      * A parsed raw UPDATE operation.
@@ -111,7 +92,7 @@
      * This can be one of:
      *   - Setting a value: c = v
      *   - Setting an element of a collection: c[x] = v
-     *   - An addition/substraction to a variable: c = c +/- v (where v can be a collection literal)
+     *   - An addition/subtraction to a variable: c = c +/- v (where v can be a collection literal)
     *   - A prepend operation: c = v + c
      */
     public interface RawUpdate
@@ -128,7 +109,7 @@
          * be a true column.
          * @return the prepared update operation.
          */
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException;
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException;
 
         /**
          * @return whether this operation can be applied alongside the {@code
@@ -162,7 +143,7 @@
          * @param receiver the "column" this operation applies to.
          * @return the prepared delete operation.
          */
-        public Operation prepare(ColumnSpecification receiver) throws InvalidRequestException;
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException;
     }
 
     public static class SetValue implements RawUpdate
@@ -174,24 +155,24 @@
             this.value = value;
         }
 
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
-            Term v = value.prepare(receiver);
+            Term v = value.prepare(keyspace, receiver);
 
             if (receiver.type instanceof CounterColumnType)
-                throw new InvalidRequestException(String.format("Cannot set the value of counter column %s (counters can only be incremented/decremented, not set)", receiver));
+                throw new InvalidRequestException(String.format("Cannot set the value of counter column %s (counters can only be incremented/decremented, not set)", receiver.name));
 
             if (!(receiver.type instanceof CollectionType))
-                return new Constants.Setter(receiver.kind == CFDefinition.Name.Kind.VALUE_ALIAS ? null : receiver.name, v);
+                return new Constants.Setter(receiver, v);
 
             switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    return new Lists.Setter(receiver.name, v);
+                    return new Lists.Setter(receiver, v);
                 case SET:
-                    return new Sets.Setter(receiver.name, v);
+                    return new Sets.Setter(receiver, v);
                 case MAP:
-                    return new Maps.Setter(receiver.name, v);
+                    return new Maps.Setter(receiver, v);
             }
             throw new AssertionError();
         }
@@ -220,30 +201,30 @@
             this.value = value;
         }
 
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof CollectionType))
-                throw new InvalidRequestException(String.format("Invalid operation (%s) for non collection column %s", toString(receiver), receiver));
+                throw new InvalidRequestException(String.format("Invalid operation (%s) for non collection column %s", toString(receiver), receiver.name));
 
             switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    Term idx = selector.prepare(Lists.indexSpecOf(receiver));
-                    Term lval = value.prepare(Lists.valueSpecOf(receiver));
-                    return new Lists.SetterByIndex(receiver.name, idx, lval);
+                    Term idx = selector.prepare(keyspace, Lists.indexSpecOf(receiver));
+                    Term lval = value.prepare(keyspace, Lists.valueSpecOf(receiver));
+                    return new Lists.SetterByIndex(receiver, idx, lval);
                 case SET:
-                    throw new InvalidRequestException(String.format("Invalid operation (%s) for set column %s", toString(receiver), receiver));
+                    throw new InvalidRequestException(String.format("Invalid operation (%s) for set column %s", toString(receiver), receiver.name));
                 case MAP:
-                    Term key = selector.prepare(Maps.keySpecOf(receiver));
-                    Term mval = value.prepare(Maps.valueSpecOf(receiver));
-                    return new Maps.SetterByKey(receiver.name, key, mval);
+                    Term key = selector.prepare(keyspace, Maps.keySpecOf(receiver));
+                    Term mval = value.prepare(keyspace, Maps.valueSpecOf(receiver));
+                    return new Maps.SetterByKey(receiver, key, mval);
             }
             throw new AssertionError();
         }
 
         protected String toString(ColumnSpecification column)
         {
-            return String.format("%s[%s] = %s", column, selector, value);
+            return String.format("%s[%s] = %s", column.name, selector, value);
         }
 
         public boolean isCompatibleWith(RawUpdate other)
@@ -263,32 +244,32 @@
             this.value = value;
         }
 
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
-            Term v = value.prepare(receiver);
+            Term v = value.prepare(keyspace, receiver);
 
             if (!(receiver.type instanceof CollectionType))
             {
                 if (!(receiver.type instanceof CounterColumnType))
-                    throw new InvalidRequestException(String.format("Invalid operation (%s) for non counter column %s", toString(receiver), receiver));
-                return new Constants.Adder(receiver.kind == CFDefinition.Name.Kind.VALUE_ALIAS ? null : receiver.name, v);
+                    throw new InvalidRequestException(String.format("Invalid operation (%s) for non counter column %s", toString(receiver), receiver.name));
+                return new Constants.Adder(receiver, v);
             }
 
             switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    return new Lists.Appender(receiver.name, v);
+                    return new Lists.Appender(receiver, v);
                 case SET:
-                    return new Sets.Adder(receiver.name, v);
+                    return new Sets.Adder(receiver, v);
                 case MAP:
-                    return new Maps.Putter(receiver.name, v);
+                    return new Maps.Putter(receiver, v);
             }
             throw new AssertionError();
         }
 
         protected String toString(ColumnSpecification column)
         {
-            return String.format("%s = %s + %s", column, column, value);
+            return String.format("%s = %s + %s", column.name, column.name, value);
         }
 
         public boolean isCompatibleWith(RawUpdate other)
@@ -306,32 +287,35 @@
             this.value = value;
         }
 
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
-            Term v = value.prepare(receiver);
-
             if (!(receiver.type instanceof CollectionType))
             {
                 if (!(receiver.type instanceof CounterColumnType))
-                    throw new InvalidRequestException(String.format("Invalid operation (%s) for non counter column %s", toString(receiver), receiver));
-                return new Constants.Substracter(receiver.kind == CFDefinition.Name.Kind.VALUE_ALIAS ? null : receiver.name, v);
+                    throw new InvalidRequestException(String.format("Invalid operation (%s) for non counter column %s", toString(receiver), receiver.name));
+                return new Constants.Substracter(receiver, value.prepare(keyspace, receiver));
             }
 
             switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    return new Lists.Discarder(receiver.name, v);
+                    return new Lists.Discarder(receiver, value.prepare(keyspace, receiver));
                 case SET:
-                    return new Sets.Discarder(receiver.name, v);
+                    return new Sets.Discarder(receiver, value.prepare(keyspace, receiver));
                 case MAP:
-                    throw new InvalidRequestException(String.format("Invalid operation (%s) for map column %s", toString(receiver), receiver));
+                    // The value for a map subtraction is actually a set
+                    ColumnSpecification vr = new ColumnSpecification(receiver.ksName,
+                                                                     receiver.cfName,
+                                                                     receiver.name,
+                                                                     SetType.getInstance(((MapType)receiver.type).keys));
+                    return new Sets.Discarder(receiver, value.prepare(keyspace, vr));
             }
             throw new AssertionError();
         }
 
         protected String toString(ColumnSpecification column)
         {
-            return String.format("%s = %s - %s", column, column, value);
+            return String.format("%s = %s - %s", column.name, column.name, value);
         }
 
         public boolean isCompatibleWith(RawUpdate other)
@@ -349,19 +333,19 @@
             this.value = value;
         }
 
-        public Operation prepare(CFDefinition.Name receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
-            Term v = value.prepare(receiver);
+            Term v = value.prepare(keyspace, receiver);
 
             if (!(receiver.type instanceof ListType))
-                throw new InvalidRequestException(String.format("Invalid operation (%s) for non list column %s", toString(receiver), receiver));
+                throw new InvalidRequestException(String.format("Invalid operation (%s) for non list column %s", toString(receiver), receiver.name));
 
-            return new Lists.Prepender(receiver.name, v);
+            return new Lists.Prepender(receiver, v);
         }
 
         protected String toString(ColumnSpecification column)
         {
-            return String.format("%s = %s - %s", column, value, column);
+            return String.format("%s = %s - %s", column.name, value, column.name);
         }
 
         public boolean isCompatibleWith(RawUpdate other)
@@ -384,10 +368,10 @@
             return id;
         }
 
-        public Operation prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
             // No validation, deleting a column is always "well typed"
-            return new Constants.Deleter(id, receiver.type instanceof CollectionType);
+            return new Constants.Deleter(receiver);
         }
     }
 
@@ -407,22 +391,22 @@
             return id;
         }
 
-        public Operation prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof CollectionType))
-                throw new InvalidRequestException(String.format("Invalid deletion operation for non collection column %s", receiver));
+                throw new InvalidRequestException(String.format("Invalid deletion operation for non collection column %s", receiver.name));
 
             switch (((CollectionType)receiver.type).kind)
             {
                 case LIST:
-                    Term idx = element.prepare(Lists.indexSpecOf(receiver));
-                    return new Lists.DiscarderByIndex(id, idx);
+                    Term idx = element.prepare(keyspace, Lists.indexSpecOf(receiver));
+                    return new Lists.DiscarderByIndex(receiver, idx);
                 case SET:
-                    Term elt = element.prepare(Sets.valueSpecOf(receiver));
-                    return new Sets.Discarder(id, elt);
+                    Term elt = element.prepare(keyspace, Sets.valueSpecOf(receiver));
+                    return new Sets.Discarder(receiver, elt);
                 case MAP:
-                    Term key = element.prepare(Maps.keySpecOf(receiver));
-                    return new Maps.DiscarderByKey(id, key);
+                    Term key = element.prepare(keyspace, Maps.keySpecOf(receiver));
+                    return new Maps.DiscarderByKey(receiver, key);
             }
             throw new AssertionError();
         }
diff --git a/src/java/org/apache/cassandra/cql3/QueryHandler.java b/src/java/org/apache/cassandra/cql3/QueryHandler.java
index 4d72333..d42d90e 100644
--- a/src/java/org/apache/cassandra/cql3/QueryHandler.java
+++ b/src/java/org/apache/cassandra/cql3/QueryHandler.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.cql3;
 
 import org.apache.cassandra.cql3.statements.BatchStatement;
+import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.service.QueryState;
@@ -28,8 +29,8 @@
 {
     public ResultMessage process(String query, QueryState state, QueryOptions options) throws RequestExecutionException, RequestValidationException;
     public ResultMessage.Prepared prepare(String query, QueryState state) throws RequestValidationException;
-    public CQLStatement getPrepared(MD5Digest id);
-    public CQLStatement getPreparedForThrift(Integer id);
+    public ParsedStatement.Prepared getPrepared(MD5Digest id);
+    public ParsedStatement.Prepared getPreparedForThrift(Integer id);
     public ResultMessage processPrepared(CQLStatement statement, QueryState state, QueryOptions options) throws RequestExecutionException, RequestValidationException;
     public ResultMessage processBatch(BatchStatement statement, QueryState state, BatchQueryOptions options) throws RequestExecutionException, RequestValidationException;
 }
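
With this change a QueryHandler hands back the full prepared unit (the statement together with its bound-variable names) rather than a bare CQLStatement. A minimal caller sketch under that assumption; the class and method names below are hypothetical, only the Cassandra types and accessors come from the code above:

import java.util.List;

import org.apache.cassandra.cql3.CQLStatement;
import org.apache.cassandra.cql3.ColumnSpecification;
import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.statements.ParsedStatement;
import org.apache.cassandra.utils.MD5Digest;

// Hypothetical helper: resolves a prepared statement id through the default handler and
// returns the executable statement, using the bound names only for a sanity check.
public final class PreparedLookupExample
{
    public static CQLStatement lookup(MD5Digest id)
    {
        ParsedStatement.Prepared prepared = QueryProcessor.instance.getPrepared(id);
        if (prepared == null)
            return null; // unknown or evicted statement id

        List<ColumnSpecification> boundNames = prepared.boundNames; // names/types of the bind markers
        assert prepared.statement.getBoundTerms() == boundNames.size();
        return prepared.statement;
    }
}
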
diff --git a/src/java/org/apache/cassandra/cql3/QueryOptions.java b/src/java/org/apache/cassandra/cql3/QueryOptions.java
index 0f3e11b..c946e8b 100644
--- a/src/java/org/apache/cassandra/cql3/QueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/QueryOptions.java
@@ -23,123 +23,238 @@
 import java.util.EnumSet;
 import java.util.List;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.transport.CBCodec;
 import org.apache.cassandra.transport.CBUtil;
+import org.apache.cassandra.transport.ProtocolException;
+import org.apache.cassandra.utils.Pair;
 
 /**
  * Options for a query.
  */
-public class QueryOptions
+public abstract class QueryOptions
 {
-    public static final QueryOptions DEFAULT = new QueryOptions(ConsistencyLevel.ONE, Collections.<ByteBuffer>emptyList());
+    public static final QueryOptions DEFAULT = new DefaultQueryOptions(ConsistencyLevel.ONE,
+                                                                       Collections.<ByteBuffer>emptyList(),
+                                                                       false,
+                                                                       SpecificOptions.DEFAULT,
+                                                                       3);
 
     public static final CBCodec<QueryOptions> codec = new Codec();
 
-    private final ConsistencyLevel consistency;
-    private final List<ByteBuffer> values;
-    private final boolean skipMetadata;
-
-    private final SpecificOptions options;
-
-    // The protocol version of incoming queries. This is set during deserializaion and will be 0
-    // if the QueryOptions does not come from a user message (or come from thrift).
-    private final transient int protocolVersion;
-
-    public QueryOptions(ConsistencyLevel consistency, List<ByteBuffer> values)
-    {
-        this(consistency, values, false, SpecificOptions.DEFAULT, 0);
-    }
-
-    public QueryOptions(ConsistencyLevel consistency,
-                        List<ByteBuffer> values,
-                        boolean skipMetadata,
-                        int pageSize,
-                        PagingState pagingState,
-                        ConsistencyLevel serialConsistency)
-    {
-        this(consistency, values, skipMetadata, new SpecificOptions(pageSize, pagingState, serialConsistency), 0);
-    }
-
-    private QueryOptions(ConsistencyLevel consistency, List<ByteBuffer> values, boolean skipMetadata, SpecificOptions options, int protocolVersion)
-    {
-        this.consistency = consistency;
-        this.values = values;
-        this.skipMetadata = skipMetadata;
-        this.options = options;
-        this.protocolVersion = protocolVersion;
-    }
-
     public static QueryOptions fromProtocolV1(ConsistencyLevel consistency, List<ByteBuffer> values)
     {
-        return new QueryOptions(consistency, values, false, SpecificOptions.DEFAULT, 1);
+        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, 1);
     }
 
-    public ConsistencyLevel getConsistency()
+    public static QueryOptions fromProtocolV2(ConsistencyLevel consistency, List<ByteBuffer> values)
     {
-        return consistency;
+        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, 2);
     }
 
-    public List<ByteBuffer> getValues()
+    public static QueryOptions forInternalCalls(ConsistencyLevel consistency, List<ByteBuffer> values)
     {
-        return values;
+        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, 3);
     }
 
-    public boolean skipMetadata()
+    public static QueryOptions forInternalCalls(List<ByteBuffer> values)
     {
-        return skipMetadata;
+        return new DefaultQueryOptions(ConsistencyLevel.ONE, values, false, SpecificOptions.DEFAULT, 3);
     }
 
-    /**
-     * The pageSize for this query. Will be <= 0 if not relevant for the query.
-     */
+    public static QueryOptions fromPreV3Batch(ConsistencyLevel consistency)
+    {
+        return new DefaultQueryOptions(consistency, Collections.<ByteBuffer>emptyList(), false, SpecificOptions.DEFAULT, 2);
+    }
+
+    public static QueryOptions create(ConsistencyLevel consistency, List<ByteBuffer> values, boolean skipMetadata, int pageSize, PagingState pagingState, ConsistencyLevel serialConsistency)
+    {
+        return new DefaultQueryOptions(consistency, values, skipMetadata, new SpecificOptions(pageSize, pagingState, serialConsistency, -1L), 0);
+    }
+
+    public abstract ConsistencyLevel getConsistency();
+    public abstract List<ByteBuffer> getValues();
+    public abstract boolean skipMetadata();
+
+    /**  The pageSize for this query. Will be <= 0 if not relevant for the query.  */
     public int getPageSize()
     {
-        return options.pageSize;
+        return getSpecificOptions().pageSize;
     }
 
-    /**
-     * The paging state for this query, or null if not relevant.
-     */
+    /** The paging state for this query, or null if not relevant. */
     public PagingState getPagingState()
     {
-        return options.state;
+        return getSpecificOptions().state;
     }
 
-    /**
-     * Serial consistency for conditional updates.
-     */
+    /**  Serial consistency for conditional updates. */
     public ConsistencyLevel getSerialConsistency()
     {
-        return options.serialConsistency;
+        return getSpecificOptions().serialConsistency;
+    }
+
+    public long getTimestamp(QueryState state)
+    {
+        long tstamp = getSpecificOptions().timestamp;
+        return tstamp >= 0 ? tstamp : state.getTimestamp();
     }
 
     /**
-     * The protocol version for the query. Will be 0 if the object don't come from
+     * The protocol version for the query. Will be 3 if the object doesn't come from
      * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
      */
-    public int getProtocolVersion()
+    public abstract int getProtocolVersion();
+
+    // Mainly for the sake of BatchQueryOptions
+    abstract SpecificOptions getSpecificOptions();
+
+    public QueryOptions prepare(List<ColumnSpecification> specs)
     {
-        return protocolVersion;
+        return this;
+    }
+
+    static class DefaultQueryOptions extends QueryOptions
+    {
+        private final ConsistencyLevel consistency;
+        private final List<ByteBuffer> values;
+        private final boolean skipMetadata;
+
+        private final SpecificOptions options;
+
+        private final transient int protocolVersion;
+
+        DefaultQueryOptions(ConsistencyLevel consistency, List<ByteBuffer> values, boolean skipMetadata, SpecificOptions options, int protocolVersion)
+        {
+            this.consistency = consistency;
+            this.values = values;
+            this.skipMetadata = skipMetadata;
+            this.options = options;
+            this.protocolVersion = protocolVersion;
+        }
+
+        public ConsistencyLevel getConsistency()
+        {
+            return consistency;
+        }
+
+        public List<ByteBuffer> getValues()
+        {
+            return values;
+        }
+
+        public boolean skipMetadata()
+        {
+            return skipMetadata;
+        }
+
+        public int getProtocolVersion()
+        {
+            return protocolVersion;
+        }
+
+        SpecificOptions getSpecificOptions()
+        {
+            return options;
+        }
+    }
+
+    static abstract class QueryOptionsWrapper extends QueryOptions
+    {
+        protected final QueryOptions wrapped;
+
+        QueryOptionsWrapper(QueryOptions wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public ConsistencyLevel getConsistency()
+        {
+            return wrapped.getConsistency();
+        }
+
+        public boolean skipMetadata()
+        {
+            return wrapped.skipMetadata();
+        }
+
+        public int getProtocolVersion()
+        {
+            return wrapped.getProtocolVersion();
+        }
+
+        SpecificOptions getSpecificOptions()
+        {
+            return wrapped.getSpecificOptions();
+        }
+
+        @Override
+        public QueryOptions prepare(List<ColumnSpecification> specs)
+        {
+            wrapped.prepare(specs);
+            return this;
+        }
+    }
+
+    static class OptionsWithNames extends QueryOptionsWrapper
+    {
+        private final List<String> names;
+        private List<ByteBuffer> orderedValues;
+
+        OptionsWithNames(DefaultQueryOptions wrapped, List<String> names)
+        {
+            super(wrapped);
+            this.names = names;
+        }
+
+        @Override
+        public QueryOptions prepare(List<ColumnSpecification> specs)
+        {
+            super.prepare(specs);
+
+            orderedValues = new ArrayList<ByteBuffer>(specs.size());
+            for (int i = 0; i < specs.size(); i++)
+            {
+                String name = specs.get(i).name.toString();
+                for (int j = 0; j < names.size(); j++)
+                {
+                    if (name.equals(names.get(j)))
+                    {
+                        orderedValues.add(wrapped.getValues().get(j));
+                        break;
+                    }
+                }
+            }
+            return this;
+        }
+
+        public List<ByteBuffer> getValues()
+        {
+            assert orderedValues != null; // We should have called prepare first!
+            return orderedValues;
+        }
     }
 
     // Options that are likely to not be present in most queries
-    private static class SpecificOptions
+    static class SpecificOptions
     {
-        private static final SpecificOptions DEFAULT = new SpecificOptions(-1, null, null);
+        private static final SpecificOptions DEFAULT = new SpecificOptions(-1, null, null, -1L);
 
         private final int pageSize;
         private final PagingState state;
         private final ConsistencyLevel serialConsistency;
+        private final long timestamp;
 
-        private SpecificOptions(int pageSize, PagingState state, ConsistencyLevel serialConsistency)
+        private SpecificOptions(int pageSize, PagingState state, ConsistencyLevel serialConsistency, long timestamp)
         {
             this.pageSize = pageSize;
             this.state = state;
             this.serialConsistency = serialConsistency == null ? ConsistencyLevel.SERIAL : serialConsistency;
+            this.timestamp = timestamp;
         }
     }
 
@@ -152,7 +267,9 @@
             SKIP_METADATA,
             PAGE_SIZE,
             PAGING_STATE,
-            SERIAL_CONSISTENCY;
+            SERIAL_CONSISTENCY,
+            TIMESTAMP,
+            NAMES_FOR_VALUES;
 
             private static final Flag[] ALL_VALUES = values();
 
@@ -176,16 +293,28 @@
             }
         }
 
-        public QueryOptions decode(ChannelBuffer body, int version)
+        public QueryOptions decode(ByteBuf body, int version)
         {
             assert version >= 2;
 
             ConsistencyLevel consistency = CBUtil.readConsistencyLevel(body);
             EnumSet<Flag> flags = Flag.deserialize((int)body.readByte());
 
-            List<ByteBuffer> values = flags.contains(Flag.VALUES)
-                                    ? CBUtil.readValueList(body)
-                                    : Collections.<ByteBuffer>emptyList();
+            List<ByteBuffer> values = Collections.<ByteBuffer>emptyList();
+            List<String> names = null;
+            if (flags.contains(Flag.VALUES))
+            {
+                if (flags.contains(Flag.NAMES_FOR_VALUES))
+                {
+                    Pair<List<String>, List<ByteBuffer>> namesAndValues = CBUtil.readNameAndValueList(body);
+                    names = namesAndValues.left;
+                    values = namesAndValues.right;
+                }
+                else
+                {
+                    values = CBUtil.readValueList(body);
+                }
+            }
 
             boolean skipMetadata = flags.contains(Flag.SKIP_METADATA);
             flags.remove(Flag.VALUES);
@@ -197,12 +326,22 @@
                 int pageSize = flags.contains(Flag.PAGE_SIZE) ? body.readInt() : -1;
                 PagingState pagingState = flags.contains(Flag.PAGING_STATE) ? PagingState.deserialize(CBUtil.readValue(body)) : null;
                 ConsistencyLevel serialConsistency = flags.contains(Flag.SERIAL_CONSISTENCY) ? CBUtil.readConsistencyLevel(body) : ConsistencyLevel.SERIAL;
-                options = new SpecificOptions(pageSize, pagingState, serialConsistency);
+                long timestamp = -1L;
+                if (flags.contains(Flag.TIMESTAMP))
+                {
+                    long ts = body.readLong();
+                    if (ts < 0)
+                        throw new ProtocolException("Invalid negative (" + ts + ") protocol level timestamp");
+                    timestamp = ts;
+                }
+
+                options = new SpecificOptions(pageSize, pagingState, serialConsistency, timestamp);
             }
-            return new QueryOptions(consistency, values, skipMetadata, options, version);
+            DefaultQueryOptions opts = new DefaultQueryOptions(consistency, values, skipMetadata, options, version);
+            return names == null ? opts : new OptionsWithNames(opts, names);
         }
 
-        public void encode(QueryOptions options, ChannelBuffer dest, int version)
+        public void encode(QueryOptions options, ByteBuf dest, int version)
         {
             assert version >= 2;
 
@@ -219,6 +358,12 @@
                 CBUtil.writeValue(options.getPagingState().serialize(), dest);
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
                 CBUtil.writeConsistencyLevel(options.getSerialConsistency(), dest);
+            if (flags.contains(Flag.TIMESTAMP))
+                dest.writeLong(options.getSpecificOptions().timestamp);
+
+            // Note that we never really encode QueryOptions server side (we only decode them),
+            // so we don't bother with the NAMES_FOR_VALUES flag here.
         }
 
         public int encodedSize(QueryOptions options, int version)
@@ -238,6 +383,8 @@
                 size += CBUtil.sizeOfValue(options.getPagingState().serialize());
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
                 size += CBUtil.sizeOfConsistencyLevel(options.getSerialConsistency());
+            if (flags.contains(Flag.TIMESTAMP))
+                size += 8;
 
             return size;
         }
@@ -247,7 +394,7 @@
             EnumSet<Flag> flags = EnumSet.noneOf(Flag.class);
             if (options.getValues().size() > 0)
                 flags.add(Flag.VALUES);
-            if (options.skipMetadata)
+            if (options.skipMetadata())
                 flags.add(Flag.SKIP_METADATA);
             if (options.getPageSize() >= 0)
                 flags.add(Flag.PAGE_SIZE);
@@ -255,6 +402,8 @@
                 flags.add(Flag.PAGING_STATE);
             if (options.getSerialConsistency() != ConsistencyLevel.SERIAL)
                 flags.add(Flag.SERIAL_CONSISTENCY);
+            if (options.getSpecificOptions().timestamp >= 0)
+                flags.add(Flag.TIMESTAMP);
             return flags;
         }
     }
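
QueryOptions is now abstract, so callers go through the static factories rather than a constructor, and values bound by name (the new NAMES_FOR_VALUES flag) only become usable after prepare() has reordered them against the bound column specifications. A minimal sketch of building options for a local call; the wrapper class is hypothetical:

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

import org.apache.cassandra.cql3.QueryOptions;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.utils.ByteBufferUtil;

// Hypothetical factory wrapper: builds options for a local (non-native-protocol) call.
public final class QueryOptionsExample
{
    public static QueryOptions localOptions()
    {
        List<ByteBuffer> values = Arrays.asList(ByteBufferUtil.bytes("some-key"));
        // Internally created options report protocol version 3, default to SERIAL for
        // conditional updates, and fall back to the QueryState timestamp (no protocol TIMESTAMP).
        return QueryOptions.forInternalCalls(ConsistencyLevel.QUORUM, values);
    }
}
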
diff --git a/src/java/org/apache/cassandra/cql3/QueryProcessor.java b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
index a59fe9b..2818358 100644
--- a/src/java/org/apache/cassandra/cql3/QueryProcessor.java
+++ b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
@@ -19,11 +19,15 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.primitives.Ints;
 
 import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap;
 import com.googlecode.concurrentlinkedhashmap.EntryWeigher;
+import com.googlecode.concurrentlinkedhashmap.EvictionListener;
 import org.antlr.runtime.*;
 import org.github.jamm.MemoryMeter;
 import org.slf4j.Logger;
@@ -31,9 +35,13 @@
 
 import org.apache.cassandra.cql3.statements.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.metrics.CQLMetrics;
+import org.apache.cassandra.service.*;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.thrift.ThriftClientState;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
@@ -43,72 +51,128 @@
 
 public class QueryProcessor implements QueryHandler
 {
-    public static final SemanticVersion CQL_VERSION = new SemanticVersion("3.1.7");
+    public static final SemanticVersion CQL_VERSION = new SemanticVersion("3.2.0");
 
     public static final QueryProcessor instance = new QueryProcessor();
 
     private static final Logger logger = LoggerFactory.getLogger(QueryProcessor.class);
-    private static final MemoryMeter meter = new MemoryMeter();
+    private static final MemoryMeter meter = new MemoryMeter().withGuessing(MemoryMeter.Guess.FALLBACK_BEST);
     private static final long MAX_CACHE_PREPARED_MEMORY = Runtime.getRuntime().maxMemory() / 256;
-    private static final int MAX_CACHE_PREPARED_COUNT = 10000;
 
-    private static EntryWeigher<MD5Digest, CQLStatement> cqlMemoryUsageWeigher = new EntryWeigher<MD5Digest, CQLStatement>()
+    private static EntryWeigher<MD5Digest, ParsedStatement.Prepared> cqlMemoryUsageWeigher = new EntryWeigher<MD5Digest, ParsedStatement.Prepared>()
     {
         @Override
-        public int weightOf(MD5Digest key, CQLStatement value)
+        public int weightOf(MD5Digest key, ParsedStatement.Prepared value)
         {
-            return Ints.checkedCast(measure(key) + measure(value));
+            return Ints.checkedCast(measure(key) + measure(value.statement) + measure(value.boundNames));
         }
     };
 
-    private static EntryWeigher<Integer, CQLStatement> thriftMemoryUsageWeigher = new EntryWeigher<Integer, CQLStatement>()
+    private static EntryWeigher<Integer, ParsedStatement.Prepared> thriftMemoryUsageWeigher = new EntryWeigher<Integer, ParsedStatement.Prepared>()
     {
         @Override
-        public int weightOf(Integer key, CQLStatement value)
+        public int weightOf(Integer key, ParsedStatement.Prepared value)
         {
-            return Ints.checkedCast(measure(key) + measure(value));
+            return Ints.checkedCast(measure(key) + measure(value.statement) + measure(value.boundNames));
         }
     };
 
-    private static final ConcurrentLinkedHashMap<MD5Digest, CQLStatement> preparedStatements;
-    private static final ConcurrentLinkedHashMap<Integer, CQLStatement> thriftPreparedStatements;
+    private static final ConcurrentLinkedHashMap<MD5Digest, ParsedStatement.Prepared> preparedStatements;
+    private static final ConcurrentLinkedHashMap<Integer, ParsedStatement.Prepared> thriftPreparedStatements;
+
+    // A map for prepared statements used internally (which we don't want to mix with user statements;
+    // in particular, we don't bother with expiration on those).
+    private static final ConcurrentMap<String, ParsedStatement.Prepared> internalStatements = new ConcurrentHashMap<>();
+
+    @VisibleForTesting
+    public static final CQLMetrics metrics = new CQLMetrics();
+
+    private static final AtomicInteger lastMinuteEvictionsCount = new AtomicInteger(0);
+    private static final ScheduledExecutorService evictionCheckTimer = Executors.newScheduledThreadPool(1);
 
     static
     {
-        if (MemoryMeter.isInitialized())
+        preparedStatements = new ConcurrentLinkedHashMap.Builder<MD5Digest, ParsedStatement.Prepared>()
+                             .maximumWeightedCapacity(MAX_CACHE_PREPARED_MEMORY)
+                             .weigher(cqlMemoryUsageWeigher)
+                             .listener(new EvictionListener<MD5Digest, ParsedStatement.Prepared>()
+                             {
+                                 public void onEviction(MD5Digest md5Digest, ParsedStatement.Prepared prepared)
+                                 {
+                                     metrics.preparedStatementsEvicted.inc();
+                                     lastMinuteEvictionsCount.incrementAndGet();
+                                 }
+                             }).build();
+
+        thriftPreparedStatements = new ConcurrentLinkedHashMap.Builder<Integer, ParsedStatement.Prepared>()
+                                   .maximumWeightedCapacity(MAX_CACHE_PREPARED_MEMORY)
+                                   .weigher(thriftMemoryUsageWeigher)
+                                   .listener(new EvictionListener<Integer, ParsedStatement.Prepared>()
+                                   {
+                                       public void onEviction(Integer integer, ParsedStatement.Prepared prepared)
+                                       {
+                                           metrics.preparedStatementsEvicted.inc();
+                                           lastMinuteEvictionsCount.incrementAndGet();
+                                       }
+                                   })
+                                   .build();
+
+        evictionCheckTimer.scheduleAtFixedRate(new Runnable()
         {
-            preparedStatements = new ConcurrentLinkedHashMap.Builder<MD5Digest, CQLStatement>()
-                                 .maximumWeightedCapacity(MAX_CACHE_PREPARED_MEMORY)
-                                 .weigher(cqlMemoryUsageWeigher)
-                                 .build();
-            thriftPreparedStatements = new ConcurrentLinkedHashMap.Builder<Integer, CQLStatement>()
-                                       .maximumWeightedCapacity(MAX_CACHE_PREPARED_MEMORY)
-                                       .weigher(thriftMemoryUsageWeigher)
-                                       .build();
-        }
-        else
+            public void run()
+            {
+                long count = lastMinuteEvictionsCount.getAndSet(0);
+                if (count > 0)
+                    logger.info("{} prepared statements discarded in the last minute because the cache limit was reached ({} bytes)",
+                                count,
+                                MAX_CACHE_PREPARED_MEMORY);
+            }
+        }, 1, 1, TimeUnit.MINUTES);
+    }
+
+    public static int preparedStatementsCount()
+    {
+        return preparedStatements.size() + thriftPreparedStatements.size();
+    }
+
+    // Work around initialization dependency
+    private static enum InternalStateInstance
+    {
+        INSTANCE;
+
+        private final QueryState queryState;
+
+        InternalStateInstance()
         {
-            logger.error("Unable to initialize MemoryMeter (jamm not specified as javaagent).  This means "
-                         + "Cassandra will be unable to measure object sizes accurately and may consequently OOM.");
-            preparedStatements = new ConcurrentLinkedHashMap.Builder<MD5Digest, CQLStatement>()
-                                 .maximumWeightedCapacity(MAX_CACHE_PREPARED_COUNT)
-                                 .build();
-            thriftPreparedStatements = new ConcurrentLinkedHashMap.Builder<Integer, CQLStatement>()
-                                       .maximumWeightedCapacity(MAX_CACHE_PREPARED_COUNT)
-                                       .build();
+            ClientState state = ClientState.forInternalCalls();
+            try
+            {
+                state.setKeyspace(Keyspace.SYSTEM_KS);
+            }
+            catch (InvalidRequestException e)
+            {
+                throw new RuntimeException(e);
+            }
+            this.queryState = new QueryState(state);
         }
     }
 
+    private static QueryState internalQueryState()
+    {
+        return InternalStateInstance.INSTANCE.queryState;
+    }
+
     private QueryProcessor()
     {
+        MigrationManager.instance.register(new MigrationSubscriber());
     }
 
-    public CQLStatement getPrepared(MD5Digest id)
+    public ParsedStatement.Prepared getPrepared(MD5Digest id)
     {
         return preparedStatements.get(id);
     }
 
-    public CQLStatement getPreparedForThrift(Integer id)
+    public ParsedStatement.Prepared getPreparedForThrift(Integer id)
     {
         return thriftPreparedStatements.get(id);
     }
@@ -128,24 +192,29 @@
         }
     }
 
-    public static void validateCellNames(Iterable<ByteBuffer> cellNames) throws InvalidRequestException
+    public static void validateCellNames(Iterable<CellName> cellNames, CellNameType type) throws InvalidRequestException
     {
-        for (ByteBuffer name : cellNames)
-            validateCellName(name);
+        for (CellName name : cellNames)
+            validateCellName(name, type);
     }
 
-    public static void validateCellName(ByteBuffer name) throws InvalidRequestException
+    public static void validateCellName(CellName name, CellNameType type) throws InvalidRequestException
     {
-        if (name.remaining() > Column.MAX_NAME_LENGTH)
-            throw new InvalidRequestException(String.format("The sum of all clustering columns is too long (%s > %s)",
-                                                            name.remaining(),
-                                                            Column.MAX_NAME_LENGTH));
-
-        if (name.remaining() == 0)
+        validateComposite(name, type);
+        if (name.isEmpty())
             throw new InvalidRequestException("Invalid empty value for clustering column of COMPACT TABLE");
     }
 
-    public static ResultMessage processStatement(CQLStatement statement,
+    public static void validateComposite(Composite name, CType type) throws InvalidRequestException
+    {
+        long serializedSize = type.serializer().serializedSize(name, TypeSizes.NATIVE);
+        if (serializedSize > Cell.MAX_NAME_LENGTH)
+            throw new InvalidRequestException(String.format("The sum of all clustering columns is too long (%s > %s)",
+                                                            serializedSize,
+                                                            Cell.MAX_NAME_LENGTH));
+    }
+
+    private static ResultMessage processStatement(CQLStatement statement,
                                                   QueryState queryState,
                                                   QueryOptions options)
     throws RequestExecutionException, RequestValidationException
@@ -162,31 +231,36 @@
     public static ResultMessage process(String queryString, ConsistencyLevel cl, QueryState queryState)
     throws RequestExecutionException, RequestValidationException
     {
-        return instance.process(queryString, queryState, new QueryOptions(cl, Collections.<ByteBuffer>emptyList()));
+        return instance.process(queryString, queryState, QueryOptions.forInternalCalls(cl, Collections.<ByteBuffer>emptyList()));
     }
 
     public ResultMessage process(String queryString, QueryState queryState, QueryOptions options)
     throws RequestExecutionException, RequestValidationException
     {
-        CQLStatement prepared = getStatement(queryString, queryState.getClientState()).statement;
+        ParsedStatement.Prepared p = getStatement(queryString, queryState.getClientState());
+        options.prepare(p.boundNames);
+        CQLStatement prepared = p.statement;
         if (prepared.getBoundTerms() != options.getValues().size())
             throw new InvalidRequestException("Invalid amount of bind variables");
 
+        if (!queryState.getClientState().isInternal)
+            metrics.regularStatementsExecuted.inc();
+
         return processStatement(prepared, queryState, options);
     }
 
-    public static CQLStatement parseStatement(String queryStr, QueryState queryState) throws RequestValidationException
+    public static ParsedStatement.Prepared parseStatement(String queryStr, QueryState queryState) throws RequestValidationException
     {
-        return getStatement(queryStr, queryState.getClientState()).statement;
+        return getStatement(queryStr, queryState.getClientState());
     }
 
     public static UntypedResultSet process(String query, ConsistencyLevel cl) throws RequestExecutionException
     {
         try
         {
-            ResultMessage result = instance.process(query, QueryState.forInternalCalls(), new QueryOptions(cl, Collections.<ByteBuffer>emptyList()));
+            ResultMessage result = instance.process(query, QueryState.forInternalCalls(), QueryOptions.forInternalCalls(cl, Collections.<ByteBuffer>emptyList()));
             if (result instanceof ResultMessage.Rows)
-                return new UntypedResultSet(((ResultMessage.Rows)result).result);
+                return UntypedResultSet.create(((ResultMessage.Rows)result).result);
             else
                 return null;
         }
@@ -196,18 +270,42 @@
         }
     }
 
-    public static UntypedResultSet processInternal(String query)
+    private static QueryOptions makeInternalOptions(ParsedStatement.Prepared prepared, Object[] values)
+    {
+        if (prepared.boundNames.size() != values.length)
+            throw new IllegalArgumentException(String.format("Invalid number of values. Expecting %d but got %d", prepared.boundNames.size(), values.length));
+
+        List<ByteBuffer> boundValues = new ArrayList<ByteBuffer>(values.length);
+        for (int i = 0; i < values.length; i++)
+        {
+            Object value = values[i];
+            AbstractType type = prepared.boundNames.get(i).type;
+            boundValues.add(value instanceof ByteBuffer || value == null ? (ByteBuffer)value : type.decompose(value));
+        }
+        return QueryOptions.forInternalCalls(boundValues);
+    }
+
+    private static ParsedStatement.Prepared prepareInternal(String query) throws RequestValidationException
+    {
+        ParsedStatement.Prepared prepared = internalStatements.get(query);
+        if (prepared != null)
+            return prepared;
+
+        // Note: if 2 threads prepare the same query concurrently, the work is simply duplicated, which is harmless, so we don't bother synchronizing
+        prepared = parseStatement(query, internalQueryState());
+        prepared.statement.validate(internalQueryState().getClientState());
+        internalStatements.putIfAbsent(query, prepared);
+        return prepared;
+    }
+
+    public static UntypedResultSet executeInternal(String query, Object... values)
     {
         try
         {
-            ClientState state = ClientState.forInternalCalls();
-            QueryState qState = new QueryState(state);
-            state.setKeyspace(Keyspace.SYSTEM_KS);
-            CQLStatement statement = getStatement(query, state).statement;
-            statement.validate(state);
-            ResultMessage result = statement.executeInternal(qState, QueryOptions.DEFAULT);
+            ParsedStatement.Prepared prepared = prepareInternal(query);
+            ResultMessage result = prepared.statement.executeInternal(internalQueryState(), makeInternalOptions(prepared, values));
             if (result instanceof ResultMessage.Rows)
-                return new UntypedResultSet(((ResultMessage.Rows)result).result);
+                return UntypedResultSet.create(((ResultMessage.Rows)result).result);
             else
                 return null;
         }
@@ -221,13 +319,62 @@
         }
     }
 
+    public static UntypedResultSet executeInternalWithPaging(String query, int pageSize, Object... values)
+    {
+        try
+        {
+            ParsedStatement.Prepared prepared = prepareInternal(query);
+            if (!(prepared.statement instanceof SelectStatement))
+                throw new IllegalArgumentException("Only SELECTs can be paged");
+
+            SelectStatement select = (SelectStatement)prepared.statement;
+            QueryPager pager = QueryPagers.localPager(select.getPageableCommand(makeInternalOptions(prepared, values)));
+            return UntypedResultSet.create(select, pager, pageSize);
+        }
+        catch (RequestValidationException e)
+        {
+            throw new RuntimeException("Error validating query" + e);
+        }
+    }
+
+    /**
+     * Same as executeInternal, but for queries we know are executed only once, so the
+     * created statement object is not cached.
+     */
+    public static UntypedResultSet executeOnceInternal(String query, Object... values)
+    {
+        try
+        {
+            ParsedStatement.Prepared prepared = parseStatement(query, internalQueryState());
+            prepared.statement.validate(internalQueryState().getClientState());
+            ResultMessage result = prepared.statement.executeInternal(internalQueryState(), makeInternalOptions(prepared, values));
+            if (result instanceof ResultMessage.Rows)
+                return UntypedResultSet.create(((ResultMessage.Rows)result).result);
+            else
+                return null;
+        }
+        catch (RequestExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+        catch (RequestValidationException e)
+        {
+            throw new RuntimeException("Error validating query " + query, e);
+        }
+    }
+
     public static UntypedResultSet resultify(String query, Row row)
     {
+        return resultify(query, Collections.singletonList(row));
+    }
+
+    public static UntypedResultSet resultify(String query, List<Row> rows)
+    {
         try
         {
             SelectStatement ss = (SelectStatement) getStatement(query, null).statement;
-            ResultSet cqlRows = ss.process(Collections.singletonList(row));
-            return new UntypedResultSet(cqlRows);
+            ResultSet cqlRows = ss.process(rows);
+            return UntypedResultSet.create(cqlRows);
         }
         catch (RequestValidationException e)
         {
@@ -245,6 +392,10 @@
     public static ResultMessage.Prepared prepare(String queryString, ClientState clientState, boolean forThrift)
     throws RequestValidationException
     {
+        ResultMessage.Prepared existing = getStoredPreparedStatement(queryString, clientState.getRawKeyspace(), forThrift);
+        if (existing != null)
+            return existing;
+
         ParsedStatement.Prepared prepared = getStatement(queryString, clientState);
         int boundTerms = prepared.statement.getBoundTerms();
         if (boundTerms > FBUtilities.MAX_UNSIGNED_SHORT)
@@ -254,35 +405,56 @@
         return storePreparedStatement(queryString, clientState.getRawKeyspace(), prepared, forThrift);
     }
 
+    private static MD5Digest computeId(String queryString, String keyspace)
+    {
+        String toHash = keyspace == null ? queryString : keyspace + queryString;
+        return MD5Digest.compute(toHash);
+    }
+
+    private static Integer computeThriftId(String queryString, String keyspace)
+    {
+        String toHash = keyspace == null ? queryString : keyspace + queryString;
+        return toHash.hashCode();
+    }
+
+    private static ResultMessage.Prepared getStoredPreparedStatement(String queryString, String keyspace, boolean forThrift)
+    throws InvalidRequestException
+    {
+        if (forThrift)
+        {
+            Integer thriftStatementId = computeThriftId(queryString, keyspace);
+            ParsedStatement.Prepared existing = thriftPreparedStatements.get(thriftStatementId);
+            return existing == null ? null : ResultMessage.Prepared.forThrift(thriftStatementId, existing.boundNames);
+        }
+        else
+        {
+            MD5Digest statementId = computeId(queryString, keyspace);
+            ParsedStatement.Prepared existing = preparedStatements.get(statementId);
+            return existing == null ? null : new ResultMessage.Prepared(statementId, existing);
+        }
+    }
+
     private static ResultMessage.Prepared storePreparedStatement(String queryString, String keyspace, ParsedStatement.Prepared prepared, boolean forThrift)
     throws InvalidRequestException
     {
         // Concatenate the current keyspace so we don't mix prepared statements between keyspaces (#5352).
         // (If the keyspace is null, queryString has to have a fully-qualified keyspace, so it's fine.)
-        String toHash = keyspace == null ? queryString : keyspace + queryString;
         long statementSize = measure(prepared.statement);
         // don't execute the statement if it's bigger than the allowed threshold
         if (statementSize > MAX_CACHE_PREPARED_MEMORY)
             throw new InvalidRequestException(String.format("Prepared statement of size %d bytes is larger than allowed maximum of %d bytes.",
                                                             statementSize,
                                                             MAX_CACHE_PREPARED_MEMORY));
-
         if (forThrift)
         {
-            int statementId = toHash.hashCode();
-            thriftPreparedStatements.put(statementId, prepared.statement);
-            logger.trace(String.format("Stored prepared statement #%d with %d bind markers",
-                                       statementId,
-                                       prepared.statement.getBoundTerms()));
+            Integer statementId = computeThriftId(queryString, keyspace);
+            thriftPreparedStatements.put(statementId, prepared);
             return ResultMessage.Prepared.forThrift(statementId, prepared.boundNames);
         }
         else
         {
-            MD5Digest statementId = MD5Digest.compute(toHash);
-            preparedStatements.put(statementId, prepared.statement);
-            logger.trace(String.format("Stored prepared statement %s with %d bind markers",
-                                       statementId,
-                                       prepared.statement.getBoundTerms()));
+            MD5Digest statementId = computeId(queryString, keyspace);
+            preparedStatements.put(statementId, prepared);
             return new ResultMessage.Prepared(statementId, prepared);
         }
     }
@@ -306,6 +478,7 @@
                     logger.trace("[{}] '{}'", i+1, variables.get(i));
         }
 
+        metrics.preparedStatementsExecuted.inc();
         return processStatement(statement, queryState, options);
     }
 
@@ -314,9 +487,9 @@
     {
         ClientState clientState = queryState.getClientState();
         batch.checkAccess(clientState);
+        batch.validate();
         batch.validate(clientState);
-
-        return batch.executeWithPerStatementVariables(options.getConsistency(), queryState, options.getValues());
+        return batch.execute(queryState, options);
     }
 
     public static ParsedStatement.Prepared getStatement(String queryStr, ClientState clientState)
@@ -338,18 +511,21 @@
         try
         {
             // Lexer and parser
+            ErrorCollector errorCollector = new ErrorCollector(queryStr);
             CharStream stream = new ANTLRStringStream(queryStr);
             CqlLexer lexer = new CqlLexer(stream);
+            lexer.addErrorListener(errorCollector);
+
             TokenStream tokenStream = new CommonTokenStream(lexer);
             CqlParser parser = new CqlParser(tokenStream);
+            parser.addErrorListener(errorCollector);
 
             // Parse the query string to a statement instance
             ParsedStatement statement = parser.query();
 
-            // The lexer and parser queue up any errors they may have encountered
-            // along the way, if necessary, we turn them into exceptions here.
-            lexer.throwLastRecognitionError();
-            parser.throwLastRecognitionError();
+            // The errorCollector has queued up any errors that the lexer and parser may have encountered
+            // along the way; if necessary, we turn the last one into an exception here.
+            errorCollector.throwLastSyntaxError();
 
             return statement;
         }
@@ -368,11 +544,70 @@
 
     private static long measure(Object key)
     {
-        if (!MemoryMeter.isInitialized())
-            return 1;
-
         return key instanceof MeasurableForPreparedCache
              ? ((MeasurableForPreparedCache)key).measureForPreparedCache(meter)
              : meter.measureDeep(key);
     }
+
+    private static class MigrationSubscriber implements IMigrationListener
+    {
+        private void removeInvalidPreparedStatements(String ksName, String cfName)
+        {
+            removeInvalidPreparedStatements(preparedStatements.values().iterator(), ksName, cfName);
+            removeInvalidPreparedStatements(thriftPreparedStatements.values().iterator(), ksName, cfName);
+        }
+
+        private void removeInvalidPreparedStatements(Iterator<ParsedStatement.Prepared> iterator, String ksName, String cfName)
+        {
+            while (iterator.hasNext())
+            {
+                if (shouldInvalidate(ksName, cfName, iterator.next().statement))
+                    iterator.remove();
+            }
+        }
+
+        private boolean shouldInvalidate(String ksName, String cfName, CQLStatement statement)
+        {
+            String statementKsName;
+            String statementCfName;
+
+            if (statement instanceof ModificationStatement)
+            {
+                ModificationStatement modificationStatement = ((ModificationStatement) statement);
+                statementKsName = modificationStatement.keyspace();
+                statementCfName = modificationStatement.columnFamily();
+            }
+            else if (statement instanceof SelectStatement)
+            {
+                SelectStatement selectStatement = ((SelectStatement) statement);
+                statementKsName = selectStatement.keyspace();
+                statementCfName = selectStatement.columnFamily();
+            }
+            else
+            {
+                return false;
+            }
+
+            return ksName.equals(statementKsName) && (cfName == null || cfName.equals(statementCfName));
+        }
+
+        public void onCreateKeyspace(String ksName) { }
+        public void onCreateColumnFamily(String ksName, String cfName) { }
+        public void onCreateUserType(String ksName, String typeName) { }
+        public void onUpdateKeyspace(String ksName) { }
+        public void onUpdateColumnFamily(String ksName, String cfName) { }
+        public void onUpdateUserType(String ksName, String typeName) { }
+
+        public void onDropKeyspace(String ksName)
+        {
+            removeInvalidPreparedStatements(ksName, null);
+        }
+
+        public void onDropColumnFamily(String ksName, String cfName)
+        {
+            removeInvalidPreparedStatements(ksName, cfName);
+        }
+
+        public void onDropUserType(String ksName, String typeName) { }
+    }
 }
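
A minimal, self-contained sketch of the invalidation rule applied by the MigrationSubscriber above (CASSANDRA-7566): a cached statement is dropped when its keyspace matches and, if a table name is given, its table matches too. The class and field names below are illustrative only, not the Cassandra API.

    import java.util.Iterator;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Standalone sketch, not Cassandra code: drop cached statements whose
    // keyspace (and, optionally, table) has just been dropped.
    class PreparedCacheSketch
    {
        static final class Stmt
        {
            final String ksName, cfName;
            Stmt(String ksName, String cfName) { this.ksName = ksName; this.cfName = cfName; }
        }

        final Map<String, Stmt> cache = new ConcurrentHashMap<>();

        void removeInvalid(String ksName, String cfName)
        {
            for (Iterator<Stmt> it = cache.values().iterator(); it.hasNext(); )
            {
                Stmt s = it.next();
                // cfName == null means the whole keyspace was dropped
                if (ksName.equals(s.ksName) && (cfName == null || cfName.equals(s.cfName)))
                    it.remove();
            }
        }
    }
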
diff --git a/src/java/org/apache/cassandra/cql3/Relation.java b/src/java/org/apache/cassandra/cql3/Relation.java
index 0f1366d..42373c3 100644
--- a/src/java/org/apache/cassandra/cql3/Relation.java
+++ b/src/java/org/apache/cassandra/cql3/Relation.java
@@ -17,13 +17,27 @@
  */
 package org.apache.cassandra.cql3;
 
+
 public abstract class Relation {
 
     protected Type relationType;
 
     public static enum Type
     {
-        EQ, LT, LTE, GTE, GT, IN;
+        EQ, LT, LTE, GTE, GT, IN, CONTAINS, CONTAINS_KEY, NEQ;
+
+        public boolean allowsIndexQuery()
+        {
+            switch (this)
+            {
+                case EQ:
+                case CONTAINS:
+                case CONTAINS_KEY:
+                    return true;
+                default:
+                    return false;
+            }
+        }
 
         @Override
         public String toString()
@@ -40,8 +54,8 @@
                     return ">";
                 case GTE:
                     return ">=";
-                case IN:
-                    return "IN";
+                case NEQ:
+                    return "!=";
                 default:
                     return this.name();
             }
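
The new allowsIndexQuery() flag classifies which relation operators a secondary index can serve. A small standalone sketch of the same classification (the names below are illustrative, not the Cassandra classes):

    import java.util.EnumSet;
    import java.util.Set;

    // Standalone sketch: only EQ, CONTAINS and CONTAINS_KEY restrictions can be
    // answered directly by a secondary index lookup.
    public class IndexableOps
    {
        enum Op { EQ, LT, LTE, GTE, GT, IN, CONTAINS, CONTAINS_KEY, NEQ }

        static final Set<Op> INDEXABLE = EnumSet.of(Op.EQ, Op.CONTAINS, Op.CONTAINS_KEY);

        public static void main(String[] args)
        {
            for (Op op : Op.values())
                System.out.printf("%s can use a secondary index: %b%n", op, INDEXABLE.contains(op));
        }
    }
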
diff --git a/src/java/org/apache/cassandra/cql3/ResultSet.java b/src/java/org/apache/cassandra/cql3/ResultSet.java
index 4cda0cd..e463b29 100644
--- a/src/java/org/apache/cassandra/cql3/ResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/ResultSet.java
@@ -20,7 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.transport.*;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -60,14 +60,14 @@
 
     public void addRow(List<ByteBuffer> row)
     {
-        assert row.size() == metadata.columnCount;
+        assert row.size() == metadata.valueCount();
         rows.add(row);
     }
 
     public void addColumnValue(ByteBuffer value)
     {
-        if (rows.isEmpty() || lastRow().size() == metadata.columnCount)
-            rows.add(new ArrayList<ByteBuffer>(metadata.columnCount));
+        if (rows.isEmpty() || lastRow().size() == metadata.valueCount())
+            rows.add(new ArrayList<ByteBuffer>(metadata.valueCount()));
 
         lastRow().add(value);
     }
@@ -123,11 +123,12 @@
                 // The 2 following ones shouldn't be needed in CQL3
                 UTF8, UTF8);
 
-        for (ColumnSpecification name : metadata.names)
+        for (int i = 0; i < metadata.columnCount; i++)
         {
-            ByteBuffer colName = ByteBufferUtil.bytes(name.toString());
+            ColumnSpecification spec = metadata.names.get(i);
+            ByteBuffer colName = ByteBufferUtil.bytes(spec.name.toString());
             schema.name_types.put(colName, UTF8);
-            AbstractType<?> normalizedType = name.type instanceof ReversedType ? ((ReversedType)name.type).baseType : name.type;
+            AbstractType<?> normalizedType = spec.type instanceof ReversedType ? ((ReversedType)spec.type).baseType : spec.type;
             schema.value_types.put(colName, normalizedType.toString());
 
         }
@@ -135,10 +136,10 @@
         List<CqlRow> cqlRows = new ArrayList<CqlRow>(rows.size());
         for (List<ByteBuffer> row : rows)
         {
-            List<Column> thriftCols = new ArrayList<Column>(metadata.names.size());
-            for (int i = 0; i < metadata.names.size(); i++)
+            List<Column> thriftCols = new ArrayList<Column>(metadata.columnCount);
+            for (int i = 0; i < metadata.columnCount; i++)
             {
-                Column col = new Column(ByteBufferUtil.bytes(metadata.names.get(i).toString()));
+                Column col = new Column(ByteBufferUtil.bytes(metadata.names.get(i).name.toString()));
                 col.setValue(row.get(i));
                 thriftCols.add(col);
             }
@@ -194,7 +195,7 @@
          *   - rows count (4 bytes)
          *   - rows
          */
-        public ResultSet decode(ChannelBuffer body, int version)
+        public ResultSet decode(ByteBuf body, int version)
         {
             Metadata m = Metadata.codec.decode(body, version);
             int rowCount = body.readInt();
@@ -208,14 +209,16 @@
             return rs;
         }
 
-        public void encode(ResultSet rs, ChannelBuffer dest, int version)
+        public void encode(ResultSet rs, ByteBuf dest, int version)
         {
             Metadata.codec.encode(rs.metadata, dest, version);
             dest.writeInt(rs.rows.size());
             for (List<ByteBuffer> row : rs.rows)
             {
-                for (ByteBuffer bb : row)
-                    CBUtil.writeValue(bb, dest);
+                // Note that we only want to serialize the first columnCount values, even if the row
+                // has more: see comment on Metadata.names field.
+                for (int i = 0; i < rs.metadata.columnCount; i++)
+                    CBUtil.writeValue(row.get(i), dest);
             }
         }
 
@@ -224,8 +227,8 @@
             int size = Metadata.codec.encodedSize(rs.metadata, version) + 4;
             for (List<ByteBuffer> row : rs.rows)
             {
-                for (ByteBuffer bb : row)
-                    size += CBUtil.sizeOfValue(bb);
+                for (int i = 0; i < rs.metadata.columnCount; i++)
+                    size += CBUtil.sizeOfValue(row.get(i));
             }
             return size;
         }
@@ -235,32 +238,48 @@
     {
         public static final CBCodec<Metadata> codec = new Codec();
 
-        public static final Metadata EMPTY = new Metadata(EnumSet.of(Flag.NO_METADATA), 0);
+        public static final Metadata EMPTY = new Metadata(EnumSet.of(Flag.NO_METADATA), null, 0, null);
 
-        public final EnumSet<Flag> flags;
+        private final EnumSet<Flag> flags;
+        // Please note that columnCount can actually be smaller than names.size(), even if names is not null. This is
+        // used to include columns in the resultSet that we need for post-query re-orderings
+        // (SelectStatement.orderResults) but that shouldn't be sent to the user as they haven't been requested
+        // (CASSANDRA-4911). So the serialization code will exclude any columns in names whose index is >= columnCount.
         public final List<ColumnSpecification> names;
-        public final int columnCount;
-        public PagingState pagingState;
+        private final int columnCount;
+        private PagingState pagingState;
 
         public Metadata(List<ColumnSpecification> names)
         {
-            this(EnumSet.noneOf(Flag.class), names);
+            this(EnumSet.noneOf(Flag.class), names, names.size(), null);
             if (!names.isEmpty() && allInSameCF())
                 flags.add(Flag.GLOBAL_TABLES_SPEC);
         }
 
-        private Metadata(EnumSet<Flag> flags, List<ColumnSpecification> names)
+        private Metadata(EnumSet<Flag> flags, List<ColumnSpecification> names, int columnCount, PagingState pagingState)
         {
             this.flags = flags;
             this.names = names;
-            this.columnCount = names.size();
+            this.columnCount = columnCount;
+            this.pagingState = pagingState;
         }
 
-        private Metadata(EnumSet<Flag> flags, int columnCount)
+        public Metadata copy()
         {
-            this.flags = flags;
-            this.names = null;
-            this.columnCount = columnCount;
+            return new Metadata(EnumSet.copyOf(flags), names, columnCount, pagingState);
+        }
+
+        // The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
+        public int valueCount()
+        {
+            return names == null ? columnCount : names.size();
+        }
+
+        public void addNonSerializedColumn(ColumnSpecification name)
+        {
+            // See comment above. Because columnCount doesn't account for the newly added name, it
+            // won't be serialized.
+            names.add(name);
         }
 
         private boolean allInSameCF()
@@ -281,14 +300,13 @@
             return true;
         }
 
-        public Metadata setHasMorePages(PagingState pagingState)
+        public void setHasMorePages(PagingState pagingState)
         {
             if (pagingState == null)
-                return this;
+                return;
 
             flags.add(Flag.HAS_MORE_PAGES);
             this.pagingState = pagingState;
-            return this;
         }
 
         public void setSkipMetadata()
@@ -309,7 +327,7 @@
             {
                 for (ColumnSpecification name : names)
                 {
-                    sb.append("[").append(name.toString());
+                    sb.append("[").append(name.name.toString());
                     sb.append("(").append(name.ksName).append(", ").append(name.cfName).append(")");
                     sb.append(", ").append(name.type).append("]");
                 }
@@ -321,7 +339,7 @@
 
         private static class Codec implements CBCodec<Metadata>
         {
-            public Metadata decode(ChannelBuffer body, int version)
+            public Metadata decode(ByteBuf body, int version)
             {
                 // flags & column count
                 int iflags = body.readInt();
@@ -334,7 +352,7 @@
                     state = PagingState.deserialize(CBUtil.readValue(body));
 
                 if (flags.contains(Flag.NO_METADATA))
-                    return new Metadata(flags, columnCount).setHasMorePages(state);
+                    return new Metadata(flags, null, columnCount, state);
 
                 boolean globalTablesSpec = flags.contains(Flag.GLOBAL_TABLES_SPEC);
 
@@ -353,13 +371,13 @@
                     String ksName = globalTablesSpec ? globalKsName : CBUtil.readString(body);
                     String cfName = globalTablesSpec ? globalCfName : CBUtil.readString(body);
                     ColumnIdentifier colName = new ColumnIdentifier(CBUtil.readString(body), true);
-                    AbstractType type = DataType.toType(DataType.codec.decodeOne(body));
+                    AbstractType type = DataType.toType(DataType.codec.decodeOne(body, version));
                     names.add(new ColumnSpecification(ksName, cfName, colName, type));
                 }
-                return new Metadata(flags, names).setHasMorePages(state);
+                return new Metadata(flags, names, names.size(), state);
             }
 
-            public void encode(Metadata m, ChannelBuffer dest, int version)
+            public void encode(Metadata m, ByteBuf dest, int version)
             {
                 boolean noMetadata = m.flags.contains(Flag.NO_METADATA);
                 boolean globalTablesSpec = m.flags.contains(Flag.GLOBAL_TABLES_SPEC);
@@ -381,15 +399,16 @@
                         CBUtil.writeString(m.names.get(0).cfName, dest);
                     }
 
-                    for (ColumnSpecification name : m.names)
+                    for (int i = 0; i < m.columnCount; i++)
                     {
+                        ColumnSpecification name = m.names.get(i);
                         if (!globalTablesSpec)
                         {
                             CBUtil.writeString(name.ksName, dest);
                             CBUtil.writeString(name.cfName, dest);
                         }
-                        CBUtil.writeString(name.toString(), dest);
-                        DataType.codec.writeOne(DataType.fromType(name.type), dest);
+                        CBUtil.writeString(name.name.toString(), dest);
+                        DataType.codec.writeOne(DataType.fromType(name.type, version), dest, version);
                     }
                 }
             }
@@ -412,15 +431,16 @@
                         size += CBUtil.sizeOfString(m.names.get(0).cfName);
                     }
 
-                    for (ColumnSpecification name : m.names)
+                    for (int i = 0; i < m.columnCount; i++)
                     {
+                        ColumnSpecification name = m.names.get(i);
                         if (!globalTablesSpec)
                         {
                             size += CBUtil.sizeOfString(name.ksName);
                             size += CBUtil.sizeOfString(name.cfName);
                         }
-                        size += CBUtil.sizeOfString(name.toString());
-                        size += DataType.codec.oneSerializedSize(DataType.fromType(name.type));
+                        size += CBUtil.sizeOfString(name.name.toString());
+                        size += DataType.codec.oneSerializedSize(DataType.fromType(name.type, version), version);
                     }
                 }
                 return size;
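
The columnCount/valueCount split above lets a row carry extra trailing values that are needed only for server-side re-ordering (CASSANDRA-4911) and must never reach the client. A simplified standalone sketch of the serialization rule, assuming the usual 4-byte length prefix per value (a simplification of CBUtil.sizeOfValue; names below are illustrative):

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.List;

    // Standalone sketch: only the first columnCount values of each row are
    // serialized; any trailing values exist purely for post-query ordering.
    public class ResultRowSizeSketch
    {
        // [int length][bytes] framing per value
        static int serializedSize(List<ByteBuffer> row, int columnCount)
        {
            int size = 0;
            for (int i = 0; i < columnCount; i++)   // values at index >= columnCount are skipped
                size += 4 + (row.get(i) == null ? 0 : row.get(i).remaining());
            return size;
        }

        public static void main(String[] args)
        {
            List<ByteBuffer> row = Arrays.asList(
                ByteBuffer.wrap(new byte[]{1}),     // requested column
                ByteBuffer.wrap(new byte[]{2, 3}),  // requested column
                ByteBuffer.wrap(new byte[]{4}));    // ordering-only column, never sent
            System.out.println("bytes sent for this row: " + serializedSize(row, 2));  // 5 + 6 = 11
        }
    }
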
diff --git a/src/java/org/apache/cassandra/cql3/Sets.java b/src/java/org/apache/cassandra/cql3/Sets.java
index 69bc3d3..315d7d3 100644
--- a/src/java/org/apache/cassandra/cql3/Sets.java
+++ b/src/java/org/apache/cassandra/cql3/Sets.java
@@ -22,6 +22,7 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Set;
@@ -29,11 +30,14 @@
 
 import com.google.common.base.Joiner;
 
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.db.marshal.SetType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -59,25 +63,24 @@
             this.elements = elements;
         }
 
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            validateAssignableTo(receiver);
+            validateAssignableTo(keyspace, receiver);
 
             // We've parsed empty maps as a set literal to break the ambiguity so
             // handle that case now
             if (receiver.type instanceof MapType && elements.isEmpty())
                 return new Maps.Value(Collections.<ByteBuffer, ByteBuffer>emptyMap());
 
-
             ColumnSpecification valueSpec = Sets.valueSpecOf(receiver);
             Set<Term> values = new HashSet<Term>(elements.size());
             boolean allTerminal = true;
             for (Term.Raw rt : elements)
             {
-                Term t = rt.prepare(valueSpec);
+                Term t = rt.prepare(keyspace, valueSpec);
 
                 if (t.containsBindMarker())
-                    throw new InvalidRequestException(String.format("Invalid set literal for %s: bind variables are not supported inside collection literals", receiver));
+                    throw new InvalidRequestException(String.format("Invalid set literal for %s: bind variables are not supported inside collection literals", receiver.name));
 
                 if (t instanceof Term.NonTerminal)
                     allTerminal = false;
@@ -85,10 +88,10 @@
                 values.add(t);
             }
             DelayedValue value = new DelayedValue(((SetType)receiver.type).elements, values);
-            return allTerminal ? value.bind(Collections.<ByteBuffer>emptyList()) : value;
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
         }
 
-        private void validateAssignableTo(ColumnSpecification receiver) throws InvalidRequestException
+        private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
             if (!(receiver.type instanceof SetType))
             {
@@ -97,22 +100,22 @@
                 if (receiver.type instanceof MapType && elements.isEmpty())
                     return;
 
-                throw new InvalidRequestException(String.format("Invalid set literal for %s of type %s", receiver, receiver.type.asCQL3Type()));
+                throw new InvalidRequestException(String.format("Invalid set literal for %s of type %s", receiver.name, receiver.type.asCQL3Type()));
             }
 
             ColumnSpecification valueSpec = Sets.valueSpecOf(receiver);
             for (Term.Raw rt : elements)
             {
-                if (!rt.isAssignableTo(valueSpec))
-                    throw new InvalidRequestException(String.format("Invalid set literal for %s: value %s is not of type %s", receiver, rt, valueSpec.type.asCQL3Type()));
+                if (!rt.isAssignableTo(keyspace, valueSpec))
+                    throw new InvalidRequestException(String.format("Invalid set literal for %s: value %s is not of type %s", receiver.name, rt, valueSpec.type.asCQL3Type()));
             }
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             try
             {
-                validateAssignableTo(receiver);
+                validateAssignableTo(keyspace, receiver);
                 return true;
             }
             catch (InvalidRequestException e)
@@ -137,13 +140,13 @@
             this.elements = elements;
         }
 
-        public static Value fromSerialized(ByteBuffer value, SetType type) throws InvalidRequestException
+        public static Value fromSerialized(ByteBuffer value, SetType type, int version) throws InvalidRequestException
         {
             try
             {
                 // Collections have this small hack that validate cannot be called on a serialized object,
                 // but compose does the validation (so we're fine).
-                Set<?> s = (Set<?>)type.compose(value);
+                Set<?> s = (Set<?>)type.getSerializer().deserializeForNativeProtocol(value, version);
                 Set<ByteBuffer> elements = new LinkedHashSet<ByteBuffer>(s.size());
                 for (Object element : s)
                     elements.add(type.elements.decompose(element));
@@ -155,9 +158,23 @@
             }
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
-            return CollectionType.pack(new ArrayList<ByteBuffer>(elements), elements.size());
+            return CollectionSerializer.pack(new ArrayList<ByteBuffer>(elements), elements.size(), options.getProtocolVersion());
+        }
+
+        public boolean equals(SetType st, Value v)
+        {
+            if (elements.size() != v.elements.size())
+                return false;
+
+            Iterator<ByteBuffer> thisIter = elements.iterator();
+            Iterator<ByteBuffer> thatIter = v.elements.iterator();
+            while (thisIter.hasNext())
+                if (st.elements.compare(thisIter.next(), thatIter.next()) != 0)
+                    return false;
+
+            return true;
         }
     }
 
@@ -183,12 +200,12 @@
         {
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
             Set<ByteBuffer> buffers = new TreeSet<ByteBuffer>(comparator);
             for (Term t : elements)
             {
-                ByteBuffer bytes = t.bindAndGet(values);
+                ByteBuffer bytes = t.bindAndGet(options);
 
                 if (bytes == null)
                     throw new InvalidRequestException("null is not supported inside collections");
@@ -213,44 +230,44 @@
             assert receiver.type instanceof SetType;
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer value = values.get(bindIndex);
-            return value == null ? null : Value.fromSerialized(value, (SetType)receiver.type);
+            ByteBuffer value = options.getValues().get(bindIndex);
+            return value == null ? null : Value.fromSerialized(value, (SetType)receiver.type, options.getProtocolVersion());
         }
     }
 
     public static class Setter extends Operation
     {
-        public Setter(ColumnIdentifier column, Term t)
+        public Setter(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
             // delete + add
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
-            cf.addAtom(params.makeTombstoneForOverwrite(column.build(), column.buildAsEndOfRange()));
-            Adder.doAdd(t, cf, column, params);
+            CellName name = cf.getComparator().create(prefix, column);
+            cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
+            Adder.doAdd(t, cf, prefix, column, params);
         }
     }
 
     public static class Adder extends Operation
     {
-        public Adder(ColumnIdentifier column, Term t)
+        public Adder(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            doAdd(t, cf, maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key), params);
+            doAdd(t, cf, prefix, column, params);
         }
 
-        static void doAdd(Term t, ColumnFamily cf, ColumnNameBuilder columnName, UpdateParameters params) throws InvalidRequestException
+        static void doAdd(Term t, ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal value = t.bind(params.variables);
+            Term.Terminal value = t.bind(params.options);
             if (value == null)
                 return;
 
@@ -259,22 +276,23 @@
             Set<ByteBuffer> toAdd = ((Sets.Value)value).elements;
             for (ByteBuffer bb : toAdd)
             {
-                ByteBuffer cellName = columnName.copy().add(bb).build();
+                CellName cellName = cf.getComparator().create(prefix, column, bb);
                 cf.addColumn(params.makeColumn(cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER));
             }
         }
     }
 
+    // Note that this is reused for Map subtraction too (we subtract a set from a map)
     public static class Discarder extends Operation
     {
-        public Discarder(ColumnIdentifier column, Term t)
+        public Discarder(ColumnDefinition column, Term t)
         {
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, ColumnNameBuilder prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
         {
-            Term.Terminal value = t.bind(params.variables);
+            Term.Terminal value = t.bind(params.options);
             if (value == null)
                 return;
 
@@ -283,11 +301,9 @@
                                       ? Collections.singleton(((Constants.Value)value).bytes)
                                       : ((Sets.Value)value).elements;
 
-            ColumnNameBuilder column = maybeUpdatePrefix(cf.metadata(), prefix).add(columnName.key);
             for (ByteBuffer bb : toDiscard)
             {
-                ByteBuffer cellName = column.copy().add(bb).build();
-                cf.addColumn(params.makeTombstone(cellName));
+                cf.addColumn(params.makeTombstone(cf.getComparator().create(prefix, column, bb)));
             }
         }
     }
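
Setter.execute above implements set assignment as a delete of the whole collection followed by one cell per new element, with the element bytes carried in the cell name and an empty cell value. A rough standalone model of the resulting cells (illustrative names only, not the Cassandra storage engine):

    import java.nio.ByteBuffer;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeSet;

    // Standalone sketch: overwriting a set produces one cell per element; the
    // element is the cell name and the value is empty (the real code also emits
    // a range tombstone covering the previous contents).
    public class SetOverwriteSketch
    {
        static Map<ByteBuffer, ByteBuffer> cellsFor(Set<ByteBuffer> newElements)
        {
            Map<ByteBuffer, ByteBuffer> cells = new LinkedHashMap<>();
            for (ByteBuffer element : newElements)
                cells.put(element, ByteBuffer.allocate(0));
            return cells;
        }

        public static void main(String[] args)
        {
            Set<ByteBuffer> s = new TreeSet<>();
            s.add(ByteBuffer.wrap("a".getBytes()));
            s.add(ByteBuffer.wrap("b".getBytes()));
            System.out.println("cells written: " + cellsFor(s).size());   // 2
        }
    }
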
diff --git a/src/java/org/apache/cassandra/cql3/Term.java b/src/java/org/apache/cassandra/cql3/Term.java
index 96b4b71..e5206c8 100644
--- a/src/java/org/apache/cassandra/cql3/Term.java
+++ b/src/java/org/apache/cassandra/cql3/Term.java
@@ -44,11 +44,11 @@
      * Bind the values in this term to the values contained in {@code values}.
      * This is obviously a no-op if the term is Terminal.
      *
-     * @param values the values to bind markers to.
+     * @param options the values to bind markers to.
      * @return the result of binding all the variables of this NonTerminal (or
      * 'this' if the term is terminal).
      */
-    public Terminal bind(List<ByteBuffer> values) throws InvalidRequestException;
+    public Terminal bind(QueryOptions options) throws InvalidRequestException;
 
     /**
      * A shorthand for bind(values).get().
@@ -56,7 +56,7 @@
      * object between the bind and the get (note that we still want to be able
      * to separate bind and get for collections).
      */
-    public ByteBuffer bindAndGet(List<ByteBuffer> values) throws InvalidRequestException;
+    public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException;
 
     /**
      * Whether or not that term contains at least one bind marker.
@@ -88,12 +88,12 @@
          * case this RawTerm describe a list index or a map key, etc...
          * @return the prepared term.
          */
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException;
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException;
     }
 
     public interface MultiColumnRaw extends Raw
     {
-        public Term prepare(List<? extends ColumnSpecification> receiver) throws InvalidRequestException;
+        public Term prepare(String keyspace, List<? extends ColumnSpecification> receiver) throws InvalidRequestException;
     }
 
     /**
@@ -113,7 +113,7 @@
     public abstract class Terminal implements Term
     {
         public void collectMarkerSpecification(VariableSpecifications boundNames) {}
-        public Terminal bind(List<ByteBuffer> values) { return this; }
+        public Terminal bind(QueryOptions options) { return this; }
 
         // While some NonTerminal may not have bind markers, no Term can be Terminal
         // with a bind marker
@@ -125,11 +125,11 @@
         /**
          * @return the serialized value of this terminal.
          */
-        public abstract ByteBuffer get();
+        public abstract ByteBuffer get(QueryOptions options);
 
-        public ByteBuffer bindAndGet(List<ByteBuffer> values) throws InvalidRequestException
+        public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
         {
-            return get();
+            return get(options);
         }
     }
 
@@ -150,10 +150,10 @@
      */
     public abstract class NonTerminal implements Term
     {
-        public ByteBuffer bindAndGet(List<ByteBuffer> values) throws InvalidRequestException
+        public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
         {
-            Terminal t = bind(values);
-            return t == null ? null : t.get();
+            Terminal t = bind(options);
+            return t == null ? null : t.get(options);
         }
     }
 }
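
The Term hierarchy above now threads QueryOptions through bind(): a Terminal binds to itself, while NonTerminal.bindAndGet is just bind(options) followed by get(options). A condensed standalone sketch of that contract (simplified types, not the real interfaces):

    import java.nio.ByteBuffer;

    public class TermSketch
    {
        interface Opts { ByteBuffer valueAt(int index); }

        interface Term
        {
            Terminal bind(Opts options);
            ByteBuffer bindAndGet(Opts options);
        }

        static abstract class Terminal implements Term
        {
            public Terminal bind(Opts options) { return this; }                 // already a value
            public ByteBuffer bindAndGet(Opts options) { return get(options); }
            abstract ByteBuffer get(Opts options);
        }

        static abstract class NonTerminal implements Term
        {
            public ByteBuffer bindAndGet(Opts options)
            {
                Terminal t = bind(options);                                     // resolve bind markers first
                return t == null ? null : t.get(options);
            }
        }

        // A bind marker: looks its value up in the options at bind time.
        static final class Marker extends NonTerminal
        {
            private final int index;
            Marker(int index) { this.index = index; }

            public Terminal bind(Opts options)
            {
                final ByteBuffer v = options.valueAt(index);
                return v == null ? null : new Terminal() { ByteBuffer get(Opts o) { return v; } };
            }
        }
    }
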
diff --git a/src/java/org/apache/cassandra/cql3/Tuples.java b/src/java/org/apache/cassandra/cql3/Tuples.java
index 818e3e6..883cc60 100644
--- a/src/java/org/apache/cassandra/cql3/Tuples.java
+++ b/src/java/org/apache/cassandra/cql3/Tuples.java
@@ -17,17 +17,15 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.ListType;
-import org.apache.cassandra.db.marshal.TupleType;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.serializers.MarshalException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.MarshalException;
 
 /**
  * Static helper methods and classes for tuples.
@@ -36,6 +34,16 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(Tuples.class);
 
+    private Tuples() {}
+
+    public static ColumnSpecification componentSpecOf(ColumnSpecification column, int component)
+    {
+        return new ColumnSpecification(column.ksName,
+                                       column.cfName,
+                                       new ColumnIdentifier(String.format("%s[%d]", column.name, component), true),
+                                       ((TupleType)column.type).type(component));
+    }
+
     /**
      * A raw, literal tuple.  When prepared, this will become a Tuples.Value or Tuples.DelayedValue, depending
      * on whether the tuple holds NonTerminals.
@@ -49,34 +57,75 @@
             this.elements = elements;
         }
 
-        public Term prepare(List<? extends ColumnSpecification> receivers) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            if (elements.size() != receivers.size())
-                throw new InvalidRequestException(String.format("Expected %d elements in value tuple, but got %d: %s", receivers.size(), elements.size(), this));
+            validateAssignableTo(keyspace, receiver);
 
             List<Term> values = new ArrayList<>(elements.size());
             boolean allTerminal = true;
             for (int i = 0; i < elements.size(); i++)
             {
-                Term t = elements.get(i).prepare(receivers.get(i));
+                Term value = elements.get(i).prepare(keyspace, componentSpecOf(receiver, i));
+                if (value instanceof Term.NonTerminal)
+                    allTerminal = false;
+
+                values.add(value);
+            }
+            DelayedValue value = new DelayedValue((TupleType)receiver.type, values);
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
+        }
+
+        public Term prepare(String keyspace, List<? extends ColumnSpecification> receivers) throws InvalidRequestException
+        {
+            if (elements.size() != receivers.size())
+                throw new InvalidRequestException(String.format("Expected %d elements in value tuple, but got %d: %s", receivers.size(), elements.size(), this));
+
+            List<Term> values = new ArrayList<>(elements.size());
+            List<AbstractType<?>> types = new ArrayList<>(elements.size());
+            boolean allTerminal = true;
+            for (int i = 0; i < elements.size(); i++)
+            {
+                Term t = elements.get(i).prepare(keyspace, receivers.get(i));
                 if (t instanceof Term.NonTerminal)
                     allTerminal = false;
 
                 values.add(t);
+                types.add(receivers.get(i).type);
             }
-            DelayedValue value = new DelayedValue(values);
-            return allTerminal ? value.bind(Collections.<ByteBuffer>emptyList()) : value;
+            DelayedValue value = new DelayedValue(new TupleType(types), values);
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
         }
 
-        public Term prepare(ColumnSpecification receiver)
+        private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            throw new AssertionError("Tuples.Literal instances require a list of receivers for prepare()");
+            if (!(receiver.type instanceof TupleType))
+                throw new InvalidRequestException(String.format("Invalid tuple type literal for %s of type %s", receiver.name, receiver.type.asCQL3Type()));
+
+            TupleType tt = (TupleType)receiver.type;
+            for (int i = 0; i < elements.size(); i++)
+            {
+                if (i >= tt.size())
+                    throw new InvalidRequestException(String.format("Invalid tuple literal for %s: too many elements. Type %s expects %d but got %d",
+                                                                    receiver.name, tt.asCQL3Type(), tt.size(), elements.size()));
+
+                Term.Raw value = elements.get(i);
+                ColumnSpecification spec = componentSpecOf(receiver, i);
+                if (!value.isAssignableTo(keyspace, spec))
+                    throw new InvalidRequestException(String.format("Invalid tuple literal for %s: component %d is not of type %s", receiver.name, i, spec.type.asCQL3Type()));
+            }
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
-            // tuples shouldn't be assignable to anything right now
-            return false;
+            try
+            {
+                validateAssignableTo(keyspace, receiver);
+                return true;
+            }
+            catch (InvalidRequestException e)
+            {
+                return false;
+            }
         }
 
         @Override
@@ -103,9 +152,9 @@
             return new Value(type.split(bytes));
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
-            throw new UnsupportedOperationException();
+            return TupleType.buildValue(elements);
         }
 
         public List<ByteBuffer> getElements()
@@ -119,10 +168,12 @@
      */
     public static class DelayedValue extends Term.NonTerminal
     {
+        public final TupleType type;
         public final List<Term> elements;
 
-        public DelayedValue(List<Term> elements)
+        public DelayedValue(TupleType type, List<Term> elements)
         {
+            this.type = type;
             this.elements = elements;
         }
 
@@ -141,18 +192,32 @@
                 term.collectMarkerSpecification(boundNames);
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        private ByteBuffer[] bindInternal(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer[] buffers = new ByteBuffer[elements.size()];
-            for (int i=0; i < elements.size(); i++)
-            {
-                ByteBuffer bytes = elements.get(i).bindAndGet(values);
-                if (bytes == null)
-                    throw new InvalidRequestException("Tuples may not contain null values");
+            int version = options.getProtocolVersion();
 
-                buffers[i] = elements.get(i).bindAndGet(values);
+            ByteBuffer[] buffers = new ByteBuffer[elements.size()];
+            for (int i = 0; i < elements.size(); i++)
+            {
+                buffers[i] = elements.get(i).bindAndGet(options);
+                // Inside tuples, we must force the serialization of collections to v3 whatever protocol
+                // version is in use, since we're going to store that serialized value directly.
+                if (version < 3 && type.type(i).isCollection())
+                    buffers[i] = ((CollectionType)type.type(i)).getSerializer().reserializeToV3(buffers[i]);
             }
-            return new Value(buffers);
+            return buffers;
+        }
+
+        public Value bind(QueryOptions options) throws InvalidRequestException
+        {
+            return new Value(bindInternal(options));
+        }
+
+        @Override
+        public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
+        {
+            // We don't "need" that override but it saves us the allocation of a Value object if used
+            return TupleType.buildValue(bindInternal(options));
         }
 
         @Override
@@ -175,13 +240,13 @@
             this.elements = items;
         }
 
-        public static InValue fromSerialized(ByteBuffer value, ListType type) throws InvalidRequestException
+        public static InValue fromSerialized(ByteBuffer value, ListType type, QueryOptions options) throws InvalidRequestException
         {
             try
             {
                 // Collections have this small hack that validate cannot be called on a serialized object,
-                // but compose does the validation (so we're fine).
-                List<?> l = (List<?>)type.compose(value);
+                // but the deserialization does the validation (so we're fine).
+                List<?> l = (List<?>)type.getSerializer().deserializeForNativeProtocol(value, options.getProtocolVersion());
 
                 assert type.elements instanceof TupleType;
                 TupleType tupleType = (TupleType) type.elements;
@@ -198,7 +263,7 @@
             }
         }
 
-        public ByteBuffer get()
+        public ByteBuffer get(QueryOptions options)
         {
             throw new UnsupportedOperationException();
         }
@@ -239,13 +304,13 @@
             return new ColumnSpecification(receivers.get(0).ksName, receivers.get(0).cfName, identifier, type);
         }
 
-        public AbstractMarker prepare(List<? extends ColumnSpecification> receivers) throws InvalidRequestException
+        public AbstractMarker prepare(String keyspace, List<? extends ColumnSpecification> receivers) throws InvalidRequestException
         {
             return new Tuples.Marker(bindIndex, makeReceiver(receivers));
         }
 
         @Override
-        public AbstractMarker prepare(ColumnSpecification receiver)
+        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver)
         {
             throw new AssertionError("Tuples.Raw.prepare() requires a list of receivers");
         }
@@ -283,13 +348,13 @@
             return new ColumnSpecification(receivers.get(0).ksName, receivers.get(0).cfName, identifier, ListType.getInstance(type));
         }
 
-        public AbstractMarker prepare(List<? extends ColumnSpecification> receivers) throws InvalidRequestException
+        public AbstractMarker prepare(String keyspace, List<? extends ColumnSpecification> receivers) throws InvalidRequestException
         {
             return new InMarker(bindIndex, makeInReceiver(receivers));
         }
 
         @Override
-        public AbstractMarker prepare(ColumnSpecification receiver)
+        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver)
         {
             throw new AssertionError("Tuples.INRaw.prepare() requires a list of receivers");
         }
@@ -305,9 +370,9 @@
             super(bindIndex, receiver);
         }
 
-        public Value bind(List<ByteBuffer> values) throws InvalidRequestException
+        public Value bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer value = values.get(bindIndex);
+            ByteBuffer value = options.getValues().get(bindIndex);
             return value == null ? null : Value.fromSerialized(value, (TupleType)receiver.type);
         }
     }
@@ -323,10 +388,10 @@
             assert receiver.type instanceof ListType;
         }
 
-        public InValue bind(List<ByteBuffer> values) throws InvalidRequestException
+        public InValue bind(QueryOptions options) throws InvalidRequestException
         {
-            ByteBuffer value = values.get(bindIndex);
-            return value == null ? null : InValue.fromSerialized(value, (ListType)receiver.type);
+            ByteBuffer value = options.getValues().get(bindIndex);
+            return value == null ? null : InValue.fromSerialized(value, (ListType)receiver.type, options);
         }
     }
 
@@ -343,4 +408,4 @@
         sb.append(')');
         return sb.toString();
     }
-}
\ No newline at end of file
+}
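
DelayedValue above binds each component and then packs them into one serialized tuple value. A simplified standalone sketch of that packing, assuming the common framing of a 4-byte length prefix per component with -1 standing in for null (a simplification of TupleType.buildValue):

    import java.nio.ByteBuffer;

    // Standalone sketch: concatenate each bound component as [int length][bytes],
    // with length -1 for a null component.
    public class TupleBuildSketch
    {
        static ByteBuffer buildValue(ByteBuffer... components)
        {
            int size = 0;
            for (ByteBuffer bb : components)
                size += 4 + (bb == null ? 0 : bb.remaining());

            ByteBuffer out = ByteBuffer.allocate(size);
            for (ByteBuffer bb : components)
            {
                if (bb == null)
                {
                    out.putInt(-1);
                }
                else
                {
                    out.putInt(bb.remaining());
                    out.put(bb.duplicate());          // duplicate so the caller's position is untouched
                }
            }
            out.flip();
            return out;
        }

        public static void main(String[] args)
        {
            ByteBuffer t = buildValue(ByteBuffer.wrap(new byte[]{1, 2}), null, ByteBuffer.wrap(new byte[]{3}));
            System.out.println("serialized tuple length: " + t.remaining());   // 6 + 4 + 5 = 15
        }
    }
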
diff --git a/src/java/org/apache/cassandra/cql3/TypeCast.java b/src/java/org/apache/cassandra/cql3/TypeCast.java
index 64261fa..e325e4d 100644
--- a/src/java/org/apache/cassandra/cql3/TypeCast.java
+++ b/src/java/org/apache/cassandra/cql3/TypeCast.java
@@ -21,34 +21,41 @@
 
 public class TypeCast implements Term.Raw
 {
-    private final CQL3Type type;
+    private final CQL3Type.Raw type;
     private final Term.Raw term;
 
-    public TypeCast(CQL3Type type, Term.Raw term)
+    public TypeCast(CQL3Type.Raw type, Term.Raw term)
     {
         this.type = type;
         this.term = term;
     }
 
-    public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+    public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
     {
-        if (!term.isAssignableTo(castedSpecOf(receiver)))
+        if (!term.isAssignableTo(keyspace, castedSpecOf(keyspace, receiver)))
             throw new InvalidRequestException(String.format("Cannot cast value %s to type %s", term, type));
 
-        if (!isAssignableTo(receiver))
+        if (!isAssignableTo(keyspace, receiver))
             throw new InvalidRequestException(String.format("Cannot assign value %s to %s of type %s", this, receiver, receiver.type.asCQL3Type()));
 
-        return term.prepare(receiver);
+        return term.prepare(keyspace, receiver);
     }
 
-    private ColumnSpecification castedSpecOf(ColumnSpecification receiver)
+    private ColumnSpecification castedSpecOf(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
     {
-        return new ColumnSpecification(receiver.ksName, receiver.cfName, new ColumnIdentifier(toString(), true), type.getType());
+        return new ColumnSpecification(receiver.ksName, receiver.cfName, new ColumnIdentifier(toString(), true), type.prepare(keyspace).getType());
     }
 
-    public boolean isAssignableTo(ColumnSpecification receiver)
+    public boolean isAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
     {
-        return receiver.type.isValueCompatibleWith(type.getType());
+        try
+        {
+            return receiver.type.isValueCompatibleWith(type.prepare(keyspace).getType());
+        }
+        catch (InvalidRequestException e)
+        {
+            throw new AssertionError();
+        }
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/cql3/UTName.java b/src/java/org/apache/cassandra/cql3/UTName.java
new file mode 100644
index 0000000..c856797
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/UTName.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+
+public class UTName
+{
+    private String ksName;
+    private final ColumnIdentifier utName;
+
+    public UTName(ColumnIdentifier ksName, ColumnIdentifier utName)
+    {
+        this.ksName = ksName == null ? null : ksName.toString();
+        this.utName = utName;
+    }
+
+    public boolean hasKeyspace()
+    {
+        return ksName != null;
+    }
+
+    public void setKeyspace(String keyspace)
+    {
+        this.ksName = keyspace;
+    }
+
+    public String getKeyspace()
+    {
+        return ksName;
+    }
+
+    public ByteBuffer getUserTypeName()
+    {
+        return utName.bytes;
+    }
+
+    public String getStringTypeName()
+    {
+        return utName.toString();
+    }
+
+    @Override
+    public String toString()
+    {
+        return (hasKeyspace() ? (ksName + ".") : "") + utName;
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
index e5bd863..42d0cb8 100644
--- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
@@ -24,49 +24,176 @@
 
 import com.google.common.collect.AbstractIterator;
 
+import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.service.pager.QueryPager;
 
 /** a utility for doing internal cql-based queries */
-public class UntypedResultSet implements Iterable<UntypedResultSet.Row>
+public abstract class UntypedResultSet implements Iterable<UntypedResultSet.Row>
 {
-    private final ResultSet cqlRows;
-
-    public UntypedResultSet(ResultSet cqlRows)
+    public static UntypedResultSet create(ResultSet rs)
     {
-        this.cqlRows = cqlRows;
+        return new FromResultSet(rs);
+    }
+
+    public static UntypedResultSet create(List<Map<String, ByteBuffer>> results)
+    {
+        return new FromResultList(results);
+    }
+
+    public static UntypedResultSet create(SelectStatement select, QueryPager pager, int pageSize)
+    {
+        return new FromPager(select, pager, pageSize);
     }
 
     public boolean isEmpty()
     {
-        return cqlRows.size() == 0;
+        return size() == 0;
     }
 
-    public int size()
-    {
-        return cqlRows.size();
-    }
+    public abstract int size();
+    public abstract Row one();
 
-    public Row one()
-    {
-        if (cqlRows.rows.size() != 1)
-            throw new IllegalStateException("One row required, " + cqlRows.rows.size() + " found");
-        return new Row(cqlRows.metadata.names, cqlRows.rows.get(0));
-    }
+    // Not implemented by all subclasses, but we use it when we know it's there (for tests)
+    public abstract List<ColumnSpecification> metadata();
 
-    public Iterator<Row> iterator()
+    private static class FromResultSet extends UntypedResultSet
     {
-        return new AbstractIterator<Row>()
+        private final ResultSet cqlRows;
+
+        private FromResultSet(ResultSet cqlRows)
         {
-            Iterator<List<ByteBuffer>> iter = cqlRows.rows.iterator();
+            this.cqlRows = cqlRows;
+        }
 
-            protected Row computeNext()
+        public int size()
+        {
+            return cqlRows.size();
+        }
+
+        public Row one()
+        {
+            if (cqlRows.rows.size() != 1)
+                throw new IllegalStateException("One row required, " + cqlRows.rows.size() + " found");
+            return new Row(cqlRows.metadata.names, cqlRows.rows.get(0));
+        }
+
+        public Iterator<Row> iterator()
+        {
+            return new AbstractIterator<Row>()
             {
-                if (!iter.hasNext())
-                    return endOfData();
-                return new Row(cqlRows.metadata.names, iter.next());
-            }
-        };
+                Iterator<List<ByteBuffer>> iter = cqlRows.rows.iterator();
+
+                protected Row computeNext()
+                {
+                    if (!iter.hasNext())
+                        return endOfData();
+                    return new Row(cqlRows.metadata.names, iter.next());
+                }
+            };
+        }
+
+        public List<ColumnSpecification> metadata()
+        {
+            return cqlRows.metadata.names;
+        }
+    }
+
+    private static class FromResultList extends UntypedResultSet
+    {
+        private final List<Map<String, ByteBuffer>> cqlRows;
+
+        private FromResultList(List<Map<String, ByteBuffer>> cqlRows)
+        {
+            this.cqlRows = cqlRows;
+        }
+
+        public int size()
+        {
+            return cqlRows.size();
+        }
+
+        public Row one()
+        {
+            if (cqlRows.size() != 1)
+                throw new IllegalStateException("One row required, " + cqlRows.size() + " found");
+            return new Row(cqlRows.get(0));
+        }
+
+        public Iterator<Row> iterator()
+        {
+            return new AbstractIterator<Row>()
+            {
+                Iterator<Map<String, ByteBuffer>> iter = cqlRows.iterator();
+
+                protected Row computeNext()
+                {
+                    if (!iter.hasNext())
+                        return endOfData();
+                    return new Row(iter.next());
+                }
+            };
+        }
+
+        public List<ColumnSpecification> metadata()
+        {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    private static class FromPager extends UntypedResultSet
+    {
+        private final SelectStatement select;
+        private final QueryPager pager;
+        private final int pageSize;
+        private final List<ColumnSpecification> metadata;
+
+        private FromPager(SelectStatement select, QueryPager pager, int pageSize)
+        {
+            this.select = select;
+            this.pager = pager;
+            this.pageSize = pageSize;
+            this.metadata = select.getResultMetadata().names;
+        }
+
+        public int size()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Row one()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Iterator<Row> iterator()
+        {
+            return new AbstractIterator<Row>()
+            {
+                private Iterator<List<ByteBuffer>> currentPage;
+
+                protected Row computeNext()
+                {
+                    try {
+                        while (currentPage == null || !currentPage.hasNext())
+                        {
+                            if (pager.isExhausted())
+                                return endOfData();
+                            currentPage = select.process(pager.fetchPage(pageSize)).rows.iterator();
+                        }
+                        return new Row(metadata, currentPage.next());
+                    } catch (RequestValidationException | RequestExecutionException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+            };
+        }
+
+        public List<ColumnSpecification> metadata()
+        {
+            return metadata;
+        }
     }
 
     public static class Row
@@ -83,7 +210,7 @@
         {
             this.columns.addAll(names);
             for (int i = 0; i < names.size(); i++)
-                data.put(names.get(i).toString(), columns.get(i));
+                data.put(names.get(i).name.toString(), columns.get(i));
         }
 
         public boolean has(String column)
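
FromPager above walks a query page by page, fetching the next page only once the current one is drained. A standalone sketch of that iterator pattern (the Pager interface below is illustrative, not the Cassandra QueryPager API):

    import java.util.Iterator;
    import java.util.List;
    import java.util.NoSuchElementException;

    // Standalone sketch of the FromPager pattern: drain the current page and only
    // then ask the pager for another one, until it reports exhaustion.
    public class PagedIteratorSketch<T> implements Iterator<T>
    {
        public interface Pager<R>
        {
            boolean isExhausted();
            List<R> fetchPage(int pageSize);
        }

        private final Pager<T> pager;
        private final int pageSize;
        private Iterator<T> currentPage;

        public PagedIteratorSketch(Pager<T> pager, int pageSize)
        {
            this.pager = pager;
            this.pageSize = pageSize;
        }

        public boolean hasNext()
        {
            while (currentPage == null || !currentPage.hasNext())
            {
                if (pager.isExhausted())
                    return false;
                currentPage = pager.fetchPage(pageSize).iterator();
            }
            return true;
        }

        public T next()
        {
            if (!hasNext())
                throw new NoSuchElementException();
            return currentPage.next();
        }
    }
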
diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
index c543d6c..d31b8d9 100644
--- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java
+++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
@@ -23,10 +23,11 @@
 import java.util.Map;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.statements.ColumnGroupMap;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * A simple container that simplify passing parameters for collections methods.
@@ -34,62 +35,62 @@
 public class UpdateParameters
 {
     public final CFMetaData metadata;
-    public final List<ByteBuffer> variables;
+    public final QueryOptions options;
     public final long timestamp;
     private final int ttl;
     public final int localDeletionTime;
 
     // For lists operation that require a read-before-write. Will be null otherwise.
-    private final Map<ByteBuffer, ColumnGroupMap> prefetchedLists;
+    private final Map<ByteBuffer, CQL3Row> prefetchedLists;
 
-    public UpdateParameters(CFMetaData metadata, List<ByteBuffer> variables, long timestamp, int ttl, Map<ByteBuffer, ColumnGroupMap> prefetchedLists)
+    public UpdateParameters(CFMetaData metadata, QueryOptions options, long timestamp, int ttl, Map<ByteBuffer, CQL3Row> prefetchedLists)
     {
         this.metadata = metadata;
-        this.variables = variables;
+        this.options = options;
         this.timestamp = timestamp;
         this.ttl = ttl;
         this.localDeletionTime = (int)(System.currentTimeMillis() / 1000);
         this.prefetchedLists = prefetchedLists;
     }
 
-    public Column makeColumn(ByteBuffer name, ByteBuffer value) throws InvalidRequestException
+    public Cell makeColumn(CellName name, ByteBuffer value) throws InvalidRequestException
     {
-        QueryProcessor.validateCellName(name);
-        return Column.create(name, value, timestamp, ttl, metadata);
+        QueryProcessor.validateCellName(name, metadata.comparator);
+        return AbstractCell.create(name, value, timestamp, ttl, metadata);
     }
 
-    public Column makeCounter(ByteBuffer name, long delta) throws InvalidRequestException
+     public Cell makeCounter(CellName name, long delta) throws InvalidRequestException
+     {
+         QueryProcessor.validateCellName(name, metadata.comparator);
+         return new BufferCounterUpdateCell(name, delta, FBUtilities.timestampMicros());
+     }
+
+    public Cell makeTombstone(CellName name) throws InvalidRequestException
     {
-        QueryProcessor.validateCellName(name);
-        return new CounterUpdateColumn(name, delta, System.currentTimeMillis());
+        QueryProcessor.validateCellName(name, metadata.comparator);
+        return new BufferDeletedCell(name, localDeletionTime, timestamp);
     }
 
-    public Column makeTombstone(ByteBuffer name) throws InvalidRequestException
+    public RangeTombstone makeRangeTombstone(ColumnSlice slice) throws InvalidRequestException
     {
-        QueryProcessor.validateCellName(name);
-        return new DeletedColumn(name, localDeletionTime, timestamp);
+        QueryProcessor.validateComposite(slice.start, metadata.comparator);
+        QueryProcessor.validateComposite(slice.finish, metadata.comparator);
+        return new RangeTombstone(slice.start, slice.finish, timestamp, localDeletionTime);
     }
 
-    public RangeTombstone makeRangeTombstone(ByteBuffer start, ByteBuffer end) throws InvalidRequestException
+    public RangeTombstone makeTombstoneForOverwrite(ColumnSlice slice) throws InvalidRequestException
     {
-        QueryProcessor.validateCellName(start);
-        QueryProcessor.validateCellName(end);
-        return new RangeTombstone(start, end, timestamp, localDeletionTime);
+        QueryProcessor.validateComposite(slice.start, metadata.comparator);
+        QueryProcessor.validateComposite(slice.finish, metadata.comparator);
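+        // Use timestamp - 1 so this tombstone doesn't shadow the overwriting value written at 'timestamp'.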
+        return new RangeTombstone(slice.start, slice.finish, timestamp - 1, localDeletionTime);
     }
 
-    public RangeTombstone makeTombstoneForOverwrite(ByteBuffer start, ByteBuffer end) throws InvalidRequestException
-    {
-        QueryProcessor.validateCellName(start);
-        QueryProcessor.validateCellName(end);
-        return new RangeTombstone(start, end, timestamp - 1, localDeletionTime);
-    }
-
-    public List<Pair<ByteBuffer, Column>> getPrefetchedList(ByteBuffer rowKey, ByteBuffer cql3ColumnName)
+    public List<Cell> getPrefetchedList(ByteBuffer rowKey, ColumnIdentifier cql3ColumnName)
     {
         if (prefetchedLists == null)
             return Collections.emptyList();
 
-        ColumnGroupMap m = prefetchedLists.get(rowKey);
-        return m == null ? Collections.<Pair<ByteBuffer, Column>>emptyList() : m.getCollection(cql3ColumnName);
+        CQL3Row row = prefetchedLists.get(rowKey);
+        return row == null ? Collections.<Cell>emptyList() : row.getCollection(cql3ColumnName);
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/UserTypes.java b/src/java/org/apache/cassandra/cql3/UserTypes.java
new file mode 100644
index 0000000..9d66c16
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/UserTypes.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+/**
+ * Static helper methods and classes for user types.
+ */
+public abstract class UserTypes
+{
+    private UserTypes() {}
+
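+    // Builds the ColumnSpecification for a single field of a user-type-valued column, named
+    // "<column>.<field>" and carrying that field's type.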
+    public static ColumnSpecification fieldSpecOf(ColumnSpecification column, int field)
+    {
+        UserType ut = (UserType)column.type;
+        return new ColumnSpecification(column.ksName,
+                                       column.cfName,
+                                       new ColumnIdentifier(column.name + "." + UTF8Type.instance.compose(ut.fieldName(field)), true),
+                                       ut.fieldType(field));
+    }
+
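+    // A UDT literal as written in a query (e.g. { street: 'Main St', city: 'Dublin' }), kept as a
+    // map from field name to the raw term provided for that field.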
+    public static class Literal implements Term.Raw
+    {
+        public final Map<ColumnIdentifier, Term.Raw> entries;
+
+        public Literal(Map<ColumnIdentifier, Term.Raw> entries)
+        {
+            this.entries = entries;
+        }
+
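+        // Prepares each declared field of the receiver's type in order: fields missing from the literal
+        // default to NULL_LITERAL, and entries that don't match any declared field are rejected.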
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
+        {
+            validateAssignableTo(keyspace, receiver);
+
+            UserType ut = (UserType)receiver.type;
+            boolean allTerminal = true;
+            List<Term> values = new ArrayList<>(entries.size());
+            int foundValues = 0;
+            for (int i = 0; i < ut.size(); i++)
+            {
+                ColumnIdentifier field = new ColumnIdentifier(ut.fieldName(i), UTF8Type.instance);
+                Term.Raw raw = entries.get(field);
+                if (raw == null)
+                    raw = Constants.NULL_LITERAL;
+                else
+                    ++foundValues;
+                Term value = raw.prepare(keyspace, fieldSpecOf(receiver, i));
+
+                if (value instanceof Term.NonTerminal)
+                    allTerminal = false;
+
+                values.add(value);
+            }
+            if (foundValues != entries.size())
+            {
+                // We had some fields that are not part of the type
+                for (ColumnIdentifier id : entries.keySet())
+                    if (!ut.fieldNames().contains(id.bytes))
+                        throw new InvalidRequestException(String.format("Unknown field '%s' in value of user defined type %s", id, ut.getNameAsString()));
+            }
+
+            DelayedValue value = new DelayedValue(((UserType)receiver.type), values);
+            return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
+        }
+
+        private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
+        {
+            if (!(receiver.type instanceof UserType))
+                throw new InvalidRequestException(String.format("Invalid user type literal for %s of type %s", receiver, receiver.type.asCQL3Type()));
+
+            UserType ut = (UserType)receiver.type;
+            for (int i = 0; i < ut.size(); i++)
+            {
+                ColumnIdentifier field = new ColumnIdentifier(ut.fieldName(i), UTF8Type.instance);
+                Term.Raw value = entries.get(field);
+                if (value == null)
+                    continue;
+
+                ColumnSpecification fieldSpec = fieldSpecOf(receiver, i);
+                if (!value.isAssignableTo(keyspace, fieldSpec))
+                    throw new InvalidRequestException(String.format("Invalid user type literal for %s: field %s is not of type %s", receiver, field, fieldSpec.type.asCQL3Type()));
+            }
+        }
+
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
+        {
+            try
+            {
+                validateAssignableTo(keyspace, receiver);
+                return true;
+            }
+            catch (InvalidRequestException e)
+            {
+                return false;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append("{");
+            Iterator<Map.Entry<ColumnIdentifier, Term.Raw>> iter = entries.entrySet().iterator();
+            while (iter.hasNext())
+            {
+                Map.Entry<ColumnIdentifier, Term.Raw> entry = iter.next();
+                sb.append(entry.getKey()).append(":").append(entry.getValue());
+                if (iter.hasNext())
+                    sb.append(", ");
+            }
+            sb.append("}");
+            return sb.toString();
+        }
+    }
+
+    // Same purpose as Lists.DelayedValue, except that we do handle bind markers in this case
+    public static class DelayedValue extends Term.NonTerminal
+    {
+        private final UserType type;
+        private final List<Term> values;
+
+        public DelayedValue(UserType type, List<Term> values)
+        {
+            this.type = type;
+            this.values = values;
+        }
+
+        public boolean containsBindMarker()
+        {
+            for (Term t : values)
+                if (t.containsBindMarker())
+                    return true;
+            return false;
+        }
+
+        public void collectMarkerSpecification(VariableSpecifications boundNames)
+        {
+            for (int i = 0; i < type.size(); i++)
+                values.get(i).collectMarkerSpecification(boundNames);
+        }
+
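+        // Binds each field value and returns the serialized buffers in field-declaration order;
+        // a null buffer denotes a field that was left null.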
+        private ByteBuffer[] bindInternal(QueryOptions options) throws InvalidRequestException
+        {
+            int version = options.getProtocolVersion();
+
+            ByteBuffer[] buffers = new ByteBuffer[values.size()];
+            for (int i = 0; i < type.size(); i++)
+            {
+                buffers[i] = values.get(i).bindAndGet(options);
+                // Inside UDT values, we must force the serialization of collections to v3, whatever protocol
+                // version is in use, since we're going to store that serialized value directly.
+                if (version < 3 && type.fieldType(i).isCollection() && buffers[i] != null)
+                    buffers[i] = ((CollectionType)type.fieldType(i)).getSerializer().reserializeToV3(buffers[i]);
+            }
+            return buffers;
+        }
+
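+        // The terminal value is the whole UDT serialized into a single buffer (see UserType.buildValue).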
+        public Constants.Value bind(QueryOptions options) throws InvalidRequestException
+        {
+            return new Constants.Value(bindAndGet(options));
+        }
+
+        @Override
+        public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
+        {
+            return UserType.buildValue(bindInternal(options));
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/VariableSpecifications.java b/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
index 297999a..ef78619 100644
--- a/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
+++ b/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
@@ -24,7 +24,6 @@
 {
     private final List<ColumnIdentifier> variableNames;
     private final ColumnSpecification[] specs;
-    private int collectedCount;
 
     public VariableSpecifications(List<ColumnIdentifier> variableNames)
     {
@@ -49,11 +48,11 @@
         if (name != null)
             spec = new ColumnSpecification(spec.ksName, spec.cfName, name, spec.type);
         specs[bindIndex] = spec;
-        collectedCount++;
     }
 
-    public int getCollectedCount()
+    @Override
+    public String toString()
     {
-        return collectedCount;
+        return Arrays.toString(specs);
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
index 66e498f..4ae7c98 100644
--- a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
+++ b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
@@ -48,19 +48,19 @@
             t.collectMarkerSpecification(boundNames);
     }
 
-    public Term.Terminal bind(List<ByteBuffer> values) throws InvalidRequestException
+    public Term.Terminal bind(QueryOptions options) throws InvalidRequestException
     {
-        return makeTerminal(fun, bindAndGet(values));
+        return makeTerminal(fun, bindAndGet(options), options.getProtocolVersion());
     }
 
-    public ByteBuffer bindAndGet(List<ByteBuffer> values) throws InvalidRequestException
+    public ByteBuffer bindAndGet(QueryOptions options) throws InvalidRequestException
     {
         List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(terms.size());
         for (Term t : terms)
         {
             // For now, we don't allow nulls as arguments, as no existing function needs them and it
             // simplifies things.
-            ByteBuffer val = t.bindAndGet(values);
+            ByteBuffer val = t.bindAndGet(options);
             if (val == null)
                 throw new InvalidRequestException(String.format("Invalid null value for argument to %s", fun));
             buffers.add(val);
@@ -95,16 +95,16 @@
         return false;
     }
 
-    private static Term.Terminal makeTerminal(Function fun, ByteBuffer result) throws InvalidRequestException
+    private static Term.Terminal makeTerminal(Function fun, ByteBuffer result, int version) throws InvalidRequestException
     {
         if (!(fun.returnType() instanceof CollectionType))
             return new Constants.Value(result);
 
         switch (((CollectionType)fun.returnType()).kind)
         {
-            case LIST: return Lists.Value.fromSerialized(result, (ListType)fun.returnType());
-            case SET:  return Sets.Value.fromSerialized(result, (SetType)fun.returnType());
-            case MAP:  return Maps.Value.fromSerialized(result, (MapType)fun.returnType());
+            case LIST: return Lists.Value.fromSerialized(result, (ListType)fun.returnType(), version);
+            case SET:  return Sets.Value.fromSerialized(result, (SetType)fun.returnType(), version);
+            case MAP:  return Maps.Value.fromSerialized(result, (MapType)fun.returnType(), version);
         }
         throw new AssertionError();
     }
@@ -120,15 +120,15 @@
             this.terms = terms;
         }
 
-        public Term prepare(ColumnSpecification receiver) throws InvalidRequestException
+        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            Function fun = Functions.get(functionName, terms, receiver);
+            Function fun = Functions.get(keyspace, functionName, terms, receiver);
 
             List<Term> parameters = new ArrayList<Term>(terms.size());
             boolean allTerminal = true;
             for (int i = 0; i < terms.size(); i++)
             {
-                Term t = terms.get(i).prepare(Functions.makeArgSpec(receiver, fun, i));
+                Term t = terms.get(i).prepare(keyspace, Functions.makeArgSpec(receiver, fun, i));
                 if (t instanceof NonTerminal)
                     allTerminal = false;
                 parameters.add(t);
@@ -137,7 +137,7 @@
             // If all parameters are terminal and the function is pure, we can
             // evaluate it now, otherwise we'd have to wait execution time
             return allTerminal && fun.isPure()
-                ? makeTerminal(fun, execute(fun, parameters))
+                ? makeTerminal(fun, execute(fun, parameters), QueryOptions.DEFAULT.getProtocolVersion())
                 : new FunctionCall(fun, parameters);
         }
 
@@ -148,13 +148,13 @@
             for (Term t : parameters)
             {
                 assert t instanceof Term.Terminal;
-                buffers.add(((Term.Terminal)t).get());
+                buffers.add(((Term.Terminal)t).get(QueryOptions.DEFAULT));
             }
 
             return executeInternal(fun, buffers);
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
         {
             AbstractType<?> returnType = Functions.getReturnType(functionName, receiver.ksName, receiver.cfName);
             // Note: if returnType == null, it means the function doesn't exist. We may get this if an undefined function
diff --git a/src/java/org/apache/cassandra/cql3/functions/Functions.java b/src/java/org/apache/cassandra/cql3/functions/Functions.java
index 4f108cb..605e7b3 100644
--- a/src/java/org/apache/cassandra/cql3/functions/Functions.java
+++ b/src/java/org/apache/cassandra/cql3/functions/Functions.java
@@ -78,7 +78,7 @@
                 fun.argsType().get(i));
     }
 
-    public static Function get(String name, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver) throws InvalidRequestException
+    public static Function get(String keyspace, String name, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver) throws InvalidRequestException
     {
         List<Function.Factory> factories = declared.get(name.toLowerCase());
         if (factories.isEmpty())
@@ -88,7 +88,7 @@
         if (factories.size() == 1)
         {
             Function fun = factories.get(0).create(receiver.ksName, receiver.cfName);
-            validateTypes(fun, providedArgs, receiver);
+            validateTypes(keyspace, fun, providedArgs, receiver);
             return fun;
         }
 
@@ -96,7 +96,7 @@
         for (Function.Factory factory : factories)
         {
             Function toTest = factory.create(receiver.ksName, receiver.cfName);
-            if (!isValidType(toTest, providedArgs, receiver))
+            if (!isValidType(keyspace, toTest, providedArgs, receiver))
                 continue;
 
             if (candidate == null)
@@ -109,7 +109,7 @@
         return candidate;
     }
 
-    private static void validateTypes(Function fun, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver) throws InvalidRequestException
+    private static void validateTypes(String keyspace, Function fun, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver) throws InvalidRequestException
     {
         if (!receiver.type.isValueCompatibleWith(fun.returnType()))
             throw new InvalidRequestException(String.format("Type error: cannot assign result of function %s (type %s) to %s (type %s)", fun.name(), fun.returnType().asCQL3Type(), receiver, receiver.type.asCQL3Type()));
@@ -127,12 +127,12 @@
                 continue;
 
             ColumnSpecification expected = makeArgSpec(receiver, fun, i);
-            if (!provided.isAssignableTo(expected))
+            if (!provided.isAssignableTo(keyspace, expected))
                 throw new InvalidRequestException(String.format("Type error: %s cannot be passed as argument %d of function %s of type %s", provided, i, fun.name(), expected.type.asCQL3Type()));
         }
     }
 
-    private static boolean isValidType(Function fun, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver)
+    private static boolean isValidType(String keyspace, Function fun, List<? extends AssignementTestable> providedArgs, ColumnSpecification receiver) throws InvalidRequestException
     {
         if (!receiver.type.isValueCompatibleWith(fun.returnType()))
             return false;
@@ -150,7 +150,7 @@
                 continue;
 
             ColumnSpecification expected = makeArgSpec(receiver, fun, i);
-            if (!provided.isAssignableTo(expected))
+            if (!provided.isAssignableTo(keyspace, expected))
                 return false;
         }
         return true;
diff --git a/src/java/org/apache/cassandra/cql3/functions/TimeuuidFcts.java b/src/java/org/apache/cassandra/cql3/functions/TimeuuidFcts.java
index 9c69c19..be20102 100644
--- a/src/java/org/apache/cassandra/cql3/functions/TimeuuidFcts.java
+++ b/src/java/org/apache/cassandra/cql3/functions/TimeuuidFcts.java
@@ -91,3 +91,4 @@
         }
     };
 }
+
diff --git a/src/java/org/apache/cassandra/cql3/functions/TokenFct.java b/src/java/org/apache/cassandra/cql3/functions/TokenFct.java
index 4f3ff4a..5093a72 100644
--- a/src/java/org/apache/cassandra/cql3/functions/TokenFct.java
+++ b/src/java/org/apache/cassandra/cql3/functions/TokenFct.java
@@ -21,9 +21,9 @@
 import java.util.List;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.CFDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
+import org.apache.cassandra.db.composites.CBuilder;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -42,26 +42,26 @@
         }
     };
 
-    private final CFDefinition cfDef;
+    private final CFMetaData cfm;
 
     public TokenFct(CFMetaData cfm)
     {
         super("token", partitioner.getTokenValidator(), getKeyTypes(cfm));
-        this.cfDef = cfm.getCfDef();
+        this.cfm = cfm;
     }
 
     private static AbstractType[] getKeyTypes(CFMetaData cfm)
     {
-        AbstractType[] types = new AbstractType[cfm.getCfDef().partitionKeyCount()];
+        AbstractType[] types = new AbstractType[cfm.partitionKeyColumns().size()];
         int i = 0;
-        for (CFDefinition.Name name : cfm.getCfDef().partitionKeys())
-            types[i++] = name.type;
+        for (ColumnDefinition def : cfm.partitionKeyColumns())
+            types[i++] = def.type;
         return types;
     }
 
     public ByteBuffer execute(List<ByteBuffer> parameters) throws InvalidRequestException
     {
-        ColumnNameBuilder builder = cfDef.getKeyNameBuilder();
+        CBuilder builder = cfm.getKeyValidatorAsCType().builder();
         for (int i = 0; i < parameters.size(); i++)
         {
             ByteBuffer bb = parameters.get(i);
@@ -69,6 +69,6 @@
                 return null;
             builder.add(bb);
         }
-        return partitioner.getTokenFactory().toByteArray(partitioner.getToken(builder.build()));
+        return partitioner.getTokenFactory().toByteArray(partitioner.getToken(builder.build().toByteBuffer()));
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java
index 4f6d1f2..e65a51e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java
@@ -27,7 +27,7 @@
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 public class AlterKeyspaceStatement extends SchemaAlteringStatement
 {
@@ -79,19 +79,19 @@
         }
     }
 
-    public boolean announceMigration() throws RequestValidationException
+    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
     {
         KSMetaData ksm = Schema.instance.getKSMetaData(name);
         // In the (very) unlikely case the keyspace was dropped since validate()
         if (ksm == null)
             throw new InvalidRequestException("Unknown keyspace " + name);
 
-        MigrationManager.announceKeyspaceUpdate(attrs.asKSMetadataUpdate(ksm));
+        MigrationManager.announceKeyspaceUpdate(attrs.asKSMetadataUpdate(ksm), isLocalOnly);
         return true;
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, keyspace());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java
index 698c8b8..33f4fff 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java
@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -31,7 +29,7 @@
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 import static org.apache.cassandra.thrift.ThriftValidation.validateColumnFamily;
 
@@ -43,7 +41,7 @@
     }
 
     public final Type oType;
-    public final CQL3Type validator;
+    public final CQL3Type.Raw validator;
     public final ColumnIdentifier columnName;
     private final CFPropDefs cfProps;
     private final Map<ColumnIdentifier, ColumnIdentifier> renames;
@@ -52,7 +50,7 @@
     public AlterTableStatement(CFName name,
                                Type type,
                                ColumnIdentifier columnName,
-                               CQL3Type validator,
+                               CQL3Type.Raw validator,
                                CFPropDefs cfProps,
                                Map<ColumnIdentifier, ColumnIdentifier> renames,
                                boolean isStatic)
@@ -76,33 +74,34 @@
         // validated in announceMigration()
     }
 
-    public boolean announceMigration() throws RequestValidationException
+    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
     {
         CFMetaData meta = validateColumnFamily(keyspace(), columnFamily());
-        CFMetaData cfm = meta.clone();
+        CFMetaData cfm = meta.copy();
 
-        CFDefinition cfDef = meta.getCfDef();
-        CFDefinition.Name name = columnName == null ? null : cfDef.get(columnName);
+        CQL3Type validator = this.validator == null ? null : this.validator.prepare(keyspace());
+
+        ColumnDefinition def = columnName == null ? null : cfm.getColumnDefinition(columnName);
         switch (oType)
         {
             case ADD:
-                if (cfDef.isCompact)
+                if (cfm.comparator.isDense())
                     throw new InvalidRequestException("Cannot add new column to a COMPACT STORAGE table");
 
                 if (isStatic)
                 {
-                    if (!cfDef.isComposite)
+                    if (!cfm.comparator.isCompound())
                         throw new InvalidRequestException("Static columns are not allowed in COMPACT STORAGE tables");
-                    if (cfDef.clusteringColumns().isEmpty())
+                    if (cfm.clusteringColumns().isEmpty())
                         throw new InvalidRequestException("Static columns are only useful (and thus allowed) if the table has at least one clustering column");
                 }
 
-                if (name != null)
+                if (def != null)
                 {
-                    switch (name.kind)
+                    switch (def.kind)
                     {
-                        case KEY_ALIAS:
-                        case COLUMN_ALIAS:
+                        case PARTITION_KEY:
+                        case CLUSTERING_COLUMN:
                             throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with a PRIMARY KEY part", columnName));
                         default:
                             throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with an existing column", columnName));
@@ -110,68 +109,59 @@
                 }
 
                 // Cannot re-add a dropped counter column. See #7831.
-                if (meta.getDefaultValidator().isCommutative() && meta.getDroppedColumns().containsKey(columnName.key))
+                if (meta.isCounter() && meta.getDroppedColumns().containsKey(columnName))
                     throw new InvalidRequestException(String.format("Cannot re-add previously dropped counter column %s", columnName));
 
                 AbstractType<?> type = validator.getType();
                 if (type instanceof CollectionType)
                 {
-                    if (!cfDef.isComposite)
+                    if (!cfm.comparator.supportCollections())
                         throw new InvalidRequestException("Cannot use collection types with non-composite PRIMARY KEY");
-                    if (cfDef.cfm.isSuper())
+                    if (cfm.isSuper())
                         throw new InvalidRequestException("Cannot use collection types with Super column family");
 
-                    Map<ByteBuffer, CollectionType> collections = cfDef.hasCollections
-                                                                ? new HashMap<ByteBuffer, CollectionType>(cfDef.getCollectionType().defined)
-                                                                : new HashMap<ByteBuffer, CollectionType>();
 
                     // If there used to be a collection column with the same name (that has been dropped), it will
                     // still appear in the ColumnToCollectionType for the reasons explained in #6276. The same
                     // reason means that we can't allow adding a new collection with that name (see the ticket for details).
-                    CollectionType previous = collections.get(columnName.key);
-                    if (previous != null && !type.isCompatibleWith(previous))
-                        throw new InvalidRequestException(String.format("Cannot add a collection with the name %s " +
-                                    "because a collection with the same name and a different type has already been used in the past", columnName));
+                    if (cfm.comparator.hasCollections())
+                    {
+                        CollectionType previous = cfm.comparator.collectionType() == null ? null : cfm.comparator.collectionType().defined.get(columnName.bytes);
+                        if (previous != null && !type.isCompatibleWith(previous))
+                            throw new InvalidRequestException(String.format("Cannot add a collection with the name %s " +
+                                        "because a collection with the same name and a different type has already been used in the past", columnName));
+                    }
 
-                    collections.put(columnName.key, (CollectionType)type);
-                    ColumnToCollectionType newColType = ColumnToCollectionType.getInstance(collections);
-                    List<AbstractType<?>> ctypes = new ArrayList<AbstractType<?>>(((CompositeType)cfm.comparator).types);
-                    if (cfDef.hasCollections)
-                        ctypes.set(ctypes.size() - 1, newColType);
-                    else
-                        ctypes.add(newColType);
-                    cfm.comparator = CompositeType.getInstance(ctypes);
+                    cfm.comparator = cfm.comparator.addOrUpdateCollection(columnName, (CollectionType)type);
                 }
 
-                Integer componentIndex = cfDef.isComposite
-                                       ? ((CompositeType)meta.comparator).types.size() - (cfDef.hasCollections ? 2 : 1)
-                                       : null;
+                Integer componentIndex = cfm.comparator.isCompound() ? cfm.comparator.clusteringPrefixSize() : null;
                 cfm.addColumnDefinition(isStatic
-                                        ? ColumnDefinition.staticDef(columnName.key, type, componentIndex)
-                                        : ColumnDefinition.regularDef(columnName.key, type, componentIndex));
+                                        ? ColumnDefinition.staticDef(cfm, columnName.bytes, type, componentIndex)
+                                        : ColumnDefinition.regularDef(cfm, columnName.bytes, type, componentIndex));
                 break;
 
             case ALTER:
-                if (name == null)
-                    throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
+                if (def == null)
+                    throw new InvalidRequestException(String.format("Cell %s was not found in table %s", columnName, columnFamily()));
 
-                switch (name.kind)
+                switch (def.kind)
                 {
-                    case KEY_ALIAS:
+                    case PARTITION_KEY:
                         AbstractType<?> newType = validator.getType();
                         if (newType instanceof CounterColumnType)
                             throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", columnName));
-                        if (cfDef.hasCompositeKey)
+                        if (cfm.getKeyValidator() instanceof CompositeType)
                         {
                             List<AbstractType<?>> oldTypes = ((CompositeType) cfm.getKeyValidator()).types;
-                            if (!newType.isValueCompatibleWith(oldTypes.get(name.position)))
+                            if (!newType.isValueCompatibleWith(oldTypes.get(def.position())))
                                 throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
                                                                                columnName,
-                                                                               oldTypes.get(name.position).asCQL3Type(),
+                                                                               oldTypes.get(def.position()).asCQL3Type(),
                                                                                validator));
 
                             List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(oldTypes);
-                            newTypes.set(name.position, newType);
+                            newTypes.set(def.position(), newType);
                             cfm.keyValidator(CompositeType.getInstance(newTypes));
                         }
                         else
@@ -184,22 +174,20 @@
                             cfm.keyValidator(newType);
                         }
                         break;
-                    case COLUMN_ALIAS:
-                        assert cfDef.isComposite;
-                        List<AbstractType<?>> oldTypes = ((CompositeType) cfm.comparator).types;
+                    case CLUSTERING_COLUMN:
+                        AbstractType<?> oldType = cfm.comparator.subtype(def.position());
                         // Note that CFMetaData.validateCompatibility already validates the change we're about to do. However, the error message it
                         // sends is a bit cryptic for a CQL3 user, so we validate here for the sake of returning a better error message.
                         // Do note that we need isCompatibleWith here, not just isValueCompatibleWith.
-                        if (!validator.getType().isCompatibleWith(oldTypes.get(name.position)))
+                        if (!validator.getType().isCompatibleWith(oldType))
                             throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are not order-compatible.",
                                                                            columnName,
-                                                                           oldTypes.get(name.position).asCQL3Type(),
+                                                                           oldType.asCQL3Type(),
                                                                            validator));
-                        List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(oldTypes);
-                        newTypes.set(name.position, validator.getType());
-                        cfm.comparator = CompositeType.getInstance(newTypes);
+
+                        cfm.comparator = cfm.comparator.setSubtype(def.position(), validator.getType());
                         break;
-                    case VALUE_ALIAS:
+                    case COMPACT_VALUE:
                         // See below
                         if (!validator.getType().isValueCompatibleWith(cfm.getDefaultValidator()))
                             throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
@@ -208,42 +196,49 @@
                                                                            validator));
                         cfm.defaultValidator(validator.getType());
                         break;
-                    case COLUMN_METADATA:
+                    case REGULAR:
                     case STATIC:
-                        ColumnDefinition column = cfm.getColumnDefinition(columnName.key);
                         // Thrift allows changing a column validator, so CFMetaData.validateCompatibility will let it slide
                         // if we change to an incompatible type (contrary to the comparator case). But we don't want to
                         // allow it for CQL3 (see #5882), so we validate it explicitly here. We only care about value compatibility
                         // though since we won't compare values (except when there is an index, but that is validated by
                         // ColumnDefinition already).
-                        if (!validator.getType().isValueCompatibleWith(column.getValidator()))
+                        if (!validator.getType().isValueCompatibleWith(def.type))
                             throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
                                                                            columnName,
-                                                                           column.getValidator().asCQL3Type(),
+                                                                           def.type.asCQL3Type(),
                                                                            validator));
 
-                        column.setValidator(validator.getType());
+                        // For collections, if we alter the type, we need to update the comparator too since it includes
+                        // the type too (note that isValueCompatibleWith above has validated that the new type doesn't really
+                        // change the underlying sorting order, but we still don't want to have a discrepancy between the type
+                        // in the comparator and the one in the ColumnDefinition as that would be dodgy).
+                        if (validator.getType() instanceof CollectionType)
+                            cfm.comparator = cfm.comparator.addOrUpdateCollection(def.name, (CollectionType)validator.getType());
+
                         break;
                 }
+                // In any case, we update the column definition
+                cfm.addOrReplaceColumnDefinition(def.withNewType(validator.getType()));
                 break;
 
             case DROP:
-                if (cfDef.isCompact || !cfDef.isComposite)
-                    throw new InvalidRequestException("Cannot drop columns from a COMPACT STORAGE table");
-                if (name == null)
+                if (!cfm.isCQL3Table())
+                    throw new InvalidRequestException("Cannot drop columns from a non-CQL3 table");
+                if (def == null)
                     throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
 
-                switch (name.kind)
+                switch (def.kind)
                 {
-                    case KEY_ALIAS:
-                    case COLUMN_ALIAS:
+                    case PARTITION_KEY:
+                    case CLUSTERING_COLUMN:
                         throw new InvalidRequestException(String.format("Cannot drop PRIMARY KEY part %s", columnName));
-                    case COLUMN_METADATA:
+                    case REGULAR:
                     case STATIC:
                         ColumnDefinition toDelete = null;
                         for (ColumnDefinition columnDef : cfm.regularAndStaticColumns())
                         {
-                            if (columnDef.name.equals(columnName.key))
+                            if (columnDef.name.equals(columnName))
                                 toDelete = columnDef;
                         }
                         assert toDelete != null;
@@ -261,15 +256,11 @@
                 break;
             case RENAME:
                 for (Map.Entry<ColumnIdentifier, ColumnIdentifier> entry : renames.entrySet())
-                {
-                    ColumnIdentifier from = entry.getKey();
-                    ColumnIdentifier to = entry.getValue();
-                    cfm.renameColumn(from.key, from.toString(), to.key, to.toString());
-                }
+                    cfm.renameColumn(entry.getKey(), entry.getValue());
                 break;
         }
 
-        MigrationManager.announceColumnFamilyUpdate(cfm, false);
+        MigrationManager.announceColumnFamilyUpdate(cfm, false, isLocalOnly);
         return true;
     }
 
@@ -282,8 +273,8 @@
                              validator);
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java
new file mode 100644
index 0000000..cfdd65f
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java
@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.transport.Event;
+
+public abstract class AlterTypeStatement extends SchemaAlteringStatement
+{
+    protected final UTName name;
+
+    protected AlterTypeStatement(UTName name)
+    {
+        super();
+        this.name = name;
+    }
+
+    @Override
+    public void prepareKeyspace(ClientState state) throws InvalidRequestException
+    {
+        if (!name.hasKeyspace())
+            name.setKeyspace(state.getKeyspace());
+
+        if (name.getKeyspace() == null)
+            throw new InvalidRequestException("You need to be logged in a keyspace or use a fully qualified user type name");
+    }
+
+    protected abstract UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException;
+
+    public static AlterTypeStatement addition(UTName name, ColumnIdentifier fieldName, CQL3Type.Raw type)
+    {
+        return new AddOrAlter(name, true, fieldName, type);
+    }
+
+    public static AlterTypeStatement alter(UTName name, ColumnIdentifier fieldName, CQL3Type.Raw type)
+    {
+        return new AddOrAlter(name, false, fieldName, type);
+    }
+
+    public static AlterTypeStatement renames(UTName name, Map<ColumnIdentifier, ColumnIdentifier> renames)
+    {
+        return new Renames(name, renames);
+    }
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        state.hasKeyspaceAccess(keyspace(), Permission.ALTER);
+    }
+
+    public void validate(ClientState state) throws RequestValidationException
+    {
+        // Validation is left to announceMigration as it's easier to do it while constructing the updated type.
+        // It doesn't really change anything anyway.
+    }
+
+    public Event.SchemaChange changeEvent()
+    {
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
+    }
+
+    @Override
+    public String keyspace()
+    {
+        return name.getKeyspace();
+    }
+
+    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        if (ksm == null)
+            throw new InvalidRequestException(String.format("Cannot alter type in unknown keyspace %s", name.getKeyspace()));
+
+        UserType toUpdate = ksm.userTypes.getType(name.getUserTypeName());
+        // Shouldn't happen, unless we race with a drop
+        if (toUpdate == null)
+            throw new InvalidRequestException(String.format("No user type named %s exists.", name));
+
+        UserType updated = makeUpdatedType(toUpdate);
+
+        // Now, we need to announce the type update to basically change it for new tables using this type,
+        // but we also need to find all existing user types and CF using it and change them.
+        MigrationManager.announceTypeUpdate(updated, isLocalOnly);
+
+        for (KSMetaData ksm2 : Schema.instance.getKeyspaceDefinitions())
+        {
+            for (CFMetaData cfm : ksm2.cfMetaData().values())
+            {
+                CFMetaData copy = cfm.copy();
+                boolean modified = false;
+                for (ColumnDefinition def : copy.allColumns())
+                    modified |= updateDefinition(copy, def, toUpdate.keyspace, toUpdate.name, updated);
+                if (modified)
+                    MigrationManager.announceColumnFamilyUpdate(copy, false, isLocalOnly);
+            }
+
+            // Other user types potentially using the updated type
+            for (UserType ut : ksm2.userTypes.getAllTypes().values())
+            {
+                // Re-updating the type we've just updated would be harmless but useless, so we avoid it.
+                // Besides, we take the opportunity to drop the old version of the type if this is a type rename.
+                if (ut.keyspace.equals(toUpdate.keyspace) && ut.name.equals(toUpdate.name))
+                {
+                    if (!ut.keyspace.equals(updated.keyspace) || !ut.name.equals(updated.name))
+                        MigrationManager.announceTypeDrop(ut);
+                    continue;
+                }
+                AbstractType<?> upd = updateWith(ut, toUpdate.keyspace, toUpdate.name, updated);
+                if (upd != null)
+                    MigrationManager.announceTypeUpdate((UserType)upd, isLocalOnly);
+            }
+        }
+        return true;
+    }
+
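+    // Returns the index of the given field within the type, or -1 if the type has no field with that name.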
+    private static int getIdxOfField(UserType type, ColumnIdentifier field)
+    {
+        for (int i = 0; i < type.size(); i++)
+            if (field.bytes.equals(type.fieldName(i)))
+                return i;
+        return -1;
+    }
+
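+    // Rewrites the column definition (and, where needed, the key validator or comparator) to use the
+    // updated user type; returns true if anything was changed.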
+    private boolean updateDefinition(CFMetaData cfm, ColumnDefinition def, String keyspace, ByteBuffer toReplace, UserType updated)
+    {
+        AbstractType<?> t = updateWith(def.type, keyspace, toReplace, updated);
+        if (t == null)
+            return false;
+
+        // We need to update this validator ...
+        cfm.addOrReplaceColumnDefinition(def.withNewType(t));
+
+        // ... but if it's part of the comparator or key validator, we need to go update those too.
+        switch (def.kind)
+        {
+            case PARTITION_KEY:
+                cfm.keyValidator(updateWith(cfm.getKeyValidator(), keyspace, toReplace, updated));
+                break;
+            case CLUSTERING_COLUMN:
+                cfm.comparator = CellNames.fromAbstractType(updateWith(cfm.comparator.asAbstractType(), keyspace, toReplace, updated), cfm.comparator.isDense());
+                break;
+            default:
+                // If it's a collection, we still want to modify the comparator because the collection is aliased in it
+                if (def.type instanceof CollectionType)
+                    cfm.comparator = CellNames.fromAbstractType(updateWith(cfm.comparator.asAbstractType(), keyspace, toReplace, updated), cfm.comparator.isDense());
+        }
+        return true;
+    }
+
+    // Updates the provided type so that every instance of the given user type is replaced by its new version.
+    // Note that this method reaches inside nested UserType, CompositeType and CollectionType instances.
+    private static AbstractType<?> updateWith(AbstractType<?> type, String keyspace, ByteBuffer toReplace, UserType updated)
+    {
+        if (type instanceof UserType)
+        {
+            UserType ut = (UserType)type;
+
+            // If it's directly the type we've updated, then just use the new one.
+            if (keyspace.equals(ut.keyspace) && toReplace.equals(ut.name))
+                return updated;
+
+            // Otherwise, check for nesting
+            List<AbstractType<?>> updatedTypes = updateTypes(ut.fieldTypes(), keyspace, toReplace, updated);
+            return updatedTypes == null ? null : new UserType(ut.keyspace, ut.name, new ArrayList<>(ut.fieldNames()), updatedTypes);
+        }
+        else if (type instanceof CompositeType)
+        {
+            CompositeType ct = (CompositeType)type;
+            List<AbstractType<?>> updatedTypes = updateTypes(ct.types, keyspace, toReplace, updated);
+            return updatedTypes == null ? null : CompositeType.getInstance(updatedTypes);
+        }
+        else if (type instanceof ColumnToCollectionType)
+        {
+            ColumnToCollectionType ctct = (ColumnToCollectionType)type;
+            Map<ByteBuffer, CollectionType> updatedTypes = null;
+            for (Map.Entry<ByteBuffer, CollectionType> entry : ctct.defined.entrySet())
+            {
+                AbstractType<?> t = updateWith(entry.getValue(), keyspace, toReplace, updated);
+                if (t == null)
+                    continue;
+
+                if (updatedTypes == null)
+                    updatedTypes = new HashMap<>(ctct.defined);
+
+                updatedTypes.put(entry.getKey(), (CollectionType)t);
+            }
+            return updatedTypes == null ? null : ColumnToCollectionType.getInstance(updatedTypes);
+        }
+        else if (type instanceof CollectionType)
+        {
+            if (type instanceof ListType)
+            {
+                AbstractType<?> t = updateWith(((ListType)type).elements, keyspace, toReplace, updated);
+                return t == null ? null : ListType.getInstance(t);
+            }
+            else if (type instanceof SetType)
+            {
+                AbstractType<?> t = updateWith(((SetType)type).elements, keyspace, toReplace, updated);
+                return t == null ? null : SetType.getInstance(t);
+            }
+            else
+            {
+                assert type instanceof MapType;
+                MapType mt = (MapType)type;
+                AbstractType<?> k = updateWith(mt.keys, keyspace, toReplace, updated);
+                AbstractType<?> v = updateWith(mt.values, keyspace, toReplace, updated);
+                if (k == null && v == null)
+                    return null;
+                return MapType.getInstance(k == null ? mt.keys : k, v == null ? mt.values : v);
+            }
+        }
+        else
+        {
+            return null;
+        }
+    }
+
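+    // Applies updateWith to every type in the list, returning null if none of them needed updating.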
+    private static List<AbstractType<?>> updateTypes(List<AbstractType<?>> toUpdate, String keyspace, ByteBuffer toReplace, UserType updated)
+    {
+        // The type we're replacing can itself be nested inside any of the types in this list.
+        List<AbstractType<?>> updatedTypes = null;
+        for (int i = 0; i < toUpdate.size(); i++)
+        {
+            AbstractType<?> t = updateWith(toUpdate.get(i), keyspace, toReplace, updated);
+            if (t == null)
+                continue;
+
+            if (updatedTypes == null)
+                updatedTypes = new ArrayList<>(toUpdate);
+
+            updatedTypes.set(i, t);
+        }
+        return updatedTypes;
+    }
+
+    private static class AddOrAlter extends AlterTypeStatement
+    {
+        private final boolean isAdd;
+        private final ColumnIdentifier fieldName;
+        private final CQL3Type.Raw type;
+
+        public AddOrAlter(UTName name, boolean isAdd, ColumnIdentifier fieldName, CQL3Type.Raw type)
+        {
+            super(name);
+            this.isAdd = isAdd;
+            this.fieldName = fieldName;
+            this.type = type;
+        }
+
+        private UserType doAdd(UserType toUpdate) throws InvalidRequestException
+        {
+            if (getIdxOfField(toUpdate, fieldName) >= 0)
+                throw new InvalidRequestException(String.format("Cannot add new field %s to type %s: a field of the same name already exists", fieldName, name));
+
+            List<ByteBuffer> newNames = new ArrayList<>(toUpdate.size() + 1);
+            newNames.addAll(toUpdate.fieldNames());
+            newNames.add(fieldName.bytes);
+
+            List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.size() + 1);
+            newTypes.addAll(toUpdate.fieldTypes());
+            newTypes.add(type.prepare(keyspace()).getType());
+
+            return new UserType(toUpdate.keyspace, toUpdate.name, newNames, newTypes);
+        }
+
+        private UserType doAlter(UserType toUpdate) throws InvalidRequestException
+        {
+            int idx = getIdxOfField(toUpdate, fieldName);
+            if (idx < 0)
+                throw new InvalidRequestException(String.format("Unknown field %s in type %s", fieldName, name));
+
+            AbstractType<?> previous = toUpdate.fieldType(idx);
+            if (!type.prepare(keyspace()).getType().isCompatibleWith(previous))
+                throw new InvalidRequestException(String.format("Type %s is incompatible with previous type %s of field %s in user type %s", type, previous.asCQL3Type(), fieldName, name));
+
+            List<ByteBuffer> newNames = new ArrayList<>(toUpdate.fieldNames());
+            List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.fieldTypes());
+            newTypes.set(idx, type.prepare(keyspace()).getType());
+
+            return new UserType(toUpdate.keyspace, toUpdate.name, newNames, newTypes);
+        }
+
+        protected UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException
+        {
+            return isAdd ? doAdd(toUpdate) : doAlter(toUpdate);
+        }
+    }
+
+    private static class Renames extends AlterTypeStatement
+    {
+        private final Map<ColumnIdentifier, ColumnIdentifier> renames;
+
+        public Renames(UTName name, Map<ColumnIdentifier, ColumnIdentifier> renames)
+        {
+            super(name);
+            this.renames = renames;
+        }
+
+        protected UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException
+        {
+            List<ByteBuffer> newNames = new ArrayList<>(toUpdate.fieldNames());
+            List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.fieldTypes());
+
+            for (Map.Entry<ColumnIdentifier, ColumnIdentifier> entry : renames.entrySet())
+            {
+                ColumnIdentifier from = entry.getKey();
+                ColumnIdentifier to = entry.getValue();
+                int idx = getIdxOfField(toUpdate, from);
+                if (idx < 0)
+                    throw new InvalidRequestException(String.format("Unknown field %s in type %s", from, name));
+                newNames.set(idx, to.bytes);
+            }
+
+            UserType updated = new UserType(toUpdate.keyspace, toUpdate.name, newNames, newTypes);
+            CreateTypeStatement.checkForDuplicateNames(updated);
+            return updated;
+        }
+
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index 8a9a8f0..17d1771 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
@@ -27,8 +27,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
@@ -41,9 +43,6 @@
  */
 public class BatchStatement implements CQLStatement, MeasurableForPreparedCache
 {
-    private static boolean loggedCASTimestamp = false;
-    private static boolean loggedCounterTimestamp = false;
-
     public static enum Type
     {
         LOGGED, UNLOGGED, COUNTER
@@ -64,8 +63,12 @@
      * @param statements a list of UpdateStatements
      * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
      */
-    public BatchStatement(int boundTerms, Type type, List<ModificationStatement> statements, Attributes attrs, boolean hasConditions)
+    public BatchStatement(int boundTerms, Type type, List<ModificationStatement> statements, Attributes attrs)
     {
+        boolean hasConditions = false;
+        for (ModificationStatement statement : statements)
+            hasConditions |= statement.hasConditions();
+
         this.boundTerms = boundTerms;
         this.type = type;
         this.statements = statements;
@@ -95,7 +98,8 @@
             statement.checkAccess(state);
     }
 
-    public void validate(ClientState state) throws InvalidRequestException
+    // Validates a prepared batch statement without validating its nested statements.
+    public void validate() throws InvalidRequestException
     {
         if (attrs.isTimeToLiveSet())
             throw new InvalidRequestException("Global TTL on the BATCH statement is not supported.");
@@ -103,32 +107,58 @@
         boolean timestampSet = attrs.isTimestampSet();
         if (timestampSet)
         {
-            if (hasConditions && !loggedCASTimestamp)
-            {
-                logger.warn("Detected use of 'USING TIMESTAMP' on a BATCH with conditions. This is invalid, " +
-                            "custom timestamps are not allowed when conditions are used and the timestamp has been ignored. " +
-                            "Such queries will be rejected in Cassandra 2.1+ - please fix your queries before then.");
-                loggedCASTimestamp = true;
-            }
-            if (type == Type.COUNTER && !loggedCounterTimestamp)
-            {
-                logger.warn("Detected use of 'USING TIMESTAMP' in a counter BATCH. This is invalid " +
-                            "because counters do not use timestamps, and the timestamp has been ignored. " +
-                            "Such queries will be rejected in Cassandra 2.1+ - please fix your queries before then.");
-                loggedCounterTimestamp = true;
-            }
+            if (hasConditions)
+                throw new InvalidRequestException("Cannot provide custom timestamp for conditional BATCH");
+            if (type == Type.COUNTER)
+                throw new InvalidRequestException("Cannot provide custom timestamp for counter BATCH");
         }
 
+        boolean hasCounters = false;
+        boolean hasNonCounters = false;
+
         for (ModificationStatement statement : statements)
         {
+            if (timestampSet && statement.isCounter())
+                throw new InvalidRequestException("Cannot provide custom timestamp for a BATCH containing counters");
+
             if (timestampSet && statement.isTimestampSet())
                 throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements");
 
-            statement.validate(state);
+            if (type == Type.COUNTER && !statement.isCounter())
+                throw new InvalidRequestException("Cannot include non-counter statement in a counter batch");
 
-            if (hasConditions && statement.requiresRead())
-                throw new InvalidRequestException("Operations on lists requiring a read (setting by index and deletions by index or value) are not allowed with IF conditions");
+            if (type == Type.LOGGED && statement.isCounter())
+                throw new InvalidRequestException("Cannot include a counter statement in a logged batch");
+
+            if (statement.isCounter())
+                hasCounters = true;
+            else
+                hasNonCounters = true;
         }
+
+        if (hasCounters && hasNonCounters)
+            throw new InvalidRequestException("Counter and non-counter mutations cannot exist in the same batch");
+
+        if (hasConditions)
+        {
+            String ksName = null;
+            String cfName = null;
+            for (ModificationStatement stmt : statements)
+            {
+                if (ksName != null && (!stmt.keyspace().equals(ksName) || !stmt.columnFamily().equals(cfName)))
+                    throw new InvalidRequestException("Batch with conditions cannot span multiple tables");
+                ksName = stmt.keyspace();
+                cfName = stmt.columnFamily();
+            }
+        }
+    }
+
+    // The batch itself will be validated in either Parsed#prepare() - for regular CQL3 batches,
+    //   or in QueryProcessor.processBatch() - for native protocol batches.
+    public void validate(ClientState state) throws InvalidRequestException
+    {
+        for (ModificationStatement statement : statements)
+            statement.validate(state);
     }
 
     public List<ModificationStatement> getStatements()
@@ -136,16 +166,16 @@
         return statements;
     }
 
-    private Collection<? extends IMutation> getMutations(BatchVariables variables, boolean local, ConsistencyLevel cl, long now)
+    private Collection<? extends IMutation> getMutations(BatchQueryOptions options, boolean local, long now)
     throws RequestExecutionException, RequestValidationException
     {
         Map<String, Map<ByteBuffer, IMutation>> mutations = new HashMap<>();
         for (int i = 0; i < statements.size(); i++)
         {
             ModificationStatement statement = statements.get(i);
-            List<ByteBuffer> statementVariables = variables.getVariablesForStatement(i);
-            long timestamp = attrs.getTimestamp(now, statementVariables);
-            addStatementMutations(statement, statementVariables, local, cl, timestamp, mutations);
+            QueryOptions statementOptions = options.forStatement(i);
+            long timestamp = attrs.getTimestamp(now, statementOptions);
+            addStatementMutations(statement, statementOptions, local, timestamp, mutations);
         }
         return unzipMutations(mutations);
     }
@@ -163,9 +193,8 @@
     }
 
     private void addStatementMutations(ModificationStatement statement,
-                                       List<ByteBuffer> variables,
+                                       QueryOptions options,
                                        boolean local,
-                                       ConsistencyLevel cl,
                                        long now,
                                        Map<String, Map<ByteBuffer, IMutation>> mutations)
     throws RequestExecutionException, RequestValidationException
@@ -181,26 +210,26 @@
         // The following does the same as statement.getMutations(), but we inline it here because
         // we don't want to recreate mutations every time, as this is particularly inefficient when applying
         // multiple batches to the same partition (see #6737).
-        List<ByteBuffer> keys = statement.buildPartitionKeyNames(variables);
-        ColumnNameBuilder clusteringPrefix = statement.createClusteringPrefixBuilder(variables);
-        UpdateParameters params = statement.makeUpdateParameters(keys, clusteringPrefix, variables, local, cl, now);
+        List<ByteBuffer> keys = statement.buildPartitionKeyNames(options);
+        Composite clusteringPrefix = statement.createClusteringPrefix(options);
+        UpdateParameters params = statement.makeUpdateParameters(keys, clusteringPrefix, options, local, now);
 
         for (ByteBuffer key : keys)
         {
             IMutation mutation = ksMap.get(key);
-            RowMutation rm;
+            Mutation mut;
             if (mutation == null)
             {
-                rm = new RowMutation(ksName, key);
-                mutation = type == Type.COUNTER ? new CounterMutation(rm, cl) : rm;
+                mut = new Mutation(ksName, key);
+                mutation = statement.cfm.isCounter() ? new CounterMutation(mut, options.getConsistency()) : mut;
                 ksMap.put(key, mutation);
             }
             else
             {
-                rm = type == Type.COUNTER ? ((CounterMutation)mutation).rowMutation() : (RowMutation)mutation;
+                mut = statement.cfm.isCounter() ? ((CounterMutation)mutation).getMutation() : (Mutation)mutation;
             }
 
-            statement.addUpdateForKey(rm.addOrGet(statement.cfm, UnsortedColumns.factory), key, clusteringPrefix, params);
+            statement.addUpdateForKey(mut.addOrGet(statement.cfm), key, clusteringPrefix, params);
         }
     }
 
@@ -208,7 +237,7 @@
      * Checks batch size to ensure threshold is met. If not, a warning is logged.
      * @param cfs ColumnFamilies that will store the batch's mutations.
      */
-    private void verifyBatchSize(Iterable<ColumnFamily> cfs)
+    public static void verifyBatchSize(Iterable<ColumnFamily> cfs)
     {
         long size = 0;
         long warnThreshold = DatabaseDescriptor.getBatchSizeWarnThreshold();
@@ -229,29 +258,26 @@
 
     public ResultMessage execute(QueryState queryState, QueryOptions options) throws RequestExecutionException, RequestValidationException
     {
-        if (options.getConsistency() == null)
-            throw new InvalidRequestException("Invalid empty consistency level");
-
-        return execute(new PreparedBatchVariables(options.getValues()), false, options.getConsistency(), options.getSerialConsistency(), queryState.getTimestamp());
+        return execute(queryState, BatchQueryOptions.withoutPerStatementVariables(options));
     }
 
-    public ResultMessage executeWithPerStatementVariables(ConsistencyLevel cl, QueryState queryState, List<List<ByteBuffer>> variables) throws RequestExecutionException, RequestValidationException
+    public ResultMessage execute(QueryState queryState, BatchQueryOptions options) throws RequestExecutionException, RequestValidationException
     {
-        if (cl == null)
-            throw new InvalidRequestException("Invalid empty consistency level");
-
-        return execute(new BatchOfPreparedVariables(variables), false, cl, ConsistencyLevel.SERIAL, queryState.getTimestamp());
+        return execute(queryState, options, false, options.getTimestamp(queryState));
     }
 
-    public ResultMessage execute(BatchVariables variables, boolean local, ConsistencyLevel cl, ConsistencyLevel serialCl, long now)
+    private ResultMessage execute(QueryState queryState, BatchQueryOptions options, boolean local, long now)
     throws RequestExecutionException, RequestValidationException
     {
-        // TODO: we don't support a serial consistency for batches in the protocol so defaulting to SERIAL for now.
-        // We'll need to fix that.
-        if (hasConditions)
-            return executeWithConditions(variables, cl, serialCl, now);
+        if (options.getConsistency() == null)
+            throw new InvalidRequestException("Invalid empty consistency level");
+        if (options.getSerialConsistency() == null)
+            throw new InvalidRequestException("Invalid empty serial consistency level");
 
-        executeWithoutConditions(getMutations(variables, local, cl, now), cl);
+        if (hasConditions)
+            return executeWithConditions(options, now);
+
+        executeWithoutConditions(getMutations(options, local, now), options.getConsistency());
         return new ResultMessage.Void();
     }
 
@@ -271,23 +297,21 @@
         StorageProxy.mutateWithTriggers(mutations, cl, mutateAtomic);
     }
 
-
-    private ResultMessage executeWithConditions(BatchVariables variables, ConsistencyLevel cl, ConsistencyLevel serialCf, long now)
+    private ResultMessage executeWithConditions(BatchQueryOptions options, long now)
     throws RequestExecutionException, RequestValidationException
     {
         ByteBuffer key = null;
         String ksName = null;
         String cfName = null;
-        ColumnFamily updates = null;
-        CQL3CasConditions conditions = null;
-        Set<ColumnIdentifier> columnsWithConditions = new LinkedHashSet<ColumnIdentifier>();
+        CQL3CasRequest casRequest = null;
+        Set<ColumnDefinition> columnsWithConditions = new LinkedHashSet<>();
 
         for (int i = 0; i < statements.size(); i++)
         {
             ModificationStatement statement = statements.get(i);
-            List<ByteBuffer> statementVariables = variables.getVariablesForStatement(i);
-            long timestamp = attrs.getTimestamp(now, statementVariables);
-            List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementVariables);
+            QueryOptions statementOptions = options.forStatement(i);
+            long timestamp = attrs.getTimestamp(now, statementOptions);
+            List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementOptions);
             if (pks.size() > 1)
                 throw new IllegalArgumentException("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)");
             if (key == null)
@@ -295,42 +319,40 @@
                 key = pks.get(0);
                 ksName = statement.cfm.ksName;
                 cfName = statement.cfm.cfName;
-                conditions = new CQL3CasConditions(statement.cfm, now);
-                updates = UnsortedColumns.factory.create(statement.cfm);
+                casRequest = new CQL3CasRequest(statement.cfm, key, true);
             }
             else if (!key.equals(pks.get(0)))
             {
                 throw new InvalidRequestException("Batch with conditions cannot span multiple partitions");
             }
 
-            ColumnNameBuilder clusteringPrefix = statement.createClusteringPrefixBuilder(statementVariables);
+            Composite clusteringPrefix = statement.createClusteringPrefix(statementOptions);
             if (statement.hasConditions())
             {
-                statement.addUpdatesAndConditions(key, clusteringPrefix, updates, conditions, statementVariables, timestamp);
+                statement.addConditions(clusteringPrefix, casRequest, statementOptions);
                 // As soon as we have a ifNotExists, we set columnsWithConditions to null so that everything is in the resultSet
                 if (statement.hasIfNotExistCondition() || statement.hasIfExistCondition())
                     columnsWithConditions = null;
                 else if (columnsWithConditions != null)
                     Iterables.addAll(columnsWithConditions, statement.getColumnsWithConditions());
             }
-            else
-            {
-                UpdateParameters params = statement.makeUpdateParameters(Collections.singleton(key), clusteringPrefix, statementVariables, false, cl, now);
-                statement.addUpdateForKey(updates, key, clusteringPrefix, params);
-            }
+            casRequest.addRowUpdate(clusteringPrefix, statement, statementOptions, timestamp);
         }
 
-        verifyBatchSize(Collections.singleton(updates));
-        ColumnFamily result = StorageProxy.cas(ksName, cfName, key, conditions, updates, serialCf, cl);
-        return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(ksName, key, cfName, result, columnsWithConditions, true));
+        ColumnFamily result = StorageProxy.cas(ksName, cfName, key, casRequest, options.getSerialConsistency(), options.getConsistency());
+
+        return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(ksName, key, cfName, result, columnsWithConditions, true, options.forStatement(0)));
     }
 
     public ResultMessage executeInternal(QueryState queryState, QueryOptions options) throws RequestValidationException, RequestExecutionException
     {
         assert !hasConditions;
-
-        for (IMutation mutation : getMutations(new PreparedBatchVariables(options.getValues()), true, null, queryState.getTimestamp()))
-            mutation.apply();
+        for (IMutation mutation : getMutations(BatchQueryOptions.withoutPerStatementVariables(options), true, queryState.getTimestamp()))
+        {
+            // We don't use counters internally.
+            assert mutation instanceof Mutation;
+            ((Mutation) mutation).apply();
+        }
         return null;
     }
 
@@ -339,36 +361,6 @@
         public List<ByteBuffer> getVariablesForStatement(int statementInBatch);
     }
 
-    public static class PreparedBatchVariables implements BatchVariables
-    {
-        private final List<ByteBuffer> variables;
-
-        public PreparedBatchVariables(List<ByteBuffer> variables)
-        {
-            this.variables = variables;
-        }
-
-        public List<ByteBuffer> getVariablesForStatement(int statementInBatch)
-        {
-            return variables;
-        }
-    }
-
-    public static class BatchOfPreparedVariables implements BatchVariables
-    {
-        private final List<List<ByteBuffer>> variables;
-
-        public BatchOfPreparedVariables(List<List<ByteBuffer>> variables)
-        {
-            this.variables = variables;
-        }
-
-        public List<ByteBuffer> getVariablesForStatement(int statementInBatch)
-        {
-            return variables.get(statementInBatch);
-        }
-    }
-
     public String toString()
     {
         return String.format("BatchStatement(type=%s, statements=%s)", type, statements);
@@ -399,40 +391,17 @@
         {
             VariableSpecifications boundNames = getBoundVariables();
 
-            List<ModificationStatement> statements = new ArrayList<ModificationStatement>(parsedStatements.size());
-            boolean hasConditions = false;
+            List<ModificationStatement> statements = new ArrayList<>(parsedStatements.size());
             for (ModificationStatement.Parsed parsed : parsedStatements)
-            {
-                ModificationStatement stmt = parsed.prepare(boundNames);
-                if (stmt.hasConditions())
-                    hasConditions = true;
-
-                if (stmt.isCounter() && type != Type.COUNTER)
-                    throw new InvalidRequestException("Counter mutations are only allowed in COUNTER batches");
-
-                if (!stmt.isCounter() && type == Type.COUNTER)
-                    throw new InvalidRequestException("Only counter mutations are allowed in COUNTER batches");
-
-                statements.add(stmt);
-            }
-
-            if (hasConditions)
-            {
-                String ksName = null;
-                String cfName = null;
-                for (ModificationStatement stmt : statements)
-                {
-                    if (ksName != null && (!stmt.keyspace().equals(ksName) || !stmt.columnFamily().equals(cfName)))
-                        throw new InvalidRequestException("Batch with conditions cannot span multiple tables");
-                    ksName = stmt.keyspace();
-                    cfName = stmt.columnFamily();
-                }
-            }
+                statements.add(parsed.prepare(boundNames));
 
             Attributes prepAttrs = attrs.prepare("[batch]", "[batch]");
             prepAttrs.collectMarkerSpecification(boundNames);
 
-            return new ParsedStatement.Prepared(new BatchStatement(boundNames.size(), type, statements, prepAttrs, hasConditions), boundNames);
+            BatchStatement batchStatement = new BatchStatement(boundNames.size(), type, statements, prepAttrs);
+            batchStatement.validate();
+
+            return new ParsedStatement.Prepared(batchStatement, boundNames);
         }
     }
 }
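As a CQL-level illustration of the stricter validation above (table and column names here are hypothetical, not part of the patch), these are the kinds of batches that now fail at prepare/validate time instead of merely logging a deprecation warning:

    BEGIN BATCH USING TIMESTAMP 1405021242
        UPDATE t SET v = 'x' WHERE k = 0 IF v = 'y';    -- "Cannot provide custom timestamp for conditional BATCH"
    APPLY BATCH;

    BEGIN COUNTER BATCH USING TIMESTAMP 1405021242
        UPDATE counts SET hits = hits + 1 WHERE k = 0;  -- "Cannot provide custom timestamp for counter BATCH"
    APPLY BATCH;

    BEGIN BATCH
        UPDATE counts SET hits = hits + 1 WHERE k = 0;  -- "Cannot include a counter statement in a logged batch"
    APPLY BATCH;
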
diff --git a/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java b/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java
index 6ce6406..aee86a8 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java
@@ -17,8 +17,12 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
 
+import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.CFMetaData.SpeculativeRetry;
 import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
@@ -34,12 +38,11 @@
     public static final String KW_GCGRACESECONDS = "gc_grace_seconds";
     public static final String KW_MINCOMPACTIONTHRESHOLD = "min_threshold";
     public static final String KW_MAXCOMPACTIONTHRESHOLD = "max_threshold";
-    public static final String KW_REPLICATEONWRITE = "replicate_on_write";
     public static final String KW_CACHING = "caching";
     public static final String KW_DEFAULT_TIME_TO_LIVE = "default_time_to_live";
-    public static final String KW_INDEX_INTERVAL = "index_interval";
+    public static final String KW_MIN_INDEX_INTERVAL = "min_index_interval";
+    public static final String KW_MAX_INDEX_INTERVAL = "max_index_interval";
     public static final String KW_SPECULATIVE_RETRY = "speculative_retry";
-    public static final String KW_POPULATE_IO_CACHE_ON_FLUSH = "populate_io_cache_on_flush";
     public static final String KW_BF_FP_CHANCE = "bloom_filter_fp_chance";
     public static final String KW_MEMTABLE_FLUSH_PERIOD = "memtable_flush_period_in_ms";
 
@@ -57,16 +60,19 @@
         keywords.add(KW_READREPAIRCHANCE);
         keywords.add(KW_DCLOCALREADREPAIRCHANCE);
         keywords.add(KW_GCGRACESECONDS);
-        keywords.add(KW_REPLICATEONWRITE);
         keywords.add(KW_CACHING);
         keywords.add(KW_DEFAULT_TIME_TO_LIVE);
-        keywords.add(KW_INDEX_INTERVAL);
+        keywords.add(KW_MIN_INDEX_INTERVAL);
+        keywords.add(KW_MAX_INDEX_INTERVAL);
         keywords.add(KW_SPECULATIVE_RETRY);
-        keywords.add(KW_POPULATE_IO_CACHE_ON_FLUSH);
         keywords.add(KW_BF_FP_CHANCE);
         keywords.add(KW_COMPACTION);
         keywords.add(KW_COMPRESSION);
         keywords.add(KW_MEMTABLE_FLUSH_PERIOD);
+
+        obsoleteKeywords.add("index_interval");
+        obsoleteKeywords.add("replicate_on_write");
+        obsoleteKeywords.add("populate_io_cache_on_flush");
     }
 
     private Class<? extends AbstractCompactionStrategy> compactionStrategyClass = null;
@@ -112,7 +118,13 @@
         }
 
         validateMinimumInt(KW_DEFAULT_TIME_TO_LIVE, 0, CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE);
-        validateMinimumInt(KW_INDEX_INTERVAL, 1, CFMetaData.DEFAULT_INDEX_INTERVAL);
+
+        Integer minIndexInterval = getInt(KW_MIN_INDEX_INTERVAL, null);
+        Integer maxIndexInterval = getInt(KW_MAX_INDEX_INTERVAL, null);
+        if (minIndexInterval != null && minIndexInterval < 1)
+            throw new ConfigurationException(KW_MIN_INDEX_INTERVAL + " must be greater than 0");
+        if (maxIndexInterval != null && minIndexInterval != null && maxIndexInterval < minIndexInterval)
+            throw new ConfigurationException(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
 
         SpeculativeRetry.fromString(getString(KW_SPECULATIVE_RETRY, SpeculativeRetry.RetryType.NONE.name()));
     }
@@ -137,6 +149,21 @@
             return new HashMap<>();
         return compressionOptions;
     }
+    public CachingOptions getCachingOptions() throws SyntaxException, ConfigurationException
+    {
+        CachingOptions options = null;
+        Object val = properties.get(KW_CACHING);
+        if (val == null)
+            return null;
+        else if (val instanceof Map)
+            options = CachingOptions.fromMap(getMap(KW_CACHING));
+        else if (val instanceof String) // legacy syntax
+        {
+            options = CachingOptions.fromString(getSimple(KW_CACHING));
+            logger.warn("Setting caching options with deprecated syntax.");
+        }
+        return options;
+    }
 
     public void applyToCFMetadata(CFMetaData cfm) throws ConfigurationException, SyntaxException
     {
@@ -146,19 +173,17 @@
         cfm.readRepairChance(getDouble(KW_READREPAIRCHANCE, cfm.getReadRepairChance()));
         cfm.dcLocalReadRepairChance(getDouble(KW_DCLOCALREADREPAIRCHANCE, cfm.getDcLocalReadRepair()));
         cfm.gcGraceSeconds(getInt(KW_GCGRACESECONDS, cfm.getGcGraceSeconds()));
-        cfm.replicateOnWrite(getBoolean(KW_REPLICATEONWRITE, cfm.getReplicateOnWrite()));
         int minCompactionThreshold = toInt(KW_MINCOMPACTIONTHRESHOLD, getCompactionOptions().get(KW_MINCOMPACTIONTHRESHOLD), cfm.getMinCompactionThreshold());
         int maxCompactionThreshold = toInt(KW_MAXCOMPACTIONTHRESHOLD, getCompactionOptions().get(KW_MAXCOMPACTIONTHRESHOLD), cfm.getMaxCompactionThreshold());
         if (minCompactionThreshold <= 0 || maxCompactionThreshold <= 0)
             throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been deprecated, set the compaction option 'enabled' to false instead.");
         cfm.minCompactionThreshold(minCompactionThreshold);
         cfm.maxCompactionThreshold(maxCompactionThreshold);
-        cfm.caching(CFMetaData.Caching.fromString(getString(KW_CACHING, cfm.getCaching().toString())));
         cfm.defaultTimeToLive(getInt(KW_DEFAULT_TIME_TO_LIVE, cfm.getDefaultTimeToLive()));
         cfm.speculativeRetry(CFMetaData.SpeculativeRetry.fromString(getString(KW_SPECULATIVE_RETRY, cfm.getSpeculativeRetry().toString())));
         cfm.memtableFlushPeriod(getInt(KW_MEMTABLE_FLUSH_PERIOD, cfm.getMemtableFlushPeriod()));
-        cfm.populateIoCacheOnFlush(getBoolean(KW_POPULATE_IO_CACHE_ON_FLUSH, cfm.populateIoCacheOnFlush()));
-        cfm.indexInterval(getInt(KW_INDEX_INTERVAL, cfm.getIndexInterval()));
+        cfm.minIndexInterval(getInt(KW_MIN_INDEX_INTERVAL, cfm.getMinIndexInterval()));
+        cfm.maxIndexInterval(getInt(KW_MAX_INDEX_INTERVAL, cfm.getMaxIndexInterval()));
 
         if (compactionStrategyClass != null)
         {
@@ -170,6 +195,9 @@
 
         if (!getCompressionOptions().isEmpty())
             cfm.compressionParameters(CompressionParameters.create(getCompressionOptions()));
+        CachingOptions cachingOptions = getCachingOptions();
+        if (cachingOptions != null)
+            cfm.caching(cachingOptions);
     }
 
     @Override
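For reference, the CQL table options this maps to look something like the following sketch (keyspace and table names are hypothetical; the legacy string form of 'caching' still parses but logs a deprecation warning, and index_interval, replicate_on_write and populate_io_cache_on_flush become obsolete keywords):

    ALTER TABLE ks.t
        WITH min_index_interval = 128
         AND max_index_interval = 2048
         AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'};
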
diff --git a/src/java/org/apache/cassandra/cql3/statements/CFStatement.java b/src/java/org/apache/cassandra/cql3/statements/CFStatement.java
index 2ccc203..9b2987c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CFStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CFStatement.java
@@ -37,11 +37,20 @@
     {
         if (!cfName.hasKeyspace())
         {
-            // XXX: We explicitely only want to call state.getKeyspace() in this case, don't move it outside the if.
+            // XXX: We explicitly only want to call state.getKeyspace() in this case, as we don't want to throw
+            // if not logged into any keyspace when a keyspace is explicitly set on the statement. So don't move
+            // the call outside the 'if' or replace the method by 'prepareKeyspace(state.getKeyspace())'
             cfName.setKeyspace(state.getKeyspace(), true);
         }
     }
 
+    // Only for internal calls, use the version with ClientState for user queries
+    public void prepareKeyspace(String keyspace)
+    {
+        if (!cfName.hasKeyspace())
+            cfName.setKeyspace(keyspace, true);
+    }
+
     public String keyspace()
     {
         assert cfName.hasKeyspace() : "The statement hasn't be prepared correctly";
diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasConditions.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasConditions.java
deleted file mode 100644
index 775a236..0000000
--- a/src/java/org/apache/cassandra/cql3/statements/CQL3CasConditions.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.statements;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.CASConditions;
-import org.apache.cassandra.utils.Pair;
-
-/**
- * Processed CAS conditions on potentially multiple rows of the same partition.
- */
-public class CQL3CasConditions implements CASConditions
-{
-    private final CFMetaData cfm;
-    private final long now;
-
-    // We index RowCondition by the prefix of the row they applied to for 2 reasons:
-    //   1) this allows to keep things sorted to build the ColumnSlice array below
-    //   2) this allows to detect when contradictory conditions are set (not exists with some other conditions on the same row)
-    private final SortedMap<ByteBuffer, RowCondition> conditions;
-
-    public CQL3CasConditions(CFMetaData cfm, long now)
-    {
-        this.cfm = cfm;
-        // We will use now for Column.isLive() which expects milliseconds but the argument is in microseconds.
-        this.now = now / 1000;
-        this.conditions = new TreeMap<>(cfm.comparator);
-    }
-
-    public void addNotExist(ColumnNameBuilder prefix) throws InvalidRequestException
-    {
-        RowCondition previous = conditions.put(prefix.build(), new NotExistCondition(prefix, now));
-        if (previous != null && !(previous instanceof NotExistCondition))
-        {
-            // these should be prevented by the parser, but it doesn't hurt to check
-            if (previous instanceof ExistCondition)
-                throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
-            else
-                throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row");
-        }
-    }
-
-    public void addExist(ColumnNameBuilder prefix) throws InvalidRequestException
-    {
-        RowCondition previous = conditions.put(prefix.build(), new ExistCondition(prefix, now));
-        // this should be prevented by the parser, but it doesn't hurt to check
-        if (previous != null && previous instanceof NotExistCondition)
-            throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
-    }
-
-    public void addConditions(ColumnNameBuilder prefix, Collection<ColumnCondition> conds, List<ByteBuffer> variables) throws InvalidRequestException
-    {
-        ByteBuffer b = prefix.build();
-        RowCondition condition = conditions.get(b);
-        if (condition == null)
-        {
-            condition = new ColumnsConditions(prefix, now);
-            conditions.put(b, condition);
-        }
-        else if (!(condition instanceof ColumnsConditions))
-        {
-            throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row");
-        }
-        ((ColumnsConditions)condition).addConditions(conds, variables);
-    }
-
-    public IDiskAtomFilter readFilter()
-    {
-        assert !conditions.isEmpty();
-        ColumnSlice[] slices = new ColumnSlice[conditions.size()];
-        int i = 0;
-        // We always read CQL rows entirely as on CAS failure we want to be able to distinguish between "row exists
-        // but all values on why there were conditions are null" and "row doesn't exists", and we can't rely on the
-        // row marker for that (see #6623)
-        for (Map.Entry<ByteBuffer, RowCondition> entry : conditions.entrySet())
-            slices[i++] = new ColumnSlice(entry.getKey(), entry.getValue().rowPrefix.buildAsEndOfRange());
-
-        int toGroup = cfm.getCfDef().isCompact ? -1 : cfm.clusteringKeyColumns().size();
-        return new SliceQueryFilter(slices, false, slices.length, toGroup);
-    }
-
-    public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
-    {
-        for (RowCondition condition : conditions.values())
-        {
-            if (!condition.appliesTo(current))
-                return false;
-        }
-        return true;
-    }
-
-    private static abstract class RowCondition
-    {
-        public final ColumnNameBuilder rowPrefix;
-        protected final long now;
-
-        protected RowCondition(ColumnNameBuilder rowPrefix, long now)
-        {
-            this.rowPrefix = rowPrefix;
-            this.now = now;
-        }
-
-        public abstract boolean appliesTo(ColumnFamily current) throws InvalidRequestException;
-    }
-
-    private static class NotExistCondition extends RowCondition
-    {
-        private NotExistCondition(ColumnNameBuilder rowPrefix, long now)
-        {
-            super(rowPrefix, now);
-        }
-
-        public boolean appliesTo(ColumnFamily current)
-        {
-            if (current == null)
-                return true;
-
-            Iterator<Column> iter = current.iterator(new ColumnSlice[]{ new ColumnSlice(rowPrefix.build(), rowPrefix.buildAsEndOfRange()) });
-            while (iter.hasNext())
-                if (iter.next().isLive(now))
-                    return false;
-            return true;
-        }
-    }
-
-    private static class ExistCondition extends RowCondition
-    {
-        private ExistCondition(ColumnNameBuilder rowPrefix, long now)
-        {
-            super (rowPrefix, now);
-        }
-
-        public boolean appliesTo(ColumnFamily current)
-        {
-            if (current == null)
-                return false;
-
-            Iterator<Column> iter = current.iterator(new ColumnSlice[]{ new ColumnSlice(rowPrefix.build(), rowPrefix.buildAsEndOfRange())});
-            while (iter.hasNext())
-                if (iter.next().isLive(now))
-                    return true;
-            return false;
-        }
-    }
-
-    private static class ColumnsConditions extends RowCondition
-    {
-        private final Map<Pair<ColumnIdentifier, ByteBuffer>, ColumnCondition.Bound> conditions = new HashMap<>();
-
-        private ColumnsConditions(ColumnNameBuilder rowPrefix, long now)
-        {
-            super(rowPrefix, now);
-        }
-
-        public void addConditions(Collection<ColumnCondition> conds, List<ByteBuffer> variables) throws InvalidRequestException
-        {
-            for (ColumnCondition condition : conds)
-            {
-                // We will need the variables in appliesTo but with protocol batches, each condition in this object can have a
-                // different list of variables.
-                ColumnCondition.Bound current = condition.bind(variables);
-                ColumnCondition.Bound previous = conditions.put(Pair.create(condition.column.name, current.getCollectionElementValue()), current);
-                // If 2 conditions are actually equal, let it slide
-                if (previous != null && !previous.equals(current))
-                    throw new InvalidRequestException("Duplicate and incompatible conditions for column " + condition.column.name);
-            }
-        }
-
-        public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
-        {
-            if (current == null)
-                return conditions.isEmpty();
-
-            for (ColumnCondition.Bound condition : conditions.values())
-                if (!condition.appliesTo(rowPrefix, current, now))
-                    return false;
-            return true;
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
new file mode 100644
index 0000000..eb29012
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.CASRequest;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Processed CAS conditions and update on potentially multiple rows of the same partition.
+ */
+public class CQL3CasRequest implements CASRequest
+{
+    private final CFMetaData cfm;
+    private final ByteBuffer key;
+    private final long now;
+    private final boolean isBatch;
+
+    // We index RowConditions by the prefix of the row they apply to for 2 reasons:
+    //   1) it keeps things sorted, which we rely on to build the ColumnSlice array below
+    //   2) it lets us detect when contradictory conditions are set (not exists combined with other conditions on the same row)
+    private final SortedMap<Composite, RowCondition> conditions;
+
+    private final List<RowUpdate> updates = new ArrayList<>();
+
+    public CQL3CasRequest(CFMetaData cfm, ByteBuffer key, boolean isBatch)
+    {
+        this.cfm = cfm;
+        // When checking if conditions apply, we want to use a fixed reference time for a whole request to check
+        // for expired cells. Note that this is unrelated to the cell timestamp.
+        this.now = System.currentTimeMillis();
+        this.key = key;
+        this.conditions = new TreeMap<>(cfm.comparator);
+        this.isBatch = isBatch;
+    }
+
+    public void addRowUpdate(Composite prefix, ModificationStatement stmt, QueryOptions options, long timestamp)
+    {
+        updates.add(new RowUpdate(prefix, stmt, options, timestamp));
+    }
+
+    public void addNotExist(Composite prefix) throws InvalidRequestException
+    {
+        RowCondition previous = conditions.put(prefix, new NotExistCondition(prefix, now));
+        if (previous != null && !(previous instanceof NotExistCondition))
+        {
+            // these should be prevented by the parser, but it doesn't hurt to check
+            if (previous instanceof ExistCondition)
+                throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
+            else
+                throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row");
+        }
+    }
+
+    public void addExist(Composite prefix) throws InvalidRequestException
+    {
+        RowCondition previous = conditions.put(prefix, new ExistCondition(prefix, now));
+        // this should be prevented by the parser, but it doesn't hurt to check
+        if (previous != null && previous instanceof NotExistCondition)
+            throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
+    }
+
+    public void addConditions(Composite prefix, Collection<ColumnCondition> conds, QueryOptions options) throws InvalidRequestException
+    {
+        RowCondition condition = conditions.get(prefix);
+        if (condition == null)
+        {
+            condition = new ColumnsConditions(prefix, now);
+            conditions.put(prefix, condition);
+        }
+        else if (!(condition instanceof ColumnsConditions))
+        {
+            throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row");
+        }
+        ((ColumnsConditions)condition).addConditions(conds, options);
+    }
+
+    public IDiskAtomFilter readFilter()
+    {
+        assert !conditions.isEmpty();
+        ColumnSlice[] slices = new ColumnSlice[conditions.size()];
+        int i = 0;
+        // We always read CQL rows entirely as on CAS failure we want to be able to distinguish between "row exists
+        // but all values for which there were conditions are null" and "row doesn't exist", and we can't rely on the
+        // row marker for that (see #6623)
+        for (Composite prefix : conditions.keySet())
+            slices[i++] = prefix.slice();
+
+        int toGroup = cfm.comparator.isDense() ? -1 : cfm.clusteringColumns().size();
+        slices = ColumnSlice.deoverlapSlices(slices, cfm.comparator);
+        assert ColumnSlice.validateSlices(slices, cfm.comparator, false);
+        return new SliceQueryFilter(slices, false, slices.length, toGroup);
+    }
+
+    public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
+    {
+        for (RowCondition condition : conditions.values())
+        {
+            if (!condition.appliesTo(current))
+                return false;
+        }
+        return true;
+    }
+
+    public ColumnFamily makeUpdates(ColumnFamily current) throws InvalidRequestException
+    {
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfm);
+        for (RowUpdate upd : updates)
+            upd.applyUpdates(current, cf);
+
+        if (isBatch)
+            BatchStatement.verifyBatchSize(Collections.singleton(cf));
+
+        return cf;
+    }
+
+    /**
+     * Due to some operations on lists, we can't generate the update that a given ModificationStatement performs before
+     * we have the values read by the initial Paxos read. A RowUpdate thus just stores the relevant information
+     * (including the statement itself) needed to generate those updates later. We'll have multiple RowUpdates for a
+     * batch, otherwise we'll have only one.
+     */
+    private class RowUpdate
+    {
+        private final Composite rowPrefix;
+        private final ModificationStatement stmt;
+        private final QueryOptions options;
+        private final long timestamp;
+
+        private RowUpdate(Composite rowPrefix, ModificationStatement stmt, QueryOptions options, long timestamp)
+        {
+            this.rowPrefix = rowPrefix;
+            this.stmt = stmt;
+            this.options = options;
+            this.timestamp = timestamp;
+        }
+
+        public void applyUpdates(ColumnFamily current, ColumnFamily updates) throws InvalidRequestException
+        {
+            Map<ByteBuffer, CQL3Row> map = null;
+            if (stmt.requiresRead())
+            {
+                // Uses the "current" values read by Paxos for list operations that require a read
+                Iterator<CQL3Row> iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(current.iterator(new ColumnSlice[]{ rowPrefix.slice() }));
+                if (iter.hasNext())
+                {
+                    map = Collections.singletonMap(key, iter.next());
+                    assert !iter.hasNext() : "We shouldn't be updating more than one CQL row per ModificationStatement";
+                }
+            }
+
+            UpdateParameters params = new UpdateParameters(cfm, options, timestamp, stmt.getTimeToLive(options), map);
+            stmt.addUpdateForKey(updates, key, rowPrefix, params);
+        }
+    }
+
+    private static abstract class RowCondition
+    {
+        public final Composite rowPrefix;
+        protected final long now;
+
+        protected RowCondition(Composite rowPrefix, long now)
+        {
+            this.rowPrefix = rowPrefix;
+            this.now = now;
+        }
+
+        public abstract boolean appliesTo(ColumnFamily current) throws InvalidRequestException;
+    }
+
+    private static class NotExistCondition extends RowCondition
+    {
+        private NotExistCondition(Composite rowPrefix, long now)
+        {
+            super(rowPrefix, now);
+        }
+
+        public boolean appliesTo(ColumnFamily current)
+        {
+            if (current == null)
+                return true;
+
+            Iterator<Cell> iter = current.iterator(new ColumnSlice[]{ rowPrefix.slice() });
+            while (iter.hasNext())
+                if (iter.next().isLive(now))
+                    return false;
+            return true;
+        }
+    }
+
+    private static class ExistCondition extends RowCondition
+    {
+        private ExistCondition(Composite rowPrefix, long now)
+        {
+            super (rowPrefix, now);
+        }
+
+        public boolean appliesTo(ColumnFamily current)
+        {
+            if (current == null)
+                return false;
+
+            Iterator<Cell> iter = current.iterator(new ColumnSlice[]{ rowPrefix.slice() });
+            while (iter.hasNext())
+                if (iter.next().isLive(now))
+                    return true;
+            return false;
+        }
+    }
+
+    private static class ColumnsConditions extends RowCondition
+    {
+        private final Multimap<Pair<ColumnIdentifier, ByteBuffer>, ColumnCondition.Bound> conditions = HashMultimap.create();
+
+        private ColumnsConditions(Composite rowPrefix, long now)
+        {
+            super(rowPrefix, now);
+        }
+
+        public void addConditions(Collection<ColumnCondition> conds, QueryOptions options) throws InvalidRequestException
+        {
+            for (ColumnCondition condition : conds)
+            {
+                ColumnCondition.Bound current = condition.bind(options);
+                conditions.put(Pair.create(condition.column.name, current.getCollectionElementValue()), current);
+            }
+        }
+
+        public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
+        {
+            if (current == null)
+                return conditions.isEmpty();
+
+            for (ColumnCondition.Bound condition : conditions.values())
+                if (!condition.appliesTo(rowPrefix, current, now))
+                    return false;
+            return true;
+        }
+    }
+}
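The class above backs conditional (Paxos/CAS) batches. Roughly, in CQL terms (table and column names hypothetical), every statement in such a batch must target the same partition of the same table, and their conditions and updates are all folded into a single CQL3CasRequest:

    BEGIN BATCH
        INSERT INTO users (id, name) VALUES (42, 'sam') IF NOT EXISTS;
        UPDATE users SET email = 's@example.com' WHERE id = 42;
    APPLY BATCH;

    -- Rejected: "Batch with conditions cannot span multiple partitions"
    BEGIN BATCH
        UPDATE users SET name = 'a' WHERE id = 1 IF name = 'b';
        UPDATE users SET name = 'c' WHERE id = 2 IF name = 'd';
    APPLY BATCH;
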
diff --git a/src/java/org/apache/cassandra/cql3/statements/ColumnGroupMap.java b/src/java/org/apache/cassandra/cql3/statements/ColumnGroupMap.java
deleted file mode 100644
index 58428ed..0000000
--- a/src/java/org/apache/cassandra/cql3/statements/ColumnGroupMap.java
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.statements;
-
-import java.nio.ByteBuffer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.utils.Pair;
-
-public class ColumnGroupMap
-{
-    public static ColumnGroupMap EMPTY = new ColumnGroupMap(null, false);
-
-    private final ByteBuffer[] fullPath;
-    private final Map<ByteBuffer, Value> map = new HashMap<ByteBuffer, Value>();
-    public final boolean isStatic; // Whether or not the group correspond to "static" cells
-
-    private ColumnGroupMap(ByteBuffer[] fullPath, boolean isStatic)
-    {
-        this.fullPath = fullPath;
-        this.isStatic = isStatic;
-    }
-
-    private void add(ByteBuffer[] fullName, int idx, Column column)
-    {
-        ByteBuffer columnName = fullName[idx];
-        if (fullName.length == idx + 2)
-        {
-            // It's a collection
-            Value v = map.get(columnName);
-            if (v == null)
-            {
-                v = new Collection();
-                map.put(columnName, v);
-            }
-            assert v instanceof Collection;
-
-            ((Collection)v).add(Pair.create(fullName[idx + 1], column));
-        }
-        else
-        {
-            assert !map.containsKey(columnName);
-            map.put(columnName, new Simple(column));
-        }
-    }
-
-    public ByteBuffer getKeyComponent(int pos)
-    {
-        return fullPath == null ? null : fullPath[pos];
-    }
-
-    public Column getSimple(ByteBuffer key)
-    {
-        Value v = map.get(key);
-        if (v == null)
-            return null;
-
-        assert v instanceof Simple;
-        return ((Simple)v).column;
-    }
-
-    public List<Pair<ByteBuffer, Column>> getCollection(ByteBuffer key)
-    {
-        Value v = map.get(key);
-        if (v == null)
-            return null;
-
-        assert v instanceof Collection;
-        return (List<Pair<ByteBuffer, Column>>)v;
-    }
-
-    public boolean hasValueFor(ByteBuffer key)
-    {
-        return map.containsKey(key);
-    }
-
-    private interface Value {};
-
-    private static class Simple implements Value
-    {
-        public final Column column;
-
-        Simple(Column column)
-        {
-            this.column = column;
-        }
-    }
-
-    private static class Collection extends ArrayList<Pair<ByteBuffer, Column>> implements Value {}
-
-    public static class Builder
-    {
-        private final CompositeType composite;
-        private final int idx;
-        private final long now;
-        private ByteBuffer[] previous;
-
-        private final List<ColumnGroupMap> groups = new ArrayList<ColumnGroupMap>();
-        private ColumnGroupMap currentGroup;
-
-        public Builder(CompositeType composite, boolean hasCollections, long now)
-        {
-            this.composite = composite;
-            this.idx = composite.types.size() - (hasCollections ? 2 : 1);
-            this.now = now;
-        }
-
-        public void add(Column c)
-        {
-            if (c.isMarkedForDelete(now))
-                return;
-
-            ByteBuffer[] current = composite.split(c.name());
-
-            if (currentGroup == null)
-            {
-                currentGroup = new ColumnGroupMap(current, CompositeType.isStaticName(c.name()));
-                currentGroup.add(current, idx, c);
-                previous = current;
-                return;
-            }
-
-            if ((currentGroup.isStatic && !CompositeType.isStaticName(c.name())) || !isSameGroup(current))
-            {
-                groups.add(currentGroup);
-                // Note that we know that only the first group built can be static
-                currentGroup = new ColumnGroupMap(current, false);
-            }
-            currentGroup.add(current, idx, c);
-            previous = current;
-        }
-
-        /**
-         * For sparse composite, returns wheter the column belong to the same
-         * cqlRow than the previously added, based on the full list of component
-         * in the name.
-         * Two columns do belong together if they differ only by the last
-         * component.
-         */
-        private boolean isSameGroup(ByteBuffer[] c)
-        {
-            for (int i = 0; i < idx; i++)
-            {
-                AbstractType<?> comp = composite.types.get(i);
-                if (comp.compare(c[i], previous[i]) != 0)
-                    return false;
-            }
-            return true;
-        }
-
-        public List<ColumnGroupMap> groups()
-        {
-            if (currentGroup != null)
-            {
-                groups.add(currentGroup);
-                currentGroup = null;
-            }
-            return groups;
-        }
-
-        public boolean isEmpty()
-        {
-            return currentGroup == null && groups.isEmpty();
-        }
-
-        public ColumnGroupMap firstGroup()
-        {
-            if (currentGroup != null)
-            {
-                groups.add(currentGroup);
-                currentGroup = null;
-            }
-            return groups.get(0);
-        }
-
-        public void discardFirst()
-        {
-            groups.remove(0);
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java
index 8b40978..4809187 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java
@@ -18,21 +18,24 @@
 package org.apache.cassandra.cql3.statements;
 
 import java.util.Collections;
+import java.util.Map;
 
+import com.google.common.collect.ImmutableMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.IndexType;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 /** A <code>CREATE INDEX</code> statement parsed from a CQL query. */
 public class CreateIndexStatement extends SchemaAlteringStatement
@@ -40,19 +43,19 @@
     private static final Logger logger = LoggerFactory.getLogger(CreateIndexStatement.class);
 
     private final String indexName;
-    private final ColumnIdentifier columnName;
+    private final IndexTarget target;
     private final IndexPropDefs properties;
     private final boolean ifNotExists;
 
     public CreateIndexStatement(CFName name,
                                 String indexName,
-                                ColumnIdentifier columnName,
+                                IndexTarget target,
                                 IndexPropDefs properties,
                                 boolean ifNotExists)
     {
         super(name);
         this.indexName = indexName;
-        this.columnName = columnName;
+        this.target = target;
         this.properties = properties;
         this.ifNotExists = ifNotExists;
     }
@@ -65,16 +68,30 @@
     public void validate(ClientState state) throws RequestValidationException
     {
         CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
-        if (cfm.getDefaultValidator().isCommutative())
+        if (cfm.isCounter())
             throw new InvalidRequestException("Secondary indexes are not supported on counter tables");
 
-        ColumnDefinition cd = cfm.getColumnDefinition(columnName.key);
+        ColumnDefinition cd = cfm.getColumnDefinition(target.column);
 
         if (cd == null)
-            throw new InvalidRequestException("No column definition found for column " + columnName);
+            throw new InvalidRequestException("No column definition found for column " + target.column);
+
+        boolean isMap = cd.type instanceof MapType;
+        if (target.isCollectionKeys && !isMap)
+            throw new InvalidRequestException("Cannot create index on keys of column " + target.column + " with non-map type");
 
         if (cd.getIndexType() != null)
         {
+            boolean previousIsKeys = cd.getIndexOptions().containsKey("index_keys");
+            if (isMap && target.isCollectionKeys != previousIsKeys)
+            {
+                String msg = "Cannot create index on %s %s, an index on %s %s already exists and indexing "
+                           + "a map on both keys and values at the same time is not currently supported";
+                throw new InvalidRequestException(String.format(msg,
+                                                                target.column, target.isCollectionKeys ? "keys" : "values",
+                                                                target.column, previousIsKeys ? "keys" : "values"));
+            }
+
             if (ifNotExists)
                 return;
             else
@@ -84,8 +101,8 @@
         properties.validate();
 
         // TODO: we could lift that limitation
-        if (cfm.getCfDef().isCompact && cd.type != ColumnDefinition.Type.REGULAR)
-            throw new InvalidRequestException(String.format("Secondary index on %s column %s is not yet supported for compact table", cd.type, columnName));
+        if (cfm.comparator.isDense() && cd.kind != ColumnDefinition.Kind.REGULAR)
+            throw new InvalidRequestException(String.format("Secondary index on %s column %s is not yet supported for compact table", cd.kind, target.column));
 
         // It would be possible to support 2ndary index on static columns (but not without modifications of at least ExtendedFilter and
         // CompositesIndex) and maybe we should, but that means a query like:
@@ -93,41 +110,50 @@
         // would pull the full partition every time the static column of partition is 'bar', which sounds like offering a
         // fair potential for foot-shooting, so I prefer leaving that to a follow up ticket once we have identified cases where
         // such indexing is actually useful.
-        if (cd.type == ColumnDefinition.Type.STATIC)
+        if (cd.isStatic())
             throw new InvalidRequestException("Secondary indexes are not allowed on static columns");
 
-        if (cd.getValidator().isCollection() && !properties.isCustom)
-            throw new InvalidRequestException("Indexes on collections are no yet supported");
-
-        if (cd.type == ColumnDefinition.Type.PARTITION_KEY && cd.componentIndex == null)
-            throw new InvalidRequestException(String.format("Cannot add secondary index to already primarily indexed column %s", columnName));
+        if (cd.kind == ColumnDefinition.Kind.PARTITION_KEY && cd.isOnAllComponents())
+            throw new InvalidRequestException(String.format("Cannot add secondary index to already primarily indexed column %s", target.column));
     }
 
-    public boolean announceMigration() throws RequestValidationException
+    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
     {
-        logger.debug("Updating column {} definition for index {}", columnName, indexName);
-        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).clone();
-        ColumnDefinition cd = cfm.getColumnDefinition(columnName.key);
+        logger.debug("Updating column {} definition for index {}", target.column, indexName);
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
+        ColumnDefinition cd = cfm.getColumnDefinition(target.column);
 
         if (cd.getIndexType() != null && ifNotExists)
             return false;
 
         if (properties.isCustom)
+        {
             cd.setIndexType(IndexType.CUSTOM, properties.getOptions());
-        else if (cfm.getCfDef().isComposite)
-            cd.setIndexType(IndexType.COMPOSITES, Collections.<String, String>emptyMap());
+        }
+        else if (cfm.comparator.isCompound())
+        {
+            Map<String, String> options = Collections.emptyMap();
+            // Record in the index options whether we index the keys or the values of the
+            // collection, so that the validation above and the 2ndary index implementation
+            // can tell which part of the map an existing index applies to.
+            if (cd.type.isCollection())
+                options = ImmutableMap.of(target.isCollectionKeys ? "index_keys" : "index_values", "");
+            cd.setIndexType(IndexType.COMPOSITES, options);
+        }
         else
+        {
             cd.setIndexType(IndexType.KEYS, Collections.<String, String>emptyMap());
+        }
 
         cd.setIndexName(indexName);
         cfm.addDefaultIndexNames();
-        MigrationManager.announceColumnFamilyUpdate(cfm, false);
+        MigrationManager.announceColumnFamilyUpdate(cfm, false, isLocalOnly);
         return true;
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
         // Creating an index is akin to updating the CF
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }
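
The IndexTarget plumbing above lets CREATE INDEX target either a map's keys or its values, and the validate() change refuses to index both parts of the same map at once by inspecting the "index_keys" option of any existing index. A minimal standalone sketch of that rule, with invented class and parameter names (illustrative only, not part of the patch):

// Illustrative only: mirrors the keys-vs-values rule enforced in validate() above.
final class MapIndexRule
{
    /** Returns null if the CREATE INDEX request is acceptable, else an error message. */
    static String check(boolean indexExists, boolean existingIndexesKeys, boolean requestKeys)
    {
        if (!indexExists)
            return null;                      // nothing to conflict with
        if (existingIndexesKeys != requestKeys)
            return "an index on the other part of the map already exists; indexing both "
                 + "keys and values of the same map at the same time is not supported";
        return null;                          // same kind already indexed; IF NOT EXISTS handles this
    }

    public static void main(String[] args)
    {
        System.out.println(check(false, false, true)); // null: a fresh keys index is fine
        System.out.println(check(true,  false, true)); // conflict: a values index already exists
    }
}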
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java
index 7a8473a..8281cbd 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java
@@ -30,7 +30,7 @@
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 /** A <code>CREATE KEYSPACE</code> statement parsed from a CQL query. */
 public class CreateKeyspaceStatement extends SchemaAlteringStatement
@@ -97,11 +97,11 @@
                                                                 attrs.getReplicationOptions());
     }
 
-    public boolean announceMigration() throws RequestValidationException
+    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
     {
         try
         {
-            MigrationManager.announceNewKeyspace(attrs.asKSMetadata(name));
+            MigrationManager.announceNewKeyspace(attrs.asKSMetadata(name), isLocalOnly);
             return true;
         }
         catch (AlreadyExistsException e)
@@ -112,8 +112,8 @@
         }
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.CREATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, keyspace());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java
index efaf36d..44b187d 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.ColumnFamilyType;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.AlreadyExistsException;
@@ -37,13 +38,13 @@
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /** A <code>CREATE TABLE</code> parsed from a CQL query statement. */
 public class CreateTableStatement extends SchemaAlteringStatement
 {
-    public AbstractType<?> comparator;
+    public CellNameType comparator;
     private AbstractType<?> defaultValidator;
     private AbstractType<?> keyValidator;
 
@@ -91,34 +92,26 @@
     }
 
     // Column definitions
-    private Map<ByteBuffer, ColumnDefinition> getColumns()
+    private List<ColumnDefinition> getColumns(CFMetaData cfm)
     {
-        Map<ByteBuffer, ColumnDefinition> columnDefs = new HashMap<ByteBuffer, ColumnDefinition>();
-        Integer componentIndex = null;
-        if (comparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType) comparator;
-            componentIndex = ct.types.get(ct.types.size() - 1) instanceof ColumnToCollectionType
-                           ? ct.types.size() - 2
-                           : ct.types.size() - 1;
-        }
-
+        List<ColumnDefinition> columnDefs = new ArrayList<>(columns.size());
+        Integer componentIndex = comparator.isCompound() ? comparator.clusteringPrefixSize() : null;
         for (Map.Entry<ColumnIdentifier, AbstractType> col : columns.entrySet())
         {
             ColumnIdentifier id = col.getKey();
-            columnDefs.put(id.key, staticColumns.contains(id)
-                                   ? ColumnDefinition.staticDef(id.key, col.getValue(), componentIndex)
-                                   : ColumnDefinition.regularDef(id.key, col.getValue(), componentIndex));
+            columnDefs.add(staticColumns.contains(id)
+                           ? ColumnDefinition.staticDef(cfm, col.getKey().bytes, col.getValue(), componentIndex)
+                           : ColumnDefinition.regularDef(cfm, col.getKey().bytes, col.getValue(), componentIndex));
         }
 
         return columnDefs;
     }
 
-    public boolean announceMigration() throws RequestValidationException
+    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
     {
         try
         {
-            MigrationManager.announceNewColumnFamily(getCFMetaData());
+            MigrationManager.announceNewColumnFamily(getCFMetaData(), isLocalOnly);
             return true;
         }
         catch (AlreadyExistsException e)
@@ -129,9 +122,9 @@
         }
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.CREATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 
     /**
@@ -147,8 +140,7 @@
         newCFMD = new CFMetaData(keyspace(),
                                  columnFamily(),
                                  ColumnFamilyType.Standard,
-                                 comparator,
-                                 null);
+                                 comparator);
         applyPropertiesTo(newCFMD);
         return newCFMD;
     }
@@ -157,20 +149,20 @@
     {
         cfmd.defaultValidator(defaultValidator)
             .keyValidator(keyValidator)
-            .columnMetadata(getColumns())
+            .addAllColumnDefinitions(getColumns(cfmd))
             .isDense(isDense);
 
-        cfmd.addColumnMetadataFromAliases(keyAliases, keyValidator, ColumnDefinition.Type.PARTITION_KEY);
-        cfmd.addColumnMetadataFromAliases(columnAliases, comparator, ColumnDefinition.Type.CLUSTERING_KEY);
+        cfmd.addColumnMetadataFromAliases(keyAliases, keyValidator, ColumnDefinition.Kind.PARTITION_KEY);
+        cfmd.addColumnMetadataFromAliases(columnAliases, comparator.asAbstractType(), ColumnDefinition.Kind.CLUSTERING_COLUMN);
         if (valueAlias != null)
-            cfmd.addColumnMetadataFromAliases(Collections.<ByteBuffer>singletonList(valueAlias), defaultValidator, ColumnDefinition.Type.COMPACT_VALUE);
+            cfmd.addColumnMetadataFromAliases(Collections.<ByteBuffer>singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 
         properties.applyToCFMetadata(cfmd);
     }
 
     public static class RawStatement extends CFStatement
     {
-        private final Map<ColumnIdentifier, CQL3Type> definitions = new HashMap<ColumnIdentifier, CQL3Type>();
+        private final Map<ColumnIdentifier, CQL3Type.Raw> definitions = new HashMap<>();
         public final CFPropDefs properties = new CFPropDefs();
 
         private final List<List<ColumnIdentifier>> keyAliases = new ArrayList<List<ColumnIdentifier>>();
@@ -209,15 +201,15 @@
             CreateTableStatement stmt = new CreateTableStatement(cfName, properties, ifNotExists, staticColumns);
 
             Map<ByteBuffer, CollectionType> definedCollections = null;
-            for (Map.Entry<ColumnIdentifier, CQL3Type> entry : definitions.entrySet())
+            for (Map.Entry<ColumnIdentifier, CQL3Type.Raw> entry : definitions.entrySet())
             {
                 ColumnIdentifier id = entry.getKey();
-                CQL3Type pt = entry.getValue();
+                CQL3Type pt = entry.getValue().prepare(keyspace());
                 if (pt.isCollection())
                 {
                     if (definedCollections == null)
                         definedCollections = new HashMap<ByteBuffer, CollectionType>();
-                    definedCollections.put(id.key, (CollectionType)pt.getType());
+                    definedCollections.put(id.bytes, (CollectionType)pt.getType());
                 }
                 stmt.columns.put(id, pt.getType()); // we'll remove what is not a column below
             }
@@ -232,7 +224,7 @@
             List<AbstractType<?>> keyTypes = new ArrayList<AbstractType<?>>(kAliases.size());
             for (ColumnIdentifier alias : kAliases)
             {
-                stmt.keyAliases.add(alias.key);
+                stmt.keyAliases.add(alias.bytes);
                 AbstractType<?> t = getTypeAndRemove(stmt.columns, alias);
                 if (t instanceof CounterColumnType)
                     throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", alias));
@@ -258,15 +250,13 @@
                     if (definedCollections != null)
                         throw new InvalidRequestException("Collection types are not supported with COMPACT STORAGE");
 
-                    stmt.comparator = CFDefinition.definitionType;
+                    stmt.comparator = new SimpleSparseCellNameType(UTF8Type.instance);
                 }
                 else
                 {
-                    List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(definedCollections == null ? 1 : 2);
-                    types.add(CFDefinition.definitionType);
-                    if (definedCollections != null)
-                        types.add(ColumnToCollectionType.getInstance(definedCollections));
-                    stmt.comparator = CompositeType.getInstance(types);
+                    stmt.comparator = definedCollections == null
+                                    ? new CompoundSparseCellNameType(Collections.<AbstractType<?>>emptyList())
+                                    : new CompoundSparseCellNameType.WithCollection(Collections.<AbstractType<?>>emptyList(), ColumnToCollectionType.getInstance(definedCollections));
                 }
             }
             else
@@ -277,20 +267,23 @@
                 {
                     if (definedCollections != null)
                         throw new InvalidRequestException("Collection types are not supported with COMPACT STORAGE");
+
                     ColumnIdentifier alias = columnAliases.get(0);
-                    stmt.columnAliases.add(alias.key);
-                    stmt.comparator = getTypeAndRemove(stmt.columns, alias);
-                    if (stmt.comparator instanceof CounterColumnType)
-                        throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", alias));
                     if (staticColumns.contains(alias))
                         throw new InvalidRequestException(String.format("Static column %s cannot be part of the PRIMARY KEY", alias));
+
+                    stmt.columnAliases.add(alias.bytes);
+                    AbstractType<?> at = getTypeAndRemove(stmt.columns, alias);
+                    if (at instanceof CounterColumnType)
+                        throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", alias));
+                    stmt.comparator = new SimpleDenseCellNameType(at);
                 }
                 else
                 {
                     List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(columnAliases.size() + 1);
                     for (ColumnIdentifier t : columnAliases)
                     {
-                        stmt.columnAliases.add(t.key);
+                        stmt.columnAliases.add(t.bytes);
 
                         AbstractType<?> type = getTypeAndRemove(stmt.columns, t);
                         if (type instanceof CounterColumnType)
@@ -304,19 +297,15 @@
                     {
                         if (definedCollections != null)
                             throw new InvalidRequestException("Collection types are not supported with COMPACT STORAGE");
+
+                        stmt.comparator = new CompoundDenseCellNameType(types);
                     }
                     else
                     {
-                        // For sparse, we must add the last UTF8 component
-                        // and the collection type if there is one
-                        types.add(CFDefinition.definitionType);
-                        if (definedCollections != null)
-                            types.add(ColumnToCollectionType.getInstance(definedCollections));
+                        stmt.comparator = definedCollections == null
+                                        ? new CompoundSparseCellNameType(types)
+                                        : new CompoundSparseCellNameType.WithCollection(types, ColumnToCollectionType.getInstance(definedCollections));
                     }
-
-                    if (types.isEmpty())
-                        throw new IllegalStateException("Nonsensical empty parameter list for CompositeType");
-                    stmt.comparator = CompositeType.getInstance(types);
                 }
             }
 
@@ -349,7 +338,7 @@
 
                     Map.Entry<ColumnIdentifier, AbstractType> lastEntry = stmt.columns.entrySet().iterator().next();
                     stmt.defaultValidator = lastEntry.getValue();
-                    stmt.valueAlias = lastEntry.getKey().key;
+                    stmt.valueAlias = lastEntry.getKey().bytes;
                     stmt.columns.remove(lastEntry.getKey());
                 }
             }
@@ -405,7 +394,7 @@
             return isReversed != null && isReversed ? ReversedType.getInstance(type) : type;
         }
 
-        public void addDefinition(ColumnIdentifier def, CQL3Type type, boolean isStatic)
+        public void addDefinition(ColumnIdentifier def, CQL3Type.Raw type, boolean isStatic)
         {
             definedNames.add(def);
             definitions.put(def, type);
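
The rewritten RawStatement.prepare() above picks one of four CellNameType flavours from three inputs: whether COMPACT STORAGE was used, how many clustering columns were declared, and whether any collection columns are defined. A rough standalone decision table (the enum and method names are invented for the sketch; only the branching is meant to mirror the patch):

// Illustrative decision table for the comparator choice made in prepare() above.
enum CellNameFlavour { SIMPLE_SPARSE, SIMPLE_DENSE, COMPOUND_SPARSE, COMPOUND_SPARSE_WITH_COLLECTION, COMPOUND_DENSE }

final class ComparatorChoice
{
    static CellNameFlavour choose(boolean compactStorage, int clusteringColumns, boolean hasCollections)
    {
        if (clusteringColumns == 0)
            return compactStorage
                 ? CellNameFlavour.SIMPLE_SPARSE    // thrift-style "static" compact table
                 : (hasCollections ? CellNameFlavour.COMPOUND_SPARSE_WITH_COLLECTION
                                   : CellNameFlavour.COMPOUND_SPARSE);

        if (compactStorage)                          // dense layouts: collections are rejected earlier
            return clusteringColumns == 1 ? CellNameFlavour.SIMPLE_DENSE
                                          : CellNameFlavour.COMPOUND_DENSE;

        return hasCollections ? CellNameFlavour.COMPOUND_SPARSE_WITH_COLLECTION
                              : CellNameFlavour.COMPOUND_SPARSE;
    }

    public static void main(String[] args)
    {
        System.out.println(choose(true, 1, false));  // SIMPLE_DENSE
        System.out.println(choose(false, 2, true));  // COMPOUND_SPARSE_WITH_COLLECTION
    }
}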
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java
index 70b3acb..6ebe0d3 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java
@@ -25,12 +25,13 @@
 import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.cql3.CFName;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.triggers.TriggerExecutor;
 
 public class CreateTriggerStatement extends SchemaAlteringStatement
@@ -39,12 +40,14 @@
 
     private final String triggerName;
     private final String triggerClass;
+    private final boolean ifNotExists;
 
-    public CreateTriggerStatement(CFName name, String triggerName, String clazz)
+    public CreateTriggerStatement(CFName name, String triggerName, String clazz, boolean ifNotExists)
     {
         super(name);
         this.triggerName = triggerName;
         this.triggerClass = clazz;
+        this.ifNotExists = ifNotExists;
     }
 
     public void checkAccess(ClientState state) throws UnauthorizedException
@@ -65,17 +68,24 @@
         }
     }
 
-    public boolean announceMigration() throws ConfigurationException
+    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).clone();
-        cfm.addTriggerDefinition(TriggerDefinition.create(triggerName, triggerClass));
-        logger.info("Adding trigger with name {} and class {}", triggerName, triggerClass);
-        MigrationManager.announceColumnFamilyUpdate(cfm, false);
-        return true;
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
+
+        TriggerDefinition triggerDefinition = TriggerDefinition.create(triggerName, triggerClass);
+
+        if (!ifNotExists || !cfm.containsTriggerDefinition(triggerDefinition))
+        {
+            cfm.addTriggerDefinition(triggerDefinition);
+            logger.info("Adding trigger with name {} and class {}", triggerName, triggerClass);
+            MigrationManager.announceColumnFamilyUpdate(cfm, false, isLocalOnly);
+            return true;
+        }
+        return false;
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }
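
CreateTriggerStatement now carries an IF NOT EXISTS flag, and announceMigration() only publishes the updated table metadata when that flag is absent or the trigger is genuinely new. A tiny sketch of that branch with invented names (illustrative, not part of the patch):

// Invented helper illustrating the IF NOT EXISTS branch in announceMigration() above.
final class TriggerCreateDecision
{
    static boolean shouldAnnounce(boolean ifNotExists, boolean alreadyDefined)
    {
        // Without IF NOT EXISTS the statement always attempts the update; with IF NOT EXISTS
        // an already-defined trigger turns the statement into a silent no-op (return false above).
        return !ifNotExists || !alreadyDefined;
    }

    public static void main(String[] args)
    {
        System.out.println(shouldAnnounce(true, true));   // false: no-op
        System.out.println(shouldAnnounce(false, false)); // true: announce the new trigger
    }
}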
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java
new file mode 100644
index 0000000..82c2808
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.transport.Event;
+
+public class CreateTypeStatement extends SchemaAlteringStatement
+{
+    private final UTName name;
+    private final List<ColumnIdentifier> columnNames = new ArrayList<>();
+    private final List<CQL3Type.Raw> columnTypes = new ArrayList<>();
+    private final boolean ifNotExists;
+
+    public CreateTypeStatement(UTName name, boolean ifNotExists)
+    {
+        super();
+        this.name = name;
+        this.ifNotExists = ifNotExists;
+    }
+
+    @Override
+    public void prepareKeyspace(ClientState state) throws InvalidRequestException
+    {
+        if (!name.hasKeyspace())
+            name.setKeyspace(state.getKeyspace());
+    }
+
+    public void addDefinition(ColumnIdentifier name, CQL3Type.Raw type)
+    {
+        columnNames.add(name);
+        columnTypes.add(type);
+    }
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        state.hasKeyspaceAccess(keyspace(), Permission.CREATE);
+    }
+
+    public void validate(ClientState state) throws RequestValidationException
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        if (ksm == null)
+            throw new InvalidRequestException(String.format("Cannot add type in unknown keyspace %s", name.getKeyspace()));
+
+        if (ksm.userTypes.getType(name.getUserTypeName()) != null && !ifNotExists)
+            throw new InvalidRequestException(String.format("A user type of name %s already exists", name));
+
+        for (CQL3Type.Raw type : columnTypes)
+            if (type.isCounter())
+                throw new InvalidRequestException("A user type cannot contain counters");
+    }
+
+    public static void checkForDuplicateNames(UserType type) throws InvalidRequestException
+    {
+        for (int i = 0; i < type.size() - 1; i++)
+        {
+            ByteBuffer fieldName = type.fieldName(i);
+            for (int j = i+1; j < type.size(); j++)
+            {
+                if (fieldName.equals(type.fieldName(j)))
+                    throw new InvalidRequestException(String.format("Duplicate field name %s in type %s",
+                                                                    UTF8Type.instance.getString(fieldName),
+                                                                    UTF8Type.instance.getString(type.name)));
+            }
+        }
+    }
+
+    public Event.SchemaChange changeEvent()
+    {
+        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
+    }
+
+    @Override
+    public String keyspace()
+    {
+        return name.getKeyspace();
+    }
+
+    private UserType createType() throws InvalidRequestException
+    {
+        List<ByteBuffer> names = new ArrayList<>(columnNames.size());
+        for (ColumnIdentifier name : columnNames)
+            names.add(name.bytes);
+
+        List<AbstractType<?>> types = new ArrayList<>(columnTypes.size());
+        for (CQL3Type.Raw type : columnTypes)
+            types.add(type.prepare(keyspace()).getType());
+
+        return new UserType(name.getKeyspace(), name.getUserTypeName(), names, types);
+    }
+
+    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        assert ksm != null; // validate() would have rejected the statement otherwise
+
+        // Can happen with ifNotExists
+        if (ksm.userTypes.getType(name.getUserTypeName()) != null)
+            return false;
+
+        UserType type = createType();
+        checkForDuplicateNames(type);
+        MigrationManager.announceNewType(type, isLocalOnly);
+        return true;
+    }
+}
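
checkForDuplicateNames() above does a quadratic scan over the field names, which is perfectly fine for the handful of fields a user type typically has. For comparison, a standalone equivalent using a Set (illustrative only, not part of the patch):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Standalone equivalent of the duplicate-field check above, using a Set instead of the nested loops.
final class DuplicateFieldCheck
{
    static void check(List<ByteBuffer> fieldNames)
    {
        Set<ByteBuffer> seen = new HashSet<>();
        for (ByteBuffer name : fieldNames)
            if (!seen.add(name))            // ByteBuffer equality compares the remaining bytes
                throw new IllegalArgumentException("Duplicate field name "
                        + StandardCharsets.UTF_8.decode(name.duplicate()));
    }

    public static void main(String[] args)
    {
        check(Arrays.asList(ByteBuffer.wrap("street".getBytes(StandardCharsets.UTF_8)),
                            ByteBuffer.wrap("city".getBytes(StandardCharsets.UTF_8))));
        System.out.println("no duplicates");
    }
}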
diff --git a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
index 6c1c6ed..93dd8c5 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
@@ -24,7 +24,9 @@
 
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.utils.Pair;
 
@@ -33,9 +35,9 @@
  */
 public class DeleteStatement extends ModificationStatement
 {
-    private DeleteStatement(StatementType type, CFMetaData cfm, Attributes attrs)
+    private DeleteStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
     {
-        super(type, cfm, attrs);
+        super(type, boundTerms, cfm, attrs);
     }
 
     public boolean requireFullClusteringKey()
@@ -43,69 +45,59 @@
         return false;
     }
 
-    public void addUpdateForKey(ColumnFamily cf, ByteBuffer key, ColumnNameBuilder builder, UpdateParameters params)
+    public void addUpdateForKey(ColumnFamily cf, ByteBuffer key, Composite prefix, UpdateParameters params)
     throws InvalidRequestException
     {
-        CFDefinition cfDef = cfm.getCfDef();
         List<Operation> deletions = getOperations();
 
-        boolean fullKey = builder.componentCount() == cfDef.clusteringColumnsCount();
-        boolean isRange = cfDef.isCompact ? !fullKey : (!fullKey || deletions.isEmpty());
-
-        if (!deletions.isEmpty() && isRange)
+        if (prefix.size() < cfm.clusteringColumns().size() && !deletions.isEmpty())
         {
-            // We only get there if we have at least one non-static columns selected, as otherwise the builder will be
-            // the "static" builder and isRange will be false. But we may still have static columns, so pick the first
-            // non static one for the error message so it's not confusing
+            // In general, we can't delete specific columns if not all clustering columns have been specified.
+            // However, if we only delete static columns, that's fine since we won't really use the prefix anyway.
             for (Operation deletion : deletions)
-                if (cfm.getCfDef().get(deletion.columnName).kind != CFDefinition.Name.Kind.STATIC)
-                    throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s since %s specified", getFirstEmptyKey(), deletion.columnName));
-            throw new AssertionError();
+                if (!deletion.column.isStatic())
+                    throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s since %s specified", getFirstEmptyKey(), deletion.column.name));
         }
 
-        if (deletions.isEmpty() && builder.componentCount() == 0)
+        if (deletions.isEmpty())
         {
-            // No columns specified, delete the row
-            cf.delete(new DeletionInfo(params.timestamp, params.localDeletionTime));
-        }
-        else
-        {
-            if (isRange)
+            // We delete the slice selected by the prefix.
+            // However, for performance reasons, we distinguish two special cases:
+            //   - It's a full internal row delete
+            //   - It's a full cell name (i.e. it's a dense layout and the prefix is full)
+            if (prefix.isEmpty())
             {
-                assert deletions.isEmpty();
-                ByteBuffer start = builder.build();
-                ByteBuffer end = builder.buildAsEndOfRange();
-                cf.addAtom(params.makeRangeTombstone(start, end));
+                // No columns specified, delete the row
+                cf.delete(new DeletionInfo(params.timestamp, params.localDeletionTime));
+            }
+            else if (cfm.comparator.isDense() && prefix.size() == cfm.clusteringColumns().size())
+            {
+                cf.addAtom(params.makeTombstone(cfm.comparator.create(prefix, null)));
             }
             else
             {
-                // Delete specific columns
-                if (cfDef.isCompact)
-                {
-                    ByteBuffer columnName = builder.build();
-                    cf.addColumn(params.makeTombstone(columnName));
-                }
-                else
-                {
-                    for (Operation deletion : deletions)
-                        deletion.execute(key, cf, builder.copy(), params);
-                }
+                cf.addAtom(params.makeRangeTombstone(prefix.slice()));
             }
         }
+        else
+        {
+            for (Operation op : deletions)
+                op.execute(key, cf, prefix, params);
+        }
     }
 
     protected void validateWhereClauseForConditions() throws InvalidRequestException
     {
-        Iterator<CFDefinition.Name> iterator = Iterators.concat(cfm.getCfDef().partitionKeys().iterator(), cfm.getCfDef().clusteringColumns().iterator());
+        Iterator<ColumnDefinition> iterator = Iterators.concat(cfm.partitionKeyColumns().iterator(), cfm.clusteringColumns().iterator());
         while (iterator.hasNext())
         {
-            CFDefinition.Name name = iterator.next();
-            Restriction restriction = processedKeys.get(name.name);
+            ColumnDefinition def = iterator.next();
+            Restriction restriction = processedKeys.get(def.name);
             if (restriction == null || !(restriction.isEQ() || restriction.isIN()))
             {
                 throw new InvalidRequestException(
                         String.format("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order " +
-                                      "to use IF conditions, but column '%s' is not restricted", name.name));
+                                      "to use IF conditions, but column '%s' is not restricted", def.name));
             }
         }
 
@@ -128,22 +120,22 @@
             this.whereClause = whereClause;
         }
 
-        protected ModificationStatement prepareInternal(CFDefinition cfDef, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
         {
-            DeleteStatement stmt = new DeleteStatement(ModificationStatement.StatementType.DELETE, cfDef.cfm, attrs);
+            DeleteStatement stmt = new DeleteStatement(ModificationStatement.StatementType.DELETE, boundNames.size(), cfm, attrs);
 
             for (Operation.RawDeletion deletion : deletions)
             {
-                CFDefinition.Name name = cfDef.get(deletion.affectedColumn());
-                if (name == null)
+                ColumnDefinition def = cfm.getColumnDefinition(deletion.affectedColumn());
+                if (def == null)
                     throw new InvalidRequestException(String.format("Unknown identifier %s", deletion.affectedColumn()));
 
                // For compact tables, we only have one value besides the key, so the only form of DELETE that makes sense is without a column
                // list. However, we support naming the value column for coherence with the static/sparse case.
-                if (name.isPrimaryKeyColumn())
-                    throw new InvalidRequestException(String.format("Invalid identifier %s for deletion (should not be a PRIMARY KEY part)", name));
+                if (def.isPrimaryKeyColumn())
+                    throw new InvalidRequestException(String.format("Invalid identifier %s for deletion (should not be a PRIMARY KEY part)", def.name));
 
-                Operation op = deletion.prepare(name);
+                Operation op = deletion.prepare(cfm.ksName, def);
                 op.collectMarkerSpecification(boundNames);
                 stmt.addOperation(op);
             }
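
Validation of static-only deletions aside, the rewritten addUpdateForKey() above reduces to a small decision tree over the clustering prefix and the list of deleted columns. A standalone sketch of that tree (enum and names are invented; the real code builds the corresponding DeletionInfo, cell tombstone, range tombstone or per-column operations):

// Invented names; only the branching mirrors DeleteStatement.addUpdateForKey() above.
enum DeleteShape { ROW_DELETE, SINGLE_CELL_TOMBSTONE, RANGE_TOMBSTONE, PER_COLUMN_TOMBSTONES }

final class DeleteShapeChooser
{
    static DeleteShape choose(int prefixSize, int clusteringColumns, boolean isDense, boolean hasColumnDeletions)
    {
        if (hasColumnDeletions)
            return DeleteShape.PER_COLUMN_TOMBSTONES;  // DELETE a, b FROM ...
        if (prefixSize == 0)
            return DeleteShape.ROW_DELETE;             // DELETE FROM t WHERE pk = ?
        if (isDense && prefixSize == clusteringColumns)
            return DeleteShape.SINGLE_CELL_TOMBSTONE;  // full cell name on a dense layout
        return DeleteShape.RANGE_TOMBSTONE;            // partial prefix: delete the whole slice
    }

    public static void main(String[] args)
    {
        System.out.println(choose(0, 2, false, false)); // ROW_DELETE
        System.out.println(choose(1, 2, false, false)); // RANGE_TOMBSTONE
    }
}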
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java
index ac5262e..5df8188 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java
@@ -23,9 +23,12 @@
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.KeyspaceNotDefinedException;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.messages.ResultMessage;
 
 public class DropIndexStatement extends SchemaAlteringStatement
@@ -33,10 +36,13 @@
     public final String indexName;
     public final boolean ifExists;
 
-    public DropIndexStatement(String indexName, boolean ifExists)
+    // initialized in announceMigration()
+    private String indexedCF;
+
+    public DropIndexStatement(IndexName indexName, boolean ifExists)
     {
-        super(new CFName());
-        this.indexName = indexName;
+        super(indexName.getCfName());
+        this.indexName = indexName.getIdx();
         this.ifExists = ifExists;
     }
 
@@ -54,20 +60,28 @@
         // validated in findIndexedCf()
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
         // Dropping an index is akin to updating the CF
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 
-    public boolean announceMigration() throws InvalidRequestException, ConfigurationException
+    @Override
+    public ResultMessage execute(QueryState state, QueryOptions options) throws RequestValidationException
+    {
+        announceMigration(false);
+        return indexedCF == null ? null : new ResultMessage.SchemaChange(changeEvent());
+    }
+
+    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
     {
         CFMetaData cfm = findIndexedCF();
         if (cfm == null)
             return false;
 
         CFMetaData updatedCfm = updateCFMetadata(cfm);
-        MigrationManager.announceColumnFamilyUpdate(updatedCfm, false);
+        indexedCF = updatedCfm.cfName;
+        MigrationManager.announceColumnFamilyUpdate(updatedCfm, false, isLocalOnly);
         return true;
     }
 
@@ -75,7 +89,7 @@
     {
         ColumnDefinition column = findIndexedColumn(cfm);
         assert column != null;
-        CFMetaData cloned = cfm.clone();
+        CFMetaData cloned = cfm.copy();
         ColumnDefinition toChange = cloned.getColumnDefinition(column.name);
         assert toChange.getIndexName() != null && toChange.getIndexName().equals(indexName);
         toChange.setIndexName(null);
@@ -86,6 +100,8 @@
     private CFMetaData findIndexedCF() throws InvalidRequestException
     {
         KSMetaData ksm = Schema.instance.getKSMetaData(keyspace());
+        if (ksm == null)
+            throw new KeyspaceNotDefinedException("Keyspace " + keyspace() + " does not exist");
         for (CFMetaData cfm : ksm.cfMetaData().values())
         {
             if (findIndexedColumn(cfm) != null)
@@ -95,7 +111,7 @@
         if (ifExists)
             return null;
         else
-            throw new InvalidRequestException("Index '" + indexName + "' could not be found in any of the column families of keyspace '" + keyspace() + "'");
+            throw new InvalidRequestException("Index '" + indexName + "' could not be found in any of the tables of keyspace '" + keyspace() + '\'');
     }
 
     private ColumnDefinition findIndexedColumn(CFMetaData cfm)
@@ -107,4 +123,11 @@
         }
         return null;
     }
+
+    @Override
+    public String columnFamily()
+    {
+        assert indexedCF != null;
+        return indexedCF;
+    }
 }
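
Because DROP INDEX only names the index, the statement has to discover which table owns it by scanning every column definition in the keyspace, and it now remembers that table so the schema-change event can report it. A standalone sketch of the lookup over plain maps (names invented, illustrative only):

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative lookup mirroring findIndexedCF()/findIndexedColumn() above, over plain maps.
final class IndexOwnerLookup
{
    /** keyspace model: tableName -> (columnName -> indexName); returns the owning table or null. */
    static String findTableForIndex(Map<String, Map<String, String>> keyspace, String indexName)
    {
        for (Map.Entry<String, Map<String, String>> table : keyspace.entrySet())
            for (String idx : table.getValue().values())
                if (indexName.equals(idx))
                    return table.getKey();
        return null;
    }

    public static void main(String[] args)
    {
        Map<String, Map<String, String>> ks = new LinkedHashMap<>();
        ks.put("users", Collections.singletonMap("email", "users_email_idx"));
        System.out.println(findTableForIndex(ks, "users_email_idx")); // users
    }
}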
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java
index 7582af0..ba6b917 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java
@@ -25,7 +25,7 @@
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 public class DropKeyspaceStatement extends SchemaAlteringStatement
 {
@@ -55,11 +55,11 @@
         return keyspace;
     }
 
-    public boolean announceMigration() throws ConfigurationException
+    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException
     {
         try
         {
-            MigrationManager.announceKeyspaceDrop(keyspace);
+            MigrationManager.announceKeyspaceDrop(keyspace, isLocalOnly);
             return true;
         }
         catch(ConfigurationException e)
@@ -70,8 +70,8 @@
         }
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.DROPPED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, keyspace());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java
index 65a3f14..e690c3e4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java
@@ -24,7 +24,7 @@
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 public class DropTableStatement extends SchemaAlteringStatement
 {
@@ -54,11 +54,11 @@
         // validated in announceMigration()
     }
 
-    public boolean announceMigration() throws ConfigurationException
+    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException
     {
         try
         {
-            MigrationManager.announceColumnFamilyDrop(keyspace(), columnFamily());
+            MigrationManager.announceColumnFamilyDrop(keyspace(), columnFamily(), isLocalOnly);
             return true;
         }
         catch (ConfigurationException e)
@@ -69,8 +69,8 @@
         }
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.DROPPED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java
index f0bd637..e3db1e1 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java
@@ -24,12 +24,13 @@
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CFName;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.transport.Event;
 
 public class DropTriggerStatement extends SchemaAlteringStatement
 {
@@ -37,10 +38,13 @@
 
     private final String triggerName;
 
-    public DropTriggerStatement(CFName name, String triggerName)
+    private final boolean ifExists;
+
+    public DropTriggerStatement(CFName name, String triggerName, boolean ifExists)
     {
         super(name);
         this.triggerName = triggerName;
+        this.ifExists = ifExists;
     }
 
     public void checkAccess(ClientState state) throws UnauthorizedException
@@ -53,18 +57,22 @@
         ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
     }
 
-    public boolean announceMigration() throws ConfigurationException
+    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).clone();
-        if (!cfm.removeTrigger(triggerName))
-            throw new ConfigurationException(String.format("Trigger %s was not found", triggerName));
-        logger.info("Dropping trigger with name {}", triggerName);
-        MigrationManager.announceColumnFamilyUpdate(cfm, false);
-        return true;
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
+        if (cfm.removeTrigger(triggerName))
+        {
+            logger.info("Dropping trigger with name {}", triggerName);
+            MigrationManager.announceColumnFamilyUpdate(cfm, false, isLocalOnly);
+            return true;
+        }
+        if (!ifExists)
+            throw new InvalidRequestException(String.format("Trigger %s was not found", triggerName));
+        return false;
     }
 
-    public ResultMessage.SchemaChange.Change changeType()
+    public Event.SchemaChange changeEvent()
     {
-        return ResultMessage.SchemaChange.Change.UPDATED;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java
new file mode 100644
index 0000000..8bcaaf6
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.transport.Event;
+
+public class DropTypeStatement extends SchemaAlteringStatement
+{
+    private final UTName name;
+    private final boolean ifExists;
+
+    public DropTypeStatement(UTName name, boolean ifExists)
+    {
+        super();
+        this.name = name;
+        this.ifExists = ifExists;
+    }
+
+    @Override
+    public void prepareKeyspace(ClientState state) throws InvalidRequestException
+    {
+        if (!name.hasKeyspace())
+            name.setKeyspace(state.getKeyspace());
+    }
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        state.hasKeyspaceAccess(keyspace(), Permission.DROP);
+    }
+
+    public void validate(ClientState state) throws RequestValidationException
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        if (ksm == null)
+            throw new InvalidRequestException(String.format("Cannot drop type in unknown keyspace %s", name.getKeyspace()));
+
+        UserType old = ksm.userTypes.getType(name.getUserTypeName());
+        if (old == null)
+        {
+            if (ifExists)
+                return;
+            else
+                throw new InvalidRequestException(String.format("No user type named %s exists.", name));
+        }
+
+        // We don't want to drop a type while it is still in use (mainly because if
+        // someone drops a type and recreates one with the same name but a different
+        // definition while the previous type is still referenced, things can get messy).
+        // We have two places to check: 1) other user types that may nest the one we
+        // drop, and 2) existing tables referencing the type (possibly in a nested way).
+
+        for (KSMetaData ksm2 : Schema.instance.getKeyspaceDefinitions())
+        {
+            for (UserType ut : ksm2.userTypes.getAllTypes().values())
+            {
+                if (ut.keyspace.equals(name.getKeyspace()) && ut.name.equals(name.getUserTypeName()))
+                    continue;
+                if (isUsedBy(ut))
+                    throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by user type %s", name, ut.asCQL3Type()));
+            }
+
+            for (CFMetaData cfm : ksm2.cfMetaData().values())
+                for (ColumnDefinition def : cfm.allColumns())
+                    if (isUsedBy(def.type))
+                        throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by table %s.%s", name, cfm.ksName, cfm.cfName));
+        }
+    }
+
+    private boolean isUsedBy(AbstractType<?> toCheck) throws RequestValidationException
+    {
+        if (toCheck instanceof UserType)
+        {
+            UserType ut = (UserType)toCheck;
+            if (name.getKeyspace().equals(ut.keyspace) && name.getUserTypeName().equals(ut.name))
+                return true;
+
+            for (AbstractType<?> subtype : ut.fieldTypes())
+                if (isUsedBy(subtype))
+                    return true;
+        }
+        else if (toCheck instanceof CompositeType)
+        {
+            CompositeType ct = (CompositeType)toCheck;
+            for (AbstractType<?> subtype : ct.types)
+                if (isUsedBy(subtype))
+                    return true;
+        }
+        else if (toCheck instanceof ColumnToCollectionType)
+        {
+            for (CollectionType collection : ((ColumnToCollectionType)toCheck).defined.values())
+                if (isUsedBy(collection))
+                    return true;
+        }
+        else if (toCheck instanceof CollectionType)
+        {
+            if (toCheck instanceof ListType)
+                return isUsedBy(((ListType)toCheck).elements);
+            else if (toCheck instanceof SetType)
+                return isUsedBy(((SetType)toCheck).elements);
+            else
+                return isUsedBy(((MapType)toCheck).keys) || isUsedBy(((MapType)toCheck).values);
+        }
+        return false;
+    }
+
+    public Event.SchemaChange changeEvent()
+    {
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
+    }
+
+    @Override
+    public String keyspace()
+    {
+        return name.getKeyspace();
+    }
+
+    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        assert ksm != null;
+
+        UserType toDrop = ksm.userTypes.getType(name.getUserTypeName());
+        // Can be null with ifExists
+        if (toDrop == null)
+            return false;
+
+        MigrationManager.announceTypeDrop(toDrop, isLocalOnly);
+        return true;
+    }
+}
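
The recursive isUsedBy() walk above refuses to drop a user type that is still referenced anywhere, whether nested in another user type, in a composite, or inside a collection. A simplified stand-in over a generic type tree (names invented; illustrative only):

import java.util.Arrays;
import java.util.List;

// Simplified stand-in for the recursive isUsedBy() walk: a type is "used" if the dropped
// type's name appears anywhere in its (possibly nested) structure.
final class TypeUsage
{
    static final class Node
    {
        final String name;           // e.g. "udt:address", "map", "list", "text"
        final List<Node> children;
        Node(String name, Node... children) { this.name = name; this.children = Arrays.asList(children); }
    }

    static boolean uses(Node type, String droppedUdt)
    {
        if (type.name.equals(droppedUdt))
            return true;
        for (Node child : type.children)
            if (uses(child, droppedUdt))
                return true;
        return false;
    }

    public static void main(String[] args)
    {
        Node mapOfAddress = new Node("map", new Node("text"), new Node("udt:address"));
        System.out.println(uses(mapOfAddress, "udt:address")); // true: the drop must be refused
    }
}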
diff --git a/src/java/org/apache/cassandra/utils/DefaultDouble.java b/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java
similarity index 60%
rename from src/java/org/apache/cassandra/utils/DefaultDouble.java
rename to src/java/org/apache/cassandra/cql3/statements/IndexTarget.java
index 236e177..58a8c92 100644
--- a/src/java/org/apache/cassandra/utils/DefaultDouble.java
+++ b/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java
@@ -15,32 +15,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.cql3.statements;
 
+import org.apache.cassandra.cql3.ColumnIdentifier;
 
-public class DefaultDouble
+public class IndexTarget
 {
-    private final double originalValue;
-    private double currentValue;
+    public final ColumnIdentifier column;
+    public final boolean isCollectionKeys;
 
-    public DefaultDouble(double value)
+    private IndexTarget(ColumnIdentifier column, boolean isCollectionKeys)
     {
-        originalValue = value;
-        currentValue = value;
+        this.column = column;
+        this.isCollectionKeys = isCollectionKeys;
     }
 
-    public double value()
+    public static IndexTarget of(ColumnIdentifier c)
     {
-        return currentValue;
+        return new IndexTarget(c, false);
     }
 
-    public void set(double d)
+    public static IndexTarget keysOf(ColumnIdentifier c)
     {
-        currentValue = d;
-    }
-
-    public boolean isModified()
-    {
-        return originalValue != currentValue;
+        return new IndexTarget(c, true);
     }
 }
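
IndexTarget is a small value object with two factory methods, letting the grammar record whether CREATE INDEX targets a column's values (the default) or, for maps, its KEYS(...). A quick usage sketch (illustrative; it would live in a test):

import org.apache.cassandra.cql3.ColumnIdentifier;
import org.apache.cassandra.cql3.statements.IndexTarget;

// Usage sketch for the new IndexTarget factories.
final class IndexTargetExample
{
    public static void main(String[] args)
    {
        ColumnIdentifier favs = new ColumnIdentifier("favs", false);
        IndexTarget values = IndexTarget.of(favs);     // CREATE INDEX ... ON t (favs)
        IndexTarget keys   = IndexTarget.keysOf(favs); // CREATE INDEX ... ON t (KEYS(favs))
        System.out.println(values.isCollectionKeys + " " + keys.isCollectionKeys); // false true
    }
}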
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index adb0084..b8cb818 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -23,14 +23,14 @@
 import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
 import org.github.jamm.MemoryMeter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CBuilder;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.db.marshal.CompositeType;
@@ -51,21 +51,16 @@
 {
     private static final ColumnIdentifier CAS_RESULT_COLUMN = new ColumnIdentifier("[applied]", false);
 
-    private static final Logger logger = LoggerFactory.getLogger(ModificationStatement.class);
-
-    private static boolean loggedCounterTTL = false;
-    private static boolean loggedCounterTimestamp = false;
-
     public static enum StatementType { INSERT, UPDATE, DELETE }
     public final StatementType type;
 
+    private final int boundTerms;
     public final CFMetaData cfm;
     public final Attributes attrs;
 
     protected final Map<ColumnIdentifier, Restriction> processedKeys = new HashMap<>();
     private final List<Operation> columnOperations = new ArrayList<Operation>();
 
-    private int boundTerms;
     // Separating normal and static conditions makes things somewhat easier
     private List<ColumnCondition> columnConditions;
     private List<ColumnCondition> staticConditions;
@@ -77,17 +72,18 @@
     private boolean setsStaticColumns;
     private boolean setsRegularColumns;
 
-    private final Function<ColumnCondition, ColumnIdentifier> getColumnForCondition = new Function<ColumnCondition, ColumnIdentifier>()
+    private final Function<ColumnCondition, ColumnDefinition> getColumnForCondition = new Function<ColumnCondition, ColumnDefinition>()
     {
-        public ColumnIdentifier apply(ColumnCondition cond)
+        public ColumnDefinition apply(ColumnCondition cond)
         {
-            return cond.column.name;
+            return cond.column;
         }
     };
 
-    public ModificationStatement(StatementType type, CFMetaData cfm, Attributes attrs)
+    public ModificationStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
     {
         this.type = type;
+        this.boundTerms = boundTerms;
         this.cfm = cfm;
         this.attrs = attrs;
     }
@@ -103,7 +99,7 @@
     }
 
     public abstract boolean requireFullClusteringKey();
-    public abstract void addUpdateForKey(ColumnFamily updates, ByteBuffer key, ColumnNameBuilder builder, UpdateParameters params) throws InvalidRequestException;
+    public abstract void addUpdateForKey(ColumnFamily updates, ByteBuffer key, Composite prefix, UpdateParameters params) throws InvalidRequestException;
 
     public int getBoundTerms()
     {
@@ -122,12 +118,12 @@
 
     public boolean isCounter()
     {
-        return cfm.getDefaultValidator().isCommutative();
+        return cfm.isCounter();
     }
 
-    public long getTimestamp(long now, List<ByteBuffer> variables) throws InvalidRequestException
+    public long getTimestamp(long now, QueryOptions options) throws InvalidRequestException
     {
-        return attrs.getTimestamp(now, variables);
+        return attrs.getTimestamp(now, options);
     }
 
     public boolean isTimestampSet()
@@ -135,9 +131,9 @@
         return attrs.isTimestampSet();
     }
 
-    public int getTimeToLive(List<ByteBuffer> variables) throws InvalidRequestException
+    public int getTimeToLive(QueryOptions options) throws InvalidRequestException
     {
-        return attrs.getTimeToLive(variables);
+        return attrs.getTimeToLive(options);
     }
 
     public void checkAccess(ClientState state) throws InvalidRequestException, UnauthorizedException
@@ -151,38 +147,19 @@
 
     public void validate(ClientState state) throws InvalidRequestException
     {
-        if (hasConditions())
-        {
-            if (attrs.isTimestampSet())
-                throw new InvalidRequestException("Cannot provide custom timestamp for conditional update");
+        if (hasConditions() && attrs.isTimestampSet())
+            throw new InvalidRequestException("Cannot provide custom timestamp for conditional updates");
 
-            if (requiresRead())
-                throw new InvalidRequestException("Operations on lists requiring a read (setting by index and deletions by index or value) are not allowed with IF conditions");
-        }
+        if (isCounter() && attrs.isTimestampSet())
+            throw new InvalidRequestException("Cannot provide custom timestamp for counter updates");
 
-        if (isCounter())
-        {
-            if (attrs.isTimestampSet() && !loggedCounterTimestamp)
-            {
-                logger.warn("Detected use of 'USING TIMESTAMP' in a counter UPDATE. This is invalid " +
-                            "because counters do not use timestamps, and the timestamp has been ignored. " +
-                            "Such queries will be rejected in Cassandra 2.1+ - please fix your queries before then.");
-                loggedCounterTimestamp = true;
-            }
-
-            if (attrs.isTimeToLiveSet() && !loggedCounterTTL)
-            {
-                logger.warn("Detected use of 'USING TTL' in a counter UPDATE. This is invalid " +
-                            "because counter tables do not support TTL, and the TTL value has been ignored. " +
-                            "Such queries will be rejected in Cassandra 2.1+ - please fix your queries before then.");
-                loggedCounterTTL = true;
-            }
-        }
+        if (isCounter() && attrs.isTimeToLiveSet())
+            throw new InvalidRequestException("Cannot provide custom TTL for counter updates");
     }
 
     public void addOperation(Operation op)
     {
-        if (op.isStatic(cfm))
+        if (op.column.isStatic())
             setsStaticColumns = true;
         else
             setsRegularColumns = true;
@@ -194,19 +171,19 @@
         return columnOperations;
     }
 
-    public Iterable<ColumnIdentifier> getColumnsWithConditions()
+    public Iterable<ColumnDefinition> getColumnsWithConditions()
     {
         if (ifNotExists || ifExists)
             return null;
 
-        return Iterables.concat(columnConditions == null ? Collections.<ColumnIdentifier>emptyList() : Iterables.transform(columnConditions, getColumnForCondition),
-                                staticConditions == null ? Collections.<ColumnIdentifier>emptyList() : Iterables.transform(staticConditions, getColumnForCondition));
+        return Iterables.concat(columnConditions == null ? Collections.<ColumnDefinition>emptyList() : Iterables.transform(columnConditions, getColumnForCondition),
+                                staticConditions == null ? Collections.<ColumnDefinition>emptyList() : Iterables.transform(staticConditions, getColumnForCondition));
     }
 
     public void addCondition(ColumnCondition cond) throws InvalidRequestException
     {
         List<ColumnCondition> conds = null;
-        if (cond.column.kind == CFDefinition.Name.Kind.STATIC)
+        if (cond.column.isStatic())
         {
             setsStaticColumns = true;
             if (staticConditions == null)
@@ -243,25 +220,24 @@
         return ifExists;
     }
 
-    private void addKeyValues(CFDefinition.Name name, Restriction values) throws InvalidRequestException
+    private void addKeyValues(ColumnDefinition def, Restriction values) throws InvalidRequestException
     {
-        if (name.kind == CFDefinition.Name.Kind.COLUMN_ALIAS)
+        if (def.kind == ColumnDefinition.Kind.CLUSTERING_COLUMN)
             hasNoClusteringColumns = false;
-        if (processedKeys.put(name.name, values) != null)
-            throw new InvalidRequestException(String.format("Multiple definitions found for PRIMARY KEY part %s", name.name));
+        if (processedKeys.put(def.name, values) != null)
+            throw new InvalidRequestException(String.format("Multiple definitions found for PRIMARY KEY part %s", def.name));
     }
 
-    public void addKeyValue(CFDefinition.Name name, Term value) throws InvalidRequestException
+    public void addKeyValue(ColumnDefinition def, Term value) throws InvalidRequestException
     {
-        addKeyValues(name, new SingleColumnRestriction.EQ(value, false));
+        addKeyValues(def, new SingleColumnRestriction.EQ(value, false));
     }
 
     public void processWhereClause(List<Relation> whereClause, VariableSpecifications names) throws InvalidRequestException
     {
-        CFDefinition cfDef = cfm.getCfDef();
         for (Relation relation : whereClause)
         {
-            if (!(relation instanceof SingleColumnRelation))
+            if (relation.isMultiColumn())
             {
                 throw new InvalidRequestException(
                         String.format("Multi-column relations cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", relation));
@@ -271,27 +247,27 @@
             if (rel.onToken)
                 throw new InvalidRequestException(String.format("The token function cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", relation));
 
-            CFDefinition.Name name = cfDef.get(rel.getEntity());
-            if (name == null)
+            ColumnDefinition def = cfm.getColumnDefinition(rel.getEntity());
+            if (def == null)
                 throw new InvalidRequestException(String.format("Unknown key identifier %s", rel.getEntity()));
 
-            switch (name.kind)
+            switch (def.kind)
             {
-                case KEY_ALIAS:
-                case COLUMN_ALIAS:
+                case PARTITION_KEY:
+                case CLUSTERING_COLUMN:
                     Restriction restriction;
 
                     if (rel.operator() == Relation.Type.EQ)
                     {
-                        Term t = rel.getValue().prepare(name);
+                        Term t = rel.getValue().prepare(keyspace(), def);
                         t.collectMarkerSpecification(names);
                         restriction = new SingleColumnRestriction.EQ(t, false);
                     }
-                    else if (name.kind == CFDefinition.Name.Kind.KEY_ALIAS && rel.operator() == Relation.Type.IN)
+                    else if (def.kind == ColumnDefinition.Kind.PARTITION_KEY && rel.operator() == Relation.Type.IN)
                     {
                         if (rel.getValue() != null)
                         {
-                            Term t = rel.getValue().prepare(name);
+                            Term t = rel.getValue().prepare(keyspace(), def);
                             t.collectMarkerSpecification(names);
                             restriction = new SingleColumnRestriction.InWithMarker((Lists.Marker)t);
                         }
@@ -300,7 +276,7 @@
                             List<Term> values = new ArrayList<Term>(rel.getInValues().size());
                             for (Term.Raw raw : rel.getInValues())
                             {
-                                Term t = raw.prepare(name);
+                                Term t = raw.prepare(keyspace(), def);
                                 t.collectMarkerSpecification(names);
                                 values.add(t);
                             }
@@ -309,40 +285,37 @@
                     }
                     else
                     {
-                        throw new InvalidRequestException(String.format("Invalid operator %s for PRIMARY KEY part %s", rel.operator(), name));
+                        throw new InvalidRequestException(String.format("Invalid operator %s for PRIMARY KEY part %s", rel.operator(), def.name));
                     }
 
-                    addKeyValues(name, restriction);
+                    addKeyValues(def, restriction);
                     break;
-                case VALUE_ALIAS:
-                case COLUMN_METADATA:
-                case STATIC:
-                    throw new InvalidRequestException(String.format("Non PRIMARY KEY %s found in where clause", name));
+                default:
+                    throw new InvalidRequestException(String.format("Non PRIMARY KEY %s found in where clause", def.name));
             }
         }
     }
 
-    public List<ByteBuffer> buildPartitionKeyNames(List<ByteBuffer> variables)
+    public List<ByteBuffer> buildPartitionKeyNames(QueryOptions options)
     throws InvalidRequestException
     {
-        CFDefinition cfDef = cfm.getCfDef();
-        ColumnNameBuilder keyBuilder = cfDef.getKeyNameBuilder();
+        CBuilder keyBuilder = cfm.getKeyValidatorAsCType().builder();
         List<ByteBuffer> keys = new ArrayList<ByteBuffer>();
-        for (CFDefinition.Name name : cfDef.partitionKeys())
+        for (ColumnDefinition def : cfm.partitionKeyColumns())
         {
-            Restriction r = processedKeys.get(name.name);
+            Restriction r = processedKeys.get(def.name);
             if (r == null)
-                throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s", name));
+                throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s", def.name));
 
-            List<ByteBuffer> values = r.values(variables);
+            List<ByteBuffer> values = r.values(options);
 
             if (keyBuilder.remainingCount() == 1)
             {
                 for (ByteBuffer val : values)
                 {
                     if (val == null)
-                        throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", name));
-                    ByteBuffer key = keyBuilder.copy().add(val).build();
+                        throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", def.name));
+                    ByteBuffer key = keyBuilder.buildWith(val).toByteBuffer();
                     ThriftValidation.validateKey(cfm, key);
                     keys.add(key);
                 }
@@ -353,14 +326,14 @@
                     throw new InvalidRequestException("IN is only supported on the last column of the partition key");
                 ByteBuffer val = values.get(0);
                 if (val == null)
-                    throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", name));
+                    throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", def.name));
                 keyBuilder.add(val);
             }
         }
         return keys;
     }
 
-    public ColumnNameBuilder createClusteringPrefixBuilder(List<ByteBuffer> variables)
+    public Composite createClusteringPrefix(QueryOptions options)
     throws InvalidRequestException
     {
         // If the only updated/deleted columns are static, then we don't need clustering columns.
@@ -377,106 +350,81 @@
         {
             // If we set no non-static columns, then it's fine not to have clustering columns
             if (hasNoClusteringColumns)
-                return cfm.getStaticColumnNameBuilder();
+                return cfm.comparator.staticPrefix();
 
             // If we do have clustering columns however, then either it's an INSERT and the query is valid
             // but we still need to build a proper prefix, or it's not an INSERT, and then we want to reject
             // (see above)
             if (type != StatementType.INSERT)
             {
-                for (CFDefinition.Name name : cfm.getCfDef().clusteringColumns())
-                    if (processedKeys.get(name.name) != null)
-                        throw new InvalidRequestException(String.format("Invalid restriction on clustering column %s since the %s statement modifies only static columns", name.name, type));
+                for (ColumnDefinition def : cfm.clusteringColumns())
+                    if (processedKeys.get(def.name) != null)
+                        throw new InvalidRequestException(String.format("Invalid restriction on clustering column %s since the %s statement modifies only static columns", def.name, type));
                 // we shouldn't get here, as that would contradict hasNoClusteringColumns == false
                 throw new AssertionError();
             }
         }
 
-        return createClusteringPrefixBuilderInternal(variables);
+        return createClusteringPrefixBuilderInternal(options);
     }
 
-    private ColumnNameBuilder updatePrefixFor(ByteBuffer name, ColumnNameBuilder prefix)
-    {
-        return isStatic(name) ? cfm.getStaticColumnNameBuilder() : prefix;
-    }
-
-    public boolean isStatic(ByteBuffer name)
-    {
-        ColumnDefinition def = cfm.getColumnDefinition(name);
-        return def != null && def.type == ColumnDefinition.Type.STATIC;
-    }
-
-    private ColumnNameBuilder createClusteringPrefixBuilderInternal(List<ByteBuffer> variables)
+    private Composite createClusteringPrefixBuilderInternal(QueryOptions options)
     throws InvalidRequestException
     {
-        CFDefinition cfDef = cfm.getCfDef();
-        ColumnNameBuilder builder = cfDef.getColumnNameBuilder();
-        CFDefinition.Name firstEmptyKey = null;
-        for (CFDefinition.Name name : cfDef.clusteringColumns())
+        CBuilder builder = cfm.comparator.prefixBuilder();
+        ColumnDefinition firstEmptyKey = null;
+        for (ColumnDefinition def : cfm.clusteringColumns())
         {
-            Restriction r = processedKeys.get(name.name);
+            Restriction r = processedKeys.get(def.name);
             if (r == null)
             {
-                firstEmptyKey = name;
-                if (requireFullClusteringKey() && cfDef.isComposite && !cfDef.isCompact)
-                    throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s", name));
+                firstEmptyKey = def;
+                if (requireFullClusteringKey() && !cfm.comparator.isDense() && cfm.comparator.isCompound())
+                    throw new InvalidRequestException(String.format("Missing mandatory PRIMARY KEY part %s", def.name));
             }
             else if (firstEmptyKey != null)
             {
-                throw new InvalidRequestException(String.format("Missing PRIMARY KEY part %s since %s is set", firstEmptyKey.name, name.name));
+                throw new InvalidRequestException(String.format("Missing PRIMARY KEY part %s since %s is set", firstEmptyKey.name, def.name));
             }
             else
             {
-                List<ByteBuffer> values = r.values(variables);
+                List<ByteBuffer> values = r.values(options);
                 assert values.size() == 1; // We only allow IN for row keys so far
                 ByteBuffer val = values.get(0);
                 if (val == null)
-                    throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", name));
+                    throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", def.name));
                 builder.add(val);
             }
         }
-        return builder;
+        return builder.build();
     }
 
-    protected CFDefinition.Name getFirstEmptyKey()
+    protected ColumnDefinition getFirstEmptyKey()
     {
-        for (CFDefinition.Name name : cfm.getCfDef().clusteringColumns())
+        for (ColumnDefinition def : cfm.clusteringColumns())
         {
-            if (processedKeys.get(name.name) == null)
-                return name;
+            if (processedKeys.get(def.name) == null)
+                return def;
         }
         return null;
     }
 
     public boolean requiresRead()
     {
+        // List SET operations incur a read.
         for (Operation op : columnOperations)
             if (op.requiresRead())
                 return true;
+
         return false;
     }
 
-    protected Map<ByteBuffer, ColumnGroupMap> readRequiredRows(Collection<ByteBuffer> partitionKeys, ColumnNameBuilder clusteringPrefix, boolean local, ConsistencyLevel cl)
+    protected Map<ByteBuffer, CQL3Row> readRequiredRows(Collection<ByteBuffer> partitionKeys, Composite clusteringPrefix, boolean local, ConsistencyLevel cl)
     throws RequestExecutionException, RequestValidationException
     {
-        // Lists SET operation incurs a read.
-        Set<ByteBuffer> toRead = null;
-        for (Operation op : columnOperations)
-        {
-            if (op.requiresRead())
-            {
-                if (toRead == null)
-                    toRead = new TreeSet<ByteBuffer>(UTF8Type.instance);
-                toRead.add(op.columnName.key);
-            }
-        }
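+        // Nothing to read unless some operation actually needs the current row.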
+        if (!requiresRead())
+            return null;
 
-        return toRead == null ? null : readRows(partitionKeys, clusteringPrefix, toRead, (CompositeType)cfm.comparator, local, cl);
-    }
-
-    private Map<ByteBuffer, ColumnGroupMap> readRows(Collection<ByteBuffer> partitionKeys, ColumnNameBuilder clusteringPrefix, Set<ByteBuffer> toRead, CompositeType composite, boolean local, ConsistencyLevel cl)
-    throws RequestExecutionException, RequestValidationException
-    {
         try
         {
             cl.validateForRead(keyspace());
@@ -486,16 +434,7 @@
             throw new InvalidRequestException(String.format("Write operations require a read but consistency %s is not supported on reads", cl));
         }
 
-        ColumnSlice[] slices = new ColumnSlice[toRead.size()];
-        int i = 0;
-        for (ByteBuffer name : toRead)
-        {
-            ColumnNameBuilder prefix = updatePrefixFor(name, clusteringPrefix);
-            ByteBuffer start = prefix.copy().add(name).build();
-            ByteBuffer finish = prefix.copy().add(name).buildAsEndOfRange();
-            slices[i++] = new ColumnSlice(start, finish);
-        }
-
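+        // A single slice over the clustering prefix reads the whole CQL row, replacing the old per-column slices.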
+        ColumnSlice[] slices = new ColumnSlice[]{ clusteringPrefix.slice() };
         List<ReadCommand> commands = new ArrayList<ReadCommand>(partitionKeys.size());
         long now = System.currentTimeMillis();
         for (ByteBuffer key : partitionKeys)
@@ -509,20 +448,19 @@
                        ? SelectStatement.readLocally(keyspace(), commands)
                        : StorageProxy.read(commands, cl);
 
-        Map<ByteBuffer, ColumnGroupMap> map = new HashMap<ByteBuffer, ColumnGroupMap>();
+        Map<ByteBuffer, CQL3Row> map = new HashMap<ByteBuffer, CQL3Row>();
         for (Row row : rows)
         {
-            if (row.cf == null || row.cf.getColumnCount() == 0)
+            if (row.cf == null || row.cf.isEmpty())
                 continue;
 
-            ColumnGroupMap.Builder groupBuilder = new ColumnGroupMap.Builder(composite, true, now);
-            for (Column column : row.cf)
-                groupBuilder.add(column);
-
-            List<ColumnGroupMap> groups = groupBuilder.groups();
-            assert groups.isEmpty() || groups.size() == 1;
-            if (!groups.isEmpty())
-                map.put(row.key.key, groups.get(0));
+            Iterator<CQL3Row> iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(row.cf.getSortedColumns().iterator());
+            if (iter.hasNext())
+            {
+                map.put(row.key.getKey(), iter.next());
+                // We can only update one CQL3Row per partition key at a time (we don't allow IN for clustering keys)
+                assert !iter.hasNext();
+            }
         }
         return map;
     }
@@ -558,7 +496,7 @@
         else
             cl.validateForWrite(cfm.ksName);
 
-        Collection<? extends IMutation> mutations = getMutations(options.getValues(), false, cl, queryState.getTimestamp());
+        Collection<? extends IMutation> mutations = getMutations(options, false, options.getTimestamp(queryState));
         if (!mutations.isEmpty())
             StorageProxy.mutateWithTriggers(mutations, cl, false);
 
@@ -568,61 +506,56 @@
     public ResultMessage executeWithCondition(QueryState queryState, QueryOptions options)
     throws RequestExecutionException, RequestValidationException
     {
-        List<ByteBuffer> variables = options.getValues();
-        List<ByteBuffer> keys = buildPartitionKeyNames(variables);
+        List<ByteBuffer> keys = buildPartitionKeyNames(options);
         // We don't support IN for CAS operations so far
         if (keys.size() > 1)
             throw new InvalidRequestException("IN on the partition key is not supported with conditional updates");
 
         ByteBuffer key = keys.get(0);
+        long now = options.getTimestamp(queryState);
+        Composite prefix = createClusteringPrefix(options);
 
-        CQL3CasConditions conditions = new CQL3CasConditions(cfm, queryState.getTimestamp());
-        ColumnNameBuilder prefix = createClusteringPrefixBuilder(variables);
-        ColumnFamily updates = UnsortedColumns.factory.create(cfm);
-        addUpdatesAndConditions(key, prefix, updates, conditions, variables, getTimestamp(queryState.getTimestamp(), variables));
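+        // Conditions and the row update are now collected into a single CQL3CasRequest handed to StorageProxy.cas().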
+        CQL3CasRequest request = new CQL3CasRequest(cfm, key, false);
+        addConditions(prefix, request, options);
+        request.addRowUpdate(prefix, this, options, now);
 
         ColumnFamily result = StorageProxy.cas(keyspace(),
                                                columnFamily(),
                                                key,
-                                               conditions,
-                                               updates,
+                                               request,
                                                options.getSerialConsistency(),
                                                options.getConsistency());
-        return new ResultMessage.Rows(buildCasResultSet(key, result));
+        return new ResultMessage.Rows(buildCasResultSet(key, result, options));
     }
 
-    public void addUpdatesAndConditions(ByteBuffer key, ColumnNameBuilder clusteringPrefix, ColumnFamily updates, CQL3CasConditions conditions, List<ByteBuffer> variables, long now)
-    throws InvalidRequestException
+    public void addConditions(Composite clusteringPrefix, CQL3CasRequest request, QueryOptions options) throws InvalidRequestException
     {
-        UpdateParameters updParams = new UpdateParameters(cfm, variables, now, getTimeToLive(variables), null);
-        addUpdateForKey(updates, key, clusteringPrefix, updParams);
-
         if (ifNotExists)
         {
            // If we use ifNotExists and the statement applies to any non-static columns, then the condition is on the row of the non-static
-            // columns and the prefix should be the rowPrefix. But if only static columns are set, then the ifNotExists apply to the existence
+            // columns and the prefix should be the clusteringPrefix. But if only static columns are set, then the ifNotExists applies to the existence
             // of any static columns and we should use the prefix for the "static part" of the partition.
-            conditions.addNotExist(clusteringPrefix);
+            request.addNotExist(clusteringPrefix);
         }
         else if (ifExists)
         {
-            conditions.addExist(clusteringPrefix);
+            request.addExist(clusteringPrefix);
         }
         else
         {
             if (columnConditions != null)
-                conditions.addConditions(clusteringPrefix, columnConditions, variables);
+                request.addConditions(clusteringPrefix, columnConditions, options);
             if (staticConditions != null)
-                conditions.addConditions(cfm.getStaticColumnNameBuilder(), staticConditions, variables);
+                request.addConditions(cfm.comparator.staticPrefix(), staticConditions, options);
         }
     }
 
-    private ResultSet buildCasResultSet(ByteBuffer key, ColumnFamily cf) throws InvalidRequestException
+    private ResultSet buildCasResultSet(ByteBuffer key, ColumnFamily cf, QueryOptions options) throws InvalidRequestException
     {
-        return buildCasResultSet(keyspace(), key, columnFamily(), cf, getColumnsWithConditions(), false);
+        return buildCasResultSet(keyspace(), key, columnFamily(), cf, getColumnsWithConditions(), false, options);
     }
 
-    public static ResultSet buildCasResultSet(String ksName, ByteBuffer key, String cfName, ColumnFamily cf, Iterable<ColumnIdentifier> columnsWithConditions, boolean isBatch)
+    public static ResultSet buildCasResultSet(String ksName, ByteBuffer key, String cfName, ColumnFamily cf, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
     throws InvalidRequestException
     {
         boolean success = cf == null;
@@ -632,7 +565,7 @@
         List<List<ByteBuffer>> rows = Collections.singletonList(Collections.singletonList(BooleanType.instance.decompose(success)));
 
         ResultSet rs = new ResultSet(metadata, rows);
-        return success ? rs : merge(rs, buildCasFailureResultSet(key, cf, columnsWithConditions, isBatch));
+        return success ? rs : merge(rs, buildCasFailureResultSet(key, cf, columnsWithConditions, isBatch, options));
     }
 
     private static ResultSet merge(ResultSet left, ResultSet right)
@@ -658,36 +591,35 @@
         return new ResultSet(new ResultSet.Metadata(specs), rows);
     }
 
-    private static ResultSet buildCasFailureResultSet(ByteBuffer key, ColumnFamily cf, Iterable<ColumnIdentifier> columnsWithConditions, boolean isBatch)
+    private static ResultSet buildCasFailureResultSet(ByteBuffer key, ColumnFamily cf, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
     throws InvalidRequestException
     {
-        CFDefinition cfDef = cf.metadata().getCfDef();
-
+        CFMetaData cfm = cf.metadata();
         Selection selection;
         if (columnsWithConditions == null)
         {
-            selection = Selection.wildcard(cfDef);
+            selection = Selection.wildcard(cfm);
         }
         else
         {
             // We can have multiple conditions on the same columns (for collections) so use a set
             // to avoid duplicates, but preserve the order so that it follows the order of IF in the query in general
-            Set<CFDefinition.Name> names = new LinkedHashSet<CFDefinition.Name>();
+            Set<ColumnDefinition> defs = new LinkedHashSet<>();
             // Adding the partition key for batches to disambiguate if the conditions span multiple rows (we don't add them outside
             // of batches for compatibility's sake).
             if (isBatch)
             {
-                names.addAll(cfDef.partitionKeys());
-                names.addAll(cfDef.clusteringColumns());
+                defs.addAll(cfm.partitionKeyColumns());
+                defs.addAll(cfm.clusteringColumns());
             }
-            for (ColumnIdentifier id : columnsWithConditions)
-                names.add(cfDef.get(id));
-            selection = Selection.forColumns(names);
+            for (ColumnDefinition def : columnsWithConditions)
+                defs.add(def);
+            selection = Selection.forColumns(defs);
         }
 
         long now = System.currentTimeMillis();
         Selection.ResultSetBuilder builder = selection.resultSetBuilder(now);
-        SelectStatement.forSelection(cfDef, selection).processColumnFamily(key, cf, Collections.<ByteBuffer>emptyList(), now, builder);
+        SelectStatement.forSelection(cfm, selection).processColumnFamily(key, cf, options, now, builder);
 
         return builder.build();
     }
@@ -697,54 +629,57 @@
         if (hasConditions())
             throw new UnsupportedOperationException();
 
-        List<ByteBuffer> variables = options.getValues();
-        for (IMutation mutation : getMutations(variables, true, null, queryState.getTimestamp()))
-            mutation.apply();
+        for (IMutation mutation : getMutations(options, true, queryState.getTimestamp()))
+        {
+            // We don't use counters internally.
+            assert mutation instanceof Mutation;
+
+            ((Mutation) mutation).apply();
+        }
         return null;
     }
 
     /**
      * Convert statement into a list of mutations to apply on the server
      *
-     * @param variables value for prepared statement markers
+     * @param options the query options, including values for any prepared statement markers
      * @param local if true, any requests (for collections) performed by getMutation should be done locally only.
-     * @param cl the consistency to use for the potential reads involved in generating the mutations (for lists set/delete operations)
      * @param now the current timestamp in microseconds to use if no timestamp is user provided.
      *
      * @return list of the mutations
      * @throws InvalidRequestException on invalid requests
      */
-    private Collection<? extends IMutation> getMutations(List<ByteBuffer> variables, boolean local, ConsistencyLevel cl, long now)
+    private Collection<? extends IMutation> getMutations(QueryOptions options, boolean local, long now)
     throws RequestExecutionException, RequestValidationException
     {
-        List<ByteBuffer> keys = buildPartitionKeyNames(variables);
-        ColumnNameBuilder clusteringPrefix = createClusteringPrefixBuilder(variables);
+        List<ByteBuffer> keys = buildPartitionKeyNames(options);
+        Composite clusteringPrefix = createClusteringPrefix(options);
 
-        UpdateParameters params = makeUpdateParameters(keys, clusteringPrefix, variables, local, cl, now);
+        UpdateParameters params = makeUpdateParameters(keys, clusteringPrefix, options, local, now);
 
-        Collection<IMutation> mutations = new ArrayList<IMutation>();
+        Collection<IMutation> mutations = new ArrayList<IMutation>(keys.size());
         for (ByteBuffer key: keys)
         {
             ThriftValidation.validateKey(cfm, key);
-            ColumnFamily cf = UnsortedColumns.factory.create(cfm);
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfm);
             addUpdateForKey(cf, key, clusteringPrefix, params);
-            RowMutation rm = new RowMutation(cfm.ksName, key, cf);
-            mutations.add(isCounter() ? new CounterMutation(rm, cl) : rm);
+            Mutation mut = new Mutation(cfm.ksName, key, cf);
+
+            mutations.add(isCounter() ? new CounterMutation(mut, options.getConsistency()) : mut);
         }
         return mutations;
     }
 
     public UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
-                                                 ColumnNameBuilder prefix,
-                                                 List<ByteBuffer> variables,
+                                                 Composite prefix,
+                                                 QueryOptions options,
                                                  boolean local,
-                                                 ConsistencyLevel cl,
                                                  long now)
     throws RequestExecutionException, RequestValidationException
     {
         // Some list operations require reading
-        Map<ByteBuffer, ColumnGroupMap> rows = readRequiredRows(keys, prefix, local, cl);
-        return new UpdateParameters(cfm, variables, getTimestamp(now, variables), getTimeToLive(variables), rows);
+        Map<ByteBuffer, CQL3Row> rows = readRequiredRows(keys, prefix, local, options.getConsistency());
+        return new UpdateParameters(cfm, options, getTimestamp(now, options), getTimeToLive(options), rows);
     }
 
     /**
@@ -783,22 +718,20 @@
         public ModificationStatement prepare(VariableSpecifications boundNames) throws InvalidRequestException
         {
             CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
-            CFDefinition cfDef = metadata.getCfDef();
-
-            // The collected count in the beginning of preparation.
-            // Will start at non-zero for statements nested inside a BatchStatement (the second and the further ones).
-            int collected = boundNames.getCollectedCount();
 
             Attributes preparedAttributes = attrs.prepare(keyspace(), columnFamily());
             preparedAttributes.collectMarkerSpecification(boundNames);
 
-            ModificationStatement stmt = prepareInternal(cfDef, boundNames, preparedAttributes);
+            ModificationStatement stmt = prepareInternal(metadata, boundNames, preparedAttributes);
 
             if (ifNotExists || ifExists || !conditions.isEmpty())
             {
                 if (stmt.isCounter())
                     throw new InvalidRequestException("Conditional updates are not supported on counter tables");
 
+                if (attrs.timestamp != null)
+                    throw new InvalidRequestException("Cannot provide custom timestamp for conditional updates");
+
                 if (ifNotExists)
                 {
                     // To have both 'IF NOT EXISTS' and some other conditions doesn't make sense.
@@ -817,21 +750,19 @@
                 {
                     for (Pair<ColumnIdentifier, ColumnCondition.Raw> entry : conditions)
                     {
-                        CFDefinition.Name name = cfDef.get(entry.left);
-                        if (name == null)
+                        ColumnDefinition def = metadata.getColumnDefinition(entry.left);
+                        if (def == null)
                             throw new InvalidRequestException(String.format("Unknown identifier %s", entry.left));
 
-                        ColumnCondition condition = entry.right.prepare(name);
+                        ColumnCondition condition = entry.right.prepare(keyspace(), def);
                         condition.collectMarkerSpecification(boundNames);
 
-                        switch (name.kind)
+                        switch (def.kind)
                         {
-                            case KEY_ALIAS:
-                            case COLUMN_ALIAS:
+                            case PARTITION_KEY:
+                            case CLUSTERING_COLUMN:
                                 throw new InvalidRequestException(String.format("PRIMARY KEY column '%s' cannot have IF conditions", entry.left));
-                            case VALUE_ALIAS:
-                            case COLUMN_METADATA:
-                            case STATIC:
+                            default:
                                 stmt.addCondition(condition);
                                 break;
                         }
@@ -840,11 +771,9 @@
 
                 stmt.validateWhereClauseForConditions();
             }
-
-            stmt.boundTerms = boundNames.getCollectedCount() - collected;
             return stmt;
         }
 
-        protected abstract ModificationStatement prepareInternal(CFDefinition cfDef, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException;
+        protected abstract ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException;
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/MultiColumnRestriction.java b/src/java/org/apache/cassandra/cql3/statements/MultiColumnRestriction.java
index f643684..96cb905 100644
--- a/src/java/org/apache/cassandra/cql3/statements/MultiColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/statements/MultiColumnRestriction.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.cql3.statements;
 
 import org.apache.cassandra.cql3.AbstractMarker;
+import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.Term;
 import org.apache.cassandra.cql3.Tuples;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -40,16 +41,16 @@
             return true;
         }
 
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
-            Tuples.Value t = (Tuples.Value)value.bind(variables);
+            Tuples.Value t = (Tuples.Value)value.bind(options);
             return t.getElements();
         }
     }
 
     public interface IN extends MultiColumnRestriction
     {
-        public List<List<ByteBuffer>> splitValues(List<ByteBuffer> variables) throws InvalidRequestException;
+        public List<List<ByteBuffer>> splitValues(QueryOptions options) throws InvalidRequestException;
     }
 
     /**
@@ -68,12 +69,12 @@
             return true;
         }
 
-        public List<List<ByteBuffer>> splitValues(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<List<ByteBuffer>> splitValues(QueryOptions options) throws InvalidRequestException
         {
             List<List<ByteBuffer>> buffers = new ArrayList<>(values.size());
             for (Term value : values)
             {
-                Term.MultiItemTerminal term = (Term.MultiItemTerminal)value.bind(variables);
+                Term.MultiItemTerminal term = (Term.MultiItemTerminal)value.bind(options);
                 buffers.add(term.getElements());
             }
             return buffers;
@@ -96,9 +97,10 @@
             return true;
         }
 
-        public List<List<ByteBuffer>> splitValues(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<List<ByteBuffer>> splitValues(QueryOptions options) throws InvalidRequestException
         {
-            Tuples.InValue inValue = ((Tuples.InMarker) marker).bind(variables);
+            Tuples.InMarker inMarker = (Tuples.InMarker)marker;
+            Tuples.InValue inValue = inMarker.bind(options);
             if (inValue == null)
                 throw new InvalidRequestException("Invalid null value for IN restriction");
             return inValue.getSplitValues();
@@ -117,7 +119,7 @@
             return true;
         }
 
-        public ByteBuffer bound(Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+        public ByteBuffer bound(Bound b, QueryOptions options) throws InvalidRequestException
         {
             throw new UnsupportedOperationException("Multicolumn slice restrictions do not support bound()");
         }
@@ -126,9 +128,9 @@
          * Similar to bounds(), but returns one ByteBuffer per-component in the bound instead of a single
          * ByteBuffer to represent the entire bound.
          */
-        public List<ByteBuffer> componentBounds(Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+        public List<ByteBuffer> componentBounds(Bound b, QueryOptions options) throws InvalidRequestException
         {
-            Tuples.Value value = (Tuples.Value)bounds[b.idx].bind(variables);
+            Tuples.Value value = (Tuples.Value)bounds[b.idx].bind(options);
             return value.getElements();
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/Restriction.java b/src/java/org/apache/cassandra/cql3/statements/Restriction.java
index 3d33bde..c529a38 100644
--- a/src/java/org/apache/cassandra/cql3/statements/Restriction.java
+++ b/src/java/org/apache/cassandra/cql3/statements/Restriction.java
@@ -21,7 +21,7 @@
 import java.util.List;
 
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.thrift.IndexOperator;
+import org.apache.cassandra.db.IndexExpression;
 import org.apache.cassandra.cql3.*;
 
 /**
@@ -35,10 +35,11 @@
     public boolean isSlice();
     public boolean isEQ();
     public boolean isIN();
+    public boolean isContains();
     public boolean isMultiColumn();
 
-    // Only supported for EQ and IN, but it's convenient to have here
-    public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException;
+    // Not supported by Slice, but it's convenient to have here
+    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException;
 
     public static interface EQ extends Restriction {}
 
@@ -49,20 +50,20 @@
 
     public static interface Slice extends Restriction
     {
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException;
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException;
 
         /** Returns true if the start or end bound (depending on the argument) is set, false otherwise */
         public boolean hasBound(Bound b);
 
-        public ByteBuffer bound(Bound b, List<ByteBuffer> variables) throws InvalidRequestException;
+        public ByteBuffer bound(Bound b, QueryOptions options) throws InvalidRequestException;
 
         /** Returns true if the start or end bound (depending on the argument) is inclusive, false otherwise */
         public boolean isInclusive(Bound b);
 
         public Relation.Type getRelation(Bound eocBound, Bound inclusiveBound);
 
-        public IndexOperator getIndexOperator(Bound b);
+        public IndexExpression.Operator getIndexOperator(Bound b);
 
-        public void setBound(Relation.Type type, Term t) throws InvalidRequestException;
+        public void setBound(ColumnIdentifier name, Relation.Type type, Term t) throws InvalidRequestException;
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java b/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java
index 845d8cc..8882871 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java
@@ -23,6 +23,7 @@
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.messages.ResultMessage;
 
 /**
@@ -62,7 +63,7 @@
         return new Prepared(this);
     }
 
-    public abstract ResultMessage.SchemaChange.Change changeType();
+    public abstract Event.SchemaChange changeEvent();
 
     /**
      * Announces the migration to other nodes in the cluster.
@@ -70,23 +71,26 @@
      * is used, for example)
      * @throws RequestValidationException
      */
-    public abstract boolean announceMigration() throws RequestValidationException;
+    public abstract boolean announceMigration(boolean isLocalOnly) throws RequestValidationException;
 
     public ResultMessage execute(QueryState state, QueryOptions options) throws RequestValidationException
     {
         // If an IF [NOT] EXISTS clause was used, this may not result in an actual schema change.  To avoid doing
         // extra work in the drivers to handle schema changes, we return an empty message in this case. (CASSANDRA-7600)
-        boolean didChangeSchema = announceMigration();
-        if (!didChangeSchema)
-            return new ResultMessage.Void();
-
-        String tableName = cfName == null || columnFamily() == null ? "" : columnFamily();
-        return new ResultMessage.SchemaChange(changeType(), keyspace(), tableName);
+        boolean didChangeSchema = announceMigration(false);
+        return didChangeSchema ? new ResultMessage.SchemaChange(changeEvent()) : new ResultMessage.Void();
     }
 
     public ResultMessage executeInternal(QueryState state, QueryOptions options)
     {
-        // executeInternal is for local query only, thus altering schema is not supported
-        throw new UnsupportedOperationException();
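+        // Local (internal) execution now performs the schema change as well; the checked
+        // RequestValidationException is wrapped since executeInternal does not declare it.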
+        try
+        {
+            boolean didChangeSchema = announceMigration(true);
+            return didChangeSchema ? new ResultMessage.SchemaChange(changeEvent()) : new ResultMessage.Void();
+        }
+        catch (RequestValidationException e)
+        {
+            throw new RuntimeException(e);
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index aadd0bd..a8c9d44 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -23,6 +23,7 @@
 import com.google.common.base.Joiner;
 import com.google.common.base.Objects;
 import com.google.common.base.Predicate;
+import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Iterators;
 
@@ -30,30 +31,26 @@
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.cql3.CFDefinition.Name;
-import org.apache.cassandra.cql3.CFDefinition.Name.Kind;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.pager.*;
 import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,7 +66,7 @@
     private static final int DEFAULT_COUNT_PAGE_SIZE = 10000;
 
     private final int boundTerms;
-    public final CFDefinition cfDef;
+    public final CFMetaData cfm;
     public final Parameters parameters;
     private final Selection selection;
     private final Term limit;
@@ -81,10 +78,10 @@
     private final Restriction[] columnRestrictions;
 
     /** Restrictions on non-primary key columns (i.e. secondary index restrictions) */
-    private final Map<CFDefinition.Name, Restriction> metadataRestrictions = new HashMap<CFDefinition.Name, Restriction>();
+    private final Map<ColumnIdentifier, Restriction> metadataRestrictions = new HashMap<ColumnIdentifier, Restriction>();
 
-    // The name of all restricted names not covered by the key or index filter
-    private final Set<CFDefinition.Name> restrictedNames = new HashSet<CFDefinition.Name>();
+    // All restricted columns not covered by the key or index filter
+    private final Set<ColumnDefinition> restrictedColumns = new HashSet<ColumnDefinition>();
     private Restriction.Slice sliceRestriction;
 
     private boolean isReversed;
@@ -93,7 +90,7 @@
     private boolean keyIsInRelation;
     private boolean usesSecondaryIndexing;
 
-    private Map<CFDefinition.Name, Integer> orderingIndexes;
+    private Map<ColumnIdentifier, Integer> orderingIndexes;
 
     private boolean selectsStaticColumns;
     private boolean selectsOnlyStaticColumns;
@@ -101,21 +98,21 @@
     // Used by forSelection below
     private static final Parameters defaultParameters = new Parameters(Collections.<ColumnIdentifier, Boolean>emptyMap(), false, false, null, false);
 
-    private static final Predicate<CFDefinition.Name> isStaticFilter = new Predicate<CFDefinition.Name>()
+    private static final Predicate<ColumnDefinition> isStaticFilter = new Predicate<ColumnDefinition>()
     {
-        public boolean apply(CFDefinition.Name name)
+        public boolean apply(ColumnDefinition def)
         {
-            return name.kind == CFDefinition.Name.Kind.STATIC;
+            return def.isStatic();
         }
     };
 
-    public SelectStatement(CFDefinition cfDef, int boundTerms, Parameters parameters, Selection selection, Term limit)
+    public SelectStatement(CFMetaData cfm, int boundTerms, Parameters parameters, Selection selection, Term limit)
     {
-        this.cfDef = cfDef;
+        this.cfm = cfm;
         this.boundTerms = boundTerms;
         this.selection = selection;
-        this.keyRestrictions = new Restriction[cfDef.partitionKeyCount()];
-        this.columnRestrictions = new Restriction[cfDef.clusteringColumnsCount()];
+        this.keyRestrictions = new Restriction[cfm.partitionKeyColumns().size()];
+        this.columnRestrictions = new Restriction[cfm.clusteringColumns().size()];
         this.parameters = parameters;
         this.limit = limit;
 
@@ -125,7 +122,7 @@
 
     private void initStaticColumnsInfo()
     {
-        if (!cfDef.cfm.hasStaticColumns())
+        if (!cfm.hasStaticColumns())
             return;
 
         // If it's a wildcard, we do select static columns, but not only them
@@ -138,9 +135,9 @@
         // Otherwise, check the selected columns
         selectsStaticColumns = !Iterables.isEmpty(Iterables.filter(selection.getColumns(), isStaticFilter));
         selectsOnlyStaticColumns = true;
-        for (CFDefinition.Name name : selection.getColumns())
+        for (ColumnDefinition def : selection.getColumns())
         {
-            if (name.kind != CFDefinition.Name.Kind.KEY_ALIAS && name.kind != CFDefinition.Name.Kind.STATIC)
+            if (def.kind != ColumnDefinition.Kind.PARTITION_KEY && def.kind != ColumnDefinition.Kind.STATIC)
             {
                 selectsOnlyStaticColumns = false;
                 break;
@@ -151,9 +148,9 @@
     // Creates a simple select based on the given selection.
     // Note that the resulting select statement should not be used for actual queries, but only for processing already
     // queried data through processColumnFamily.
-    static SelectStatement forSelection(CFDefinition cfDef, Selection selection)
+    static SelectStatement forSelection(CFMetaData cfm, Selection selection)
     {
-        return new SelectStatement(cfDef, 0, defaultParameters, selection, null);
+        return new SelectStatement(cfm, 0, defaultParameters, selection, null);
     }
 
     public ResultSet.Metadata getResultMetadata()
@@ -172,7 +169,7 @@
              + meter.measureDeep(keyRestrictions)
              + meter.measureDeep(columnRestrictions)
              + meter.measureDeep(metadataRestrictions)
-             + meter.measureDeep(restrictedNames)
+             + meter.measureDeep(restrictedColumns)
              + (sliceRestriction == null ? 0 : meter.measureDeep(sliceRestriction))
              + (orderingIndexes == null ? 0 : meter.measureDeep(orderingIndexes));
     }
@@ -195,42 +192,31 @@
     public ResultMessage.Rows execute(QueryState state, QueryOptions options) throws RequestExecutionException, RequestValidationException
     {
         ConsistencyLevel cl = options.getConsistency();
-        List<ByteBuffer> variables = options.getValues();
         if (cl == null)
             throw new InvalidRequestException("Invalid empty consistency level");
 
         cl.validateForRead(keyspace());
 
-        int limit = getLimit(variables);
-        int limitForQuery = updateLimitForQuery(limit);
+        int limit = getLimit(options);
         long now = System.currentTimeMillis();
-        Pageable command;
-        if (isKeyRange || usesSecondaryIndexing)
-        {
-            command = getRangeCommand(variables, limitForQuery, now);
-        }
-        else
-        {
-            List<ReadCommand> commands = getSliceCommands(variables, limitForQuery, now);
-            command = commands == null ? null : new Pageable.ReadCommands(commands);
-        }
+        Pageable command = getPageableCommand(options, limit, now);
 
         int pageSize = options.getPageSize();
         // A count query will never be paged for the user, but we always page it internally to avoid OOM.
         // If the user provided a pageSize we'll use that to page internally (because why not), otherwise we use our default
         // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-        if (parameters.isCount && pageSize <= 0 && MessagingService.instance().allNodesAtLeast20)
+        if (parameters.isCount && pageSize <= 0)
             pageSize = DEFAULT_COUNT_PAGE_SIZE;
 
         if (pageSize <= 0 || command == null || !QueryPagers.mayNeedPaging(command, pageSize))
         {
-            return execute(command, cl, variables, limit, now);
+            return execute(command, options, limit, now);
         }
         else
         {
             QueryPager pager = QueryPagers.pager(command, cl, options.getPagingState());
             if (parameters.isCount)
-                return pageCountQuery(pager, variables, pageSize, now, limit);
+                return pageCountQuery(pager, options, pageSize, now, limit);
 
             // We can't properly do post-query ordering if we page (see #6722)
             if (needsPostQueryOrdering())
@@ -238,14 +224,31 @@
                                                 + "ORDER BY or the IN and sort client side, or disable paging for this query");
 
             List<Row> page = pager.fetchPage(pageSize);
-            ResultMessage.Rows msg = processResults(page, variables, limit, now);
+            ResultMessage.Rows msg = processResults(page, options, limit, now);
+
             if (!pager.isExhausted())
                 msg.result.metadata.setHasMorePages(pager.state());
+
             return msg;
         }
     }
 
-    private ResultMessage.Rows execute(Pageable command, ConsistencyLevel cl, List<ByteBuffer> variables, int limit, long now) throws RequestValidationException, RequestExecutionException
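+    /**
+     * Builds the command to page over for this statement: a single RangeSliceCommand for key-range or
+     * secondary index queries, or one ReadCommand per selected partition key otherwise. Returns null when
+     * the query cannot match anything (e.g. an empty IN on the partition key).
+     */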
+    private Pageable getPageableCommand(QueryOptions options, int limit, long now) throws RequestValidationException
+    {
+        int limitForQuery = updateLimitForQuery(limit);
+        if (isKeyRange || usesSecondaryIndexing)
+            return getRangeCommand(options, limitForQuery, now);
+
+        List<ReadCommand> commands = getSliceCommands(options, limitForQuery, now);
+        return commands == null ? null : new Pageable.ReadCommands(commands);
+    }
+
+    public Pageable getPageableCommand(QueryOptions options) throws RequestValidationException
+    {
+        return getPageableCommand(options, getLimit(options), System.currentTimeMillis());
+    }
+
+    private ResultMessage.Rows execute(Pageable command, QueryOptions options, int limit, long now) throws RequestValidationException, RequestExecutionException
     {
         List<Row> rows;
         if (command == null)
@@ -255,21 +258,21 @@
         else
         {
             rows = command instanceof Pageable.ReadCommands
-                 ? StorageProxy.read(((Pageable.ReadCommands)command).commands, cl)
-                 : StorageProxy.getRangeSlice((RangeSliceCommand)command, cl);
+                 ? StorageProxy.read(((Pageable.ReadCommands)command).commands, options.getConsistency())
+                 : StorageProxy.getRangeSlice((RangeSliceCommand)command, options.getConsistency());
         }
 
-        return processResults(rows, variables, limit, now);
+        return processResults(rows, options, limit, now);
     }
 
-    private ResultMessage.Rows pageCountQuery(QueryPager pager, List<ByteBuffer> variables, int pageSize, long now, int limit) throws RequestValidationException, RequestExecutionException
+    private ResultMessage.Rows pageCountQuery(QueryPager pager, QueryOptions options, int pageSize, long now, int limit) throws RequestValidationException, RequestExecutionException
     {
         int count = 0;
         while (!pager.isExhausted())
         {
             int maxLimit = pager.maxRemaining();
             logger.debug("New maxLimit for paged count query is {}", maxLimit);
-            ResultSet rset = process(pager.fetchPage(pageSize), variables, maxLimit, now);
+            ResultSet rset = process(pager.fetchPage(pageSize), options, maxLimit, now);
             count += rset.rows.size();
         }
 
@@ -279,10 +282,10 @@
         return new ResultMessage.Rows(result);
     }
 
-    public ResultMessage.Rows processResults(List<Row> rows, List<ByteBuffer> variables, int limit, long now) throws RequestValidationException
+    public ResultMessage.Rows processResults(List<Row> rows, QueryOptions options, int limit, long now) throws RequestValidationException
     {
         // Even for count, we need to process the result as it'll group some columns together in sparse column families
-        ResultSet rset = process(rows, variables, limit, now);
+        ResultSet rset = process(rows, options, limit, now);
         rset = parameters.isCount ? rset.makeCountResult(parameters.countAlias) : rset;
         return new ResultMessage.Rows(rset);
     }
@@ -298,50 +301,44 @@
 
     public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options) throws RequestExecutionException, RequestValidationException
     {
-        List<ByteBuffer> variables = options.getValues();
-        int limit = getLimit(variables);
-        int limitForQuery = updateLimitForQuery(limit);
+        int limit = getLimit(options);
         long now = System.currentTimeMillis();
-        List<Row> rows;
-        if (isKeyRange || usesSecondaryIndexing)
-        {
-            RangeSliceCommand command = getRangeCommand(variables, limitForQuery, now);
-            rows = command == null ? Collections.<Row>emptyList() : command.executeLocally();
-        }
-        else
-        {
-            List<ReadCommand> commands = getSliceCommands(variables, limitForQuery, now);
-            rows = commands == null ? Collections.<Row>emptyList() : readLocally(keyspace(), commands);
-        }
+        Pageable command = getPageableCommand(options, limit, now);
+        List<Row> rows = command == null
+                       ? Collections.<Row>emptyList()
+                       : (command instanceof Pageable.ReadCommands
+                          ? readLocally(keyspace(), ((Pageable.ReadCommands)command).commands)
+                          : ((RangeSliceCommand)command).executeLocally());
 
-        return processResults(rows, variables, limit, now);
+        return processResults(rows, options, limit, now);
     }
 
     public ResultSet process(List<Row> rows) throws InvalidRequestException
     {
         assert !parameters.isCount; // not yet needed
-        return process(rows, Collections.<ByteBuffer>emptyList(), getLimit(Collections.<ByteBuffer>emptyList()), System.currentTimeMillis());
+        QueryOptions options = QueryOptions.DEFAULT;
+        return process(rows, options, getLimit(options), System.currentTimeMillis());
     }
 
     public String keyspace()
     {
-        return cfDef.cfm.ksName;
+        return cfm.ksName;
     }
 
     public String columnFamily()
     {
-        return cfDef.cfm.cfName;
+        return cfm.cfName;
     }
 
-    private List<ReadCommand> getSliceCommands(List<ByteBuffer> variables, int limit, long now) throws RequestValidationException
+    private List<ReadCommand> getSliceCommands(QueryOptions options, int limit, long now) throws RequestValidationException
     {
-        Collection<ByteBuffer> keys = getKeys(variables);
+        Collection<ByteBuffer> keys = getKeys(options);
         if (keys.isEmpty()) // in case of IN () for (the last column of) the partition key.
             return null;
 
         List<ReadCommand> commands = new ArrayList<>(keys.size());
 
-        IDiskAtomFilter filter = makeFilter(variables, limit);
+        IDiskAtomFilter filter = makeFilter(options, limit);
         if (filter == null)
             return null;
 
@@ -353,35 +350,35 @@
             // We should not share the slice filter amongst the commands (hence the cloneShallow), because
             // SliceQueryFilter is not immutable due to the columnCounter used by its lastCounted() method
             // (this is fairly ugly and we should change it, but that's probably not a trivial refactor to do cleanly)
-            commands.add(ReadCommand.create(keyspace(), key, columnFamily(), now, filter.cloneShallow()));
+            commands.add(ReadCommand.create(keyspace(), ByteBufferUtil.clone(key), columnFamily(), now, filter.cloneShallow()));
         }
 
         return commands;
     }
 
-    private RangeSliceCommand getRangeCommand(List<ByteBuffer> variables, int limit, long now) throws RequestValidationException
+    private RangeSliceCommand getRangeCommand(QueryOptions options, int limit, long now) throws RequestValidationException
     {
-        IDiskAtomFilter filter = makeFilter(variables, limit);
+        IDiskAtomFilter filter = makeFilter(options, limit);
         if (filter == null)
             return null;
 
-        List<IndexExpression> expressions = getIndexExpressions(variables);
+        List<IndexExpression> expressions = getValidatedIndexExpressions(options);
         // The LIMIT provided by the user is the number of CQL rows they want returned.
         // We want getRangeSlice to count the number of columns, not the number of keys.
-        AbstractBounds<RowPosition> keyBounds = getKeyBounds(variables);
+        AbstractBounds<RowPosition> keyBounds = getKeyBounds(options);
         return keyBounds == null
              ? null
              : new RangeSliceCommand(keyspace(), columnFamily(), now,  filter, keyBounds, expressions, limit, !parameters.isDistinct, false);
     }
 
-    private AbstractBounds<RowPosition> getKeyBounds(List<ByteBuffer> variables) throws InvalidRequestException
+    private AbstractBounds<RowPosition> getKeyBounds(QueryOptions options) throws InvalidRequestException
     {
         IPartitioner<?> p = StorageService.getPartitioner();
 
         if (onToken)
         {
-            Token startToken = getTokenBound(Bound.START, variables, p);
-            Token endToken = getTokenBound(Bound.END, variables, p);
+            Token startToken = getTokenBound(Bound.START, options, p);
+            Token endToken = getTokenBound(Bound.END, options, p);
 
             boolean includeStart = includeKeyBound(Bound.START);
             boolean includeEnd = includeKeyBound(Bound.END);
@@ -407,11 +404,11 @@
         }
         else
         {
-            ByteBuffer startKeyBytes = getKeyBound(Bound.START, variables);
-            ByteBuffer finishKeyBytes = getKeyBound(Bound.END, variables);
+            ByteBuffer startKeyBytes = getKeyBound(Bound.START, options);
+            ByteBuffer finishKeyBytes = getKeyBound(Bound.END, options);
 
-            RowPosition startKey = RowPosition.forKey(startKeyBytes, p);
-            RowPosition finishKey = RowPosition.forKey(finishKeyBytes, p);
+            RowPosition startKey = RowPosition.ForKey.get(startKeyBytes, p);
+            RowPosition finishKey = RowPosition.ForKey.get(finishKeyBytes, p);
 
             if (startKey.compareTo(finishKey) > 0 && !finishKey.isMinimum(p))
                 return null;
@@ -433,18 +430,17 @@
 
     private ColumnSlice makeStaticSlice()
     {
-        ColumnNameBuilder staticPrefix = cfDef.cfm.getStaticColumnNameBuilder();
-        // Note: we could use staticPrefix.build() for the start bound, but EMPTY_BYTE_BUFFER gives us the
+        // Note: we could use staticPrefix.start() for the start bound, but EMPTY gives us the
         // same effect while saving a few CPU cycles.
         return isReversed
-             ? new ColumnSlice(staticPrefix.buildAsEndOfRange(), ByteBufferUtil.EMPTY_BYTE_BUFFER)
-             : new ColumnSlice(ByteBufferUtil.EMPTY_BYTE_BUFFER, staticPrefix.buildAsEndOfRange());
+             ? new ColumnSlice(cfm.comparator.staticPrefix().end(), Composites.EMPTY)
+             : new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end());
     }
 
-    private IDiskAtomFilter makeFilter(List<ByteBuffer> variables, int limit)
+    private IDiskAtomFilter makeFilter(QueryOptions options, int limit)
     throws InvalidRequestException
     {
-        int toGroup = cfDef.isCompact ? -1 : cfDef.clusteringColumnsCount();
+        int toGroup = cfm.comparator.isDense() ? -1 : cfm.clusteringColumns().size();
         if (parameters.isDistinct)
         {
             // For distinct, we only care about fetching the beginning of each partition. If we don't have
@@ -454,12 +450,8 @@
         }
         else if (isColumnRange())
         {
-            // For sparse, we used to ask for 'defined columns' * 'asked limit' (where defined columns includes the row marker)
-            // to account for the grouping of columns.
-            // Since that doesn't work for maps/sets/lists, we now use the compositesToGroup option of SliceQueryFilter.
-            // But we must preserve backward compatibility too (for mixed version cluster that is).
-            List<ByteBuffer> startBounds = getRequestedBound(Bound.START, variables);
-            List<ByteBuffer> endBounds = getRequestedBound(Bound.END, variables);
+            List<Composite> startBounds = getRequestedBound(Bound.START, options);
+            List<Composite> endBounds = getRequestedBound(Bound.END, options);
             assert startBounds.size() == endBounds.size();
 
             // Handles fetching static columns. Note that for 2i, the filter is just used to restrict
@@ -473,18 +465,18 @@
             if (startBounds.size() == 1)
             {
                 ColumnSlice slice = new ColumnSlice(startBounds.get(0), endBounds.get(0));
-                if (slice.isAlwaysEmpty(cfDef.cfm.comparator, isReversed))
+                if (slice.isAlwaysEmpty(cfm.comparator, isReversed))
                     return staticSlice == null ? null : sliceFilter(staticSlice, limit, toGroup);
 
                 if (staticSlice == null)
                     return sliceFilter(slice, limit, toGroup);
 
                 if (isReversed)
-                    return slice.includes(cfDef.cfm.comparator.reverseComparator, staticSlice.start)
+                    return slice.includes(cfm.comparator.reverseComparator(), staticSlice.start)
                             ? sliceFilter(new ColumnSlice(slice.start, staticSlice.finish), limit, toGroup)
                             : sliceFilter(new ColumnSlice[]{ slice, staticSlice }, limit, toGroup);
                 else
-                    return slice.includes(cfDef.cfm.comparator, staticSlice.finish)
+                    return slice.includes(cfm.comparator, staticSlice.finish)
                             ? sliceFilter(new ColumnSlice(staticSlice.start, slice.finish), limit, toGroup)
                             : sliceFilter(new ColumnSlice[]{ staticSlice, slice }, limit, toGroup);
             }
@@ -493,7 +485,7 @@
             for (int i = 0; i < startBounds.size(); i++)
             {
                 ColumnSlice slice = new ColumnSlice(startBounds.get(i), endBounds.get(i));
-                if (!slice.isAlwaysEmpty(cfDef.cfm.comparator, isReversed))
+                if (!slice.isAlwaysEmpty(cfm.comparator, isReversed))
                     l.add(slice);
             }
 
@@ -502,16 +494,15 @@
             if (staticSlice == null)
                 return sliceFilter(l.toArray(new ColumnSlice[l.size()]), limit, toGroup);
 
-            // The slices should not overlap. We know the slices built from startBounds/endBounds don't, but
-            // if there is a static slice, it could overlap with the 2nd slice. Check for it and correct if
-            // that's the case
+            // The slices should not overlap. We know the slices built from startBounds/endBounds don't, but if there is
+            // a static slice, it could overlap with the 2nd slice. Check for it and correct if that's the case
             ColumnSlice[] slices;
             if (isReversed)
             {
-                if (l.get(l.size() - 1).includes(cfDef.cfm.comparator.reverseComparator, staticSlice.start))
+                if (l.get(l.size() - 1).includes(cfm.comparator.reverseComparator(), staticSlice.start))
                 {
                     slices = l.toArray(new ColumnSlice[l.size()]);
-                    slices[slices.length-1] = new ColumnSlice(slices[slices.length-1].start, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+                    slices[slices.length-1] = new ColumnSlice(slices[slices.length-1].start, Composites.EMPTY);
                 }
                 else
                 {
@@ -521,10 +512,10 @@
             }
             else
             {
-                if (l.get(0).includes(cfDef.cfm.comparator, staticSlice.finish))
+                if (l.get(0).includes(cfm.comparator, staticSlice.finish))
                 {
                     slices = new ColumnSlice[l.size()];
-                    slices[0] = new ColumnSlice(ByteBufferUtil.EMPTY_BYTE_BUFFER, l.get(0).finish);
+                    slices[0] = new ColumnSlice(Composites.EMPTY, l.get(0).finish);
                     for (int i = 1; i < l.size(); i++)
                         slices[i] = l.get(i);
                 }
@@ -540,10 +531,10 @@
         }
         else
         {
-            SortedSet<ByteBuffer> cellNames = getRequestedColumns(variables);
+            SortedSet<CellName> cellNames = getRequestedColumns(options);
             if (cellNames == null) // in case of IN () for the last column of the key
                 return null;
-            QueryProcessor.validateCellNames(cellNames);
+            QueryProcessor.validateCellNames(cellNames, cfm.comparator);
             return new NamesQueryFilter(cellNames, true);
         }
     }
@@ -555,15 +546,16 @@
 
     private SliceQueryFilter sliceFilter(ColumnSlice[] slices, int limit, int toGroup)
     {
+        assert ColumnSlice.validateSlices(slices, cfm.comparator, isReversed) : "Invalid slices: " + Arrays.toString(slices) + (isReversed ? " (reversed)" : "");
         return new SliceQueryFilter(slices, isReversed, limit, toGroup);
     }
 
-    private int getLimit(List<ByteBuffer> variables) throws InvalidRequestException
+    private int getLimit(QueryOptions options) throws InvalidRequestException
     {
         int l = Integer.MAX_VALUE;
         if (limit != null)
         {
-            ByteBuffer b = limit.bindAndGet(variables);
+            ByteBuffer b = limit.bindAndGet(options);
             if (b == null)
                 throw new InvalidRequestException("Invalid null value of limit");
 
@@ -593,24 +585,24 @@
              : limit;
     }
 
-    private Collection<ByteBuffer> getKeys(final List<ByteBuffer> variables) throws InvalidRequestException
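+    /**
+     * Builds the partition keys selected by this statement from the partition key restrictions. Only the
+     * last partition key component may carry a multi-valued IN restriction; each of its values completes a key.
+     */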
+    private Collection<ByteBuffer> getKeys(final QueryOptions options) throws InvalidRequestException
     {
         List<ByteBuffer> keys = new ArrayList<ByteBuffer>();
-        ColumnNameBuilder builder = cfDef.getKeyNameBuilder();
-        for (CFDefinition.Name name : cfDef.partitionKeys())
+        CBuilder builder = cfm.getKeyValidatorAsCType().builder();
+        for (ColumnDefinition def : cfm.partitionKeyColumns())
         {
-            Restriction r = keyRestrictions[name.position];
+            Restriction r = keyRestrictions[def.position()];
             assert r != null && !r.isSlice();
 
-            List<ByteBuffer> values = r.values(variables);
+            List<ByteBuffer> values = r.values(options);
 
             if (builder.remainingCount() == 1)
             {
                 for (ByteBuffer val : values)
                 {
                     if (val == null)
-                        throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", name));
-                    keys.add(builder.copy().add(val).build());
+                        throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", def.name));
+                    keys.add(builder.buildWith(val).toByteBuffer());
                 }
             }
             else
@@ -620,14 +612,14 @@
                     throw new InvalidRequestException("IN is only supported on the last column of the partition key");
                 ByteBuffer val = values.get(0);
                 if (val == null)
-                    throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", name));
+                    throw new InvalidRequestException(String.format("Invalid null value for partition key part %s", def.name));
                 builder.add(val);
             }
         }
         return keys;
     }
 
-    private ByteBuffer getKeyBound(Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+    private ByteBuffer getKeyBound(Bound b, QueryOptions options) throws InvalidRequestException
     {
         // Deal with unrestricted partition key components (special-casing is required to deal with 2i queries on the first
         // component of a composite partition key).
@@ -636,10 +628,10 @@
                 return ByteBufferUtil.EMPTY_BYTE_BUFFER;
 
         // We deal with IN queries for keys in other places, so we know buildBound will return only one result
-        return buildBound(b, cfDef.partitionKeys(), keyRestrictions, false, cfDef.getKeyNameBuilder(), variables).get(0);
+        return buildBound(b, cfm.partitionKeyColumns(), keyRestrictions, false, cfm.getKeyValidatorAsCType(), options).get(0).toByteBuffer();
     }
 
-    private Token getTokenBound(Bound b, List<ByteBuffer> variables, IPartitioner<?> p) throws InvalidRequestException
+    private Token getTokenBound(Bound b, QueryOptions options, IPartitioner<?> p) throws InvalidRequestException
     {
         assert onToken;
 
@@ -651,7 +643,7 @@
         ByteBuffer value;
         if (keyRestriction.isEQ())
         {
-            value = keyRestriction.values(variables).get(0);
+            value = keyRestriction.values(options).get(0);
         }
         else
         {
@@ -659,7 +651,7 @@
             if (!slice.hasBound(b))
                 return p.getMinimumToken();
 
-            value = slice.bound(b, variables);
+            value = slice.bound(b, options);
         }
 
         if (value == null)
@@ -685,10 +677,10 @@
 
     private boolean isColumnRange()
     {
-        // Due to CASSANDRA-5762, we always do a slice for CQL3 tables (not compact, composite).
-        // Static CF (non compact but non composite) never entails a column slice however
-        if (!cfDef.isCompact)
-            return cfDef.isComposite;
+        // Due to CASSANDRA-5762, we always do a slice for CQL3 tables (not dense, composite).
+        // Static CF (non dense but non composite) never entails a column slice however
+        if (!cfm.comparator.isDense())
+            return cfm.comparator.isCompound();
 
         // Otherwise (i.e. for compact tables where we don't have a row marker anyway and thus don't care about CASSANDRA-5762),
         // it is a range query if there is at least one column alias for which no relation is defined or whose relation is not EQ.
@@ -700,25 +692,25 @@
         return false;
     }
 
-    private SortedSet<ByteBuffer> getRequestedColumns(List<ByteBuffer> variables) throws InvalidRequestException
+    private SortedSet<CellName> getRequestedColumns(QueryOptions options) throws InvalidRequestException
     {
         // Note: getRequestedColumns doesn't handle static columns, but due to CASSANDRA-5762
         // we always do a slice for CQL3 tables, so it's ok to ignore them here
         assert !isColumnRange();
 
-        ColumnNameBuilder builder = cfDef.getColumnNameBuilder();
-        Iterator<CFDefinition.Name> idIter = cfDef.clusteringColumns().iterator();
+        CBuilder builder = cfm.comparator.prefixBuilder();
+        Iterator<ColumnDefinition> idIter = cfm.clusteringColumns().iterator();
         for (Restriction r : columnRestrictions)
         {
-            CFDefinition.Name name = idIter.next();
+            ColumnDefinition def = idIter.next();
             assert r != null && !r.isSlice();
 
-            List<ByteBuffer> values = r.values(variables);
+            List<ByteBuffer> values = r.values(options);
             if (values.size() == 1)
             {
                 ByteBuffer val = values.get(0);
                 if (val == null)
-                    throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", name.name));
+                    throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", def.name));
                 builder.add(val);
             }
             else
@@ -728,32 +720,29 @@
                 // for each value of the IN, creates all the columns corresponding to the selection.
                 if (values.isEmpty())
                     return null;
-                SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(cfDef.cfm.comparator);
+                SortedSet<CellName> columns = new TreeSet<CellName>(cfm.comparator);
                 Iterator<ByteBuffer> iter = values.iterator();
                 while (iter.hasNext())
                 {
                     ByteBuffer val = iter.next();
-                    ColumnNameBuilder b = iter.hasNext() ? builder.copy() : builder;
                     if (val == null)
-                        throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", name.name));
-                    b.add(val);
-                    if (cfDef.isCompact)
-                        columns.add(b.build());
-                    else
-                        columns.addAll(addSelectedColumns(b));
+                        throw new InvalidRequestException(String.format("Invalid null value for clustering key part %s", def.name));
+
+                    Composite prefix = builder.buildWith(val);
+                    columns.addAll(addSelectedColumns(prefix));
                 }
                 return columns;
             }
         }
 
-        return addSelectedColumns(builder);
+        return addSelectedColumns(builder.build());
     }
 
-    private SortedSet<ByteBuffer> addSelectedColumns(ColumnNameBuilder builder)
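+    /**
+     * Expands a clustering prefix into the concrete cell names to fetch: a single cell for dense (compact)
+     * tables; otherwise the row marker plus the selected regular and static cells, except for non-compound
+     * ("static" CF) and super tables, which have no row marker and for which all regular cells are fetched.
+     */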
+    private SortedSet<CellName> addSelectedColumns(Composite prefix)
     {
-        if (cfDef.isCompact)
+        if (cfm.comparator.isDense())
         {
-            return FBUtilities.singleton(builder.build(), cfDef.cfm.comparator);
+            return FBUtilities.singleton(cfm.comparator.create(prefix, null), cfm.comparator);
         }
         else
         {
@@ -761,32 +750,27 @@
             // non-known set of columns, so we shouldn't get here
             assert !selectACollection();
 
-            SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(cfDef.cfm.comparator);
+            SortedSet<CellName> columns = new TreeSet<CellName>(cfm.comparator);
 
             // We need to query the selected columns as well as the marker
             // column (for the case where the row exists but has no columns outside the PK).
             // Two exceptions are "static CF" (non-composite, non-compact CF) and "super CF",
             // which don't have a marker and for which we must query all columns instead.
-            if (cfDef.isComposite && !cfDef.cfm.isSuper())
+            if (cfm.comparator.isCompound() && !cfm.isSuper())
             {
                 // marker
-                columns.add(builder.copy().add(ByteBufferUtil.EMPTY_BYTE_BUFFER).build());
+                columns.add(cfm.comparator.rowMarker(prefix));
 
                 // selected columns
-                for (ColumnIdentifier id : selection.regularAndStaticColumnsToFetch())
-                    columns.add(builder.copy().add(id.key).build());
+                for (ColumnDefinition def : selection.getColumns())
+                    if (def.kind == ColumnDefinition.Kind.REGULAR || def.kind == ColumnDefinition.Kind.STATIC)
+                        columns.add(cfm.comparator.create(prefix, def));
             }
             else
             {
                 // We know that we're not composite, so we can ignore static columns
-                Iterator<CFDefinition.Name> iter = cfDef.regularColumns().iterator();
-                while (iter.hasNext())
-                {
-                    ColumnIdentifier name = iter.next().name;
-                    ColumnNameBuilder b = iter.hasNext() ? builder.copy() : builder;
-                    ByteBuffer cname = b.add(name.key).build();
-                    columns.add(cname);
-                }
+                for (ColumnDefinition def : cfm.regularColumns())
+                    columns.add(cfm.comparator.create(prefix, def));
             }
             return columns;
         }
@@ -794,38 +778,39 @@
 
     private boolean selectACollection()
     {
-        if (!cfDef.hasCollections)
+        if (!cfm.comparator.hasCollections())
             return false;
 
-        for (CFDefinition.Name name : selection.getColumns())
+        for (ColumnDefinition def : selection.getColumns())
         {
-            if (name.type instanceof CollectionType)
+            if (def.type instanceof CollectionType)
                 return true;
         }
 
         return false;
     }
 
-    private List<ByteBuffer> buildBound(Bound bound,
-                                        Collection<CFDefinition.Name> names,
-                                        Restriction[] restrictions,
-                                        boolean isReversed,
-                                        ColumnNameBuilder builder,
-                                        List<ByteBuffer> variables) throws InvalidRequestException
+    private static List<Composite> buildBound(Bound bound,
+                                              List<ColumnDefinition> defs,
+                                              Restriction[] restrictions,
+                                              boolean isReversed,
+                                              CType type,
+                                              QueryOptions options) throws InvalidRequestException
     {
+        CBuilder builder = type.builder();
 
         // check the first restriction to see if we're dealing with a multi-column restriction
-        if (!names.isEmpty())
+        if (!defs.isEmpty())
         {
             Restriction firstRestriction = restrictions[0];
             if (firstRestriction != null && firstRestriction.isMultiColumn())
             {
                 if (firstRestriction.isSlice())
-                    return buildMultiColumnSliceBound(bound, names, (MultiColumnRestriction.Slice) firstRestriction, isReversed, builder, variables);
+                    return buildMultiColumnSliceBound(bound, defs, (MultiColumnRestriction.Slice) firstRestriction, isReversed, builder, options);
                 else if (firstRestriction.isIN())
-                    return buildMultiColumnInBound(bound, (MultiColumnRestriction.IN) firstRestriction, isReversed, builder, variables);
+                    return buildMultiColumnInBound(bound, defs, (MultiColumnRestriction.IN) firstRestriction, isReversed, builder, type, options);
                 else
-                    return buildMultiColumnEQBound(bound, (MultiColumnRestriction.EQ) firstRestriction, isReversed, builder, variables);
+                    return buildMultiColumnEQBound(bound, defs, (MultiColumnRestriction.EQ) firstRestriction, isReversed, builder, options);
             }
         }
 
@@ -834,54 +819,53 @@
         // to the component comparator but not to the end-of-component itself),
         // it only depends on whether the slice is reversed
         Bound eocBound = isReversed ? Bound.reverse(bound) : bound;
-        for (Iterator<CFDefinition.Name> iter = names.iterator(); iter.hasNext();)
+        for (Iterator<ColumnDefinition> iter = defs.iterator(); iter.hasNext();)
         {
-            CFDefinition.Name name = iter.next();
+            ColumnDefinition def = iter.next();
 
             // In a restriction, we always have Bound.START < Bound.END for the "base" comparator.
             // So if we're doing a reverse slice, we must invert the bounds when giving them as start and end of the slice filter.
             // But if the actual comparator itself is reversed, we must invert the bounds too.
-            Bound b = isReversed == isReversedType(name) ? bound : Bound.reverse(bound);
-            Restriction r = restrictions[name.position];
+            Bound b = isReversed == isReversedType(def) ? bound : Bound.reverse(bound);
+            Restriction r = restrictions[def.position()];
             if (isNullRestriction(r, b))
             {
                 // There wasn't any non-EQ relation on that key, so we select all records having the preceding components as prefix.
                 // For composites, if there was a preceding component and we're computing the end bound, we must set the
                 // end-of-component of the last component, otherwise we would be selecting only one record.
-                return Collections.singletonList(builder.componentCount() > 0 && eocBound == Bound.END
-                                                 ? builder.buildAsEndOfRange()
-                                                 : builder.build());
+                Composite prefix = builder.build();
+                return Collections.singletonList(!prefix.isEmpty() && eocBound == Bound.END ? prefix.end() : prefix);
             }
             if (r.isSlice())
             {
-                builder.add(getSliceValue(r, b, variables));
+                builder.add(getSliceValue(r, b, options));
                 Relation.Type relType = ((Restriction.Slice)r).getRelation(eocBound, b);
-                return Collections.singletonList(builder.buildForRelation(relType));
+                return Collections.singletonList(builder.build().withEOC(eocForRelation(relType)));
             }
             else
             {
-                List<ByteBuffer> values = r.values(variables);
+                List<ByteBuffer> values = r.values(options);
                 if (values.size() != 1)
                 {
                     // IN query, we only support it on the clustering columns
-                    assert name.position == names.size() - 1;
+                    assert def.position() == defs.size() - 1;
                     // The IN query might not have listed the values in comparator order, so we need to re-sort
                     // the bounds lists to make sure the slices work correctly (also, to avoid duplicates).
-                    TreeSet<ByteBuffer> s = new TreeSet<>(isReversed ? cfDef.cfm.comparator.reverseComparator : cfDef.cfm.comparator);
+                    TreeSet<Composite> s = new TreeSet<>(isReversed ? type.reverseComparator() : type);
                     for (ByteBuffer val : values)
                     {
                         if (val == null)
-                            throw new InvalidRequestException(String.format("Invalid null clustering key part %s", name));
-                        ColumnNameBuilder copy = builder.copy().add(val);
+                            throw new InvalidRequestException(String.format("Invalid null clustering key part %s", def.name));
+                        Composite prefix = builder.buildWith(val);
                         // See the comment at the end of this method for why we use end() here
-                        s.add((eocBound == Bound.END && copy.remainingCount() > 0) ? copy.buildAsEndOfRange() : copy.build());
+                        s.add((eocBound == Bound.END && builder.remainingCount() > 0) ? prefix.end() : prefix);
                     }
                     return new ArrayList<>(s);
                 }
 
                 ByteBuffer val = values.get(0);
                 if (val == null)
-                    throw new InvalidRequestException(String.format("Invalid null clustering key part %s", name));
+                    throw new InvalidRequestException(String.format("Invalid null clustering key part %s", def.name));
                 builder.add(val);
             }
         }
@@ -890,21 +874,41 @@
         // it would be harmless to do it. However, we use this method for the partition key too. And when a query
         // with 2ndary index is done, and with the partition provided with an EQ, we'll end up here, and in that
         // case using the eoc would be bad, since for the random partitioner we have no guarantee that
-        // builder.buildAsEndOfRange() will sort after builder.build() (see #5240).
-        return Collections.singletonList((eocBound == Bound.END && builder.remainingCount() > 0) ? builder.buildAsEndOfRange() : builder.build());
+        // prefix.end() will sort after prefix (see #5240).
+        Composite prefix = builder.build();
+        return Collections.singletonList(eocBound == Bound.END && builder.remainingCount() > 0 ? prefix.end() : prefix);
     }
 
-    private List<ByteBuffer> buildMultiColumnSliceBound(Bound bound,
-                                                        Collection<CFDefinition.Name> names,
-                                                        MultiColumnRestriction.Slice slice,
-                                                        boolean isReversed,
-                                                        ColumnNameBuilder builder,
-                                                        List<ByteBuffer> variables) throws InvalidRequestException
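+    /**
+     * Maps a relation operator to the end-of-component to apply to the bound being built: strict relations
+     * (<, >) get an EOC that makes the bound exclude names extending the given prefix, while non-strict
+     * ones (<=, >=, =) include them.
+     */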
+    private static Composite.EOC eocForRelation(Relation.Type op)
+    {
+        switch (op)
+        {
+            case LT:
+                // < X => using startOf(X) as finish bound
+                return Composite.EOC.START;
+            case GT:
+            case LTE:
+                // > X => using endOf(X) as start bound
+                // <= X => using endOf(X) as finish bound
+                return Composite.EOC.END;
+            default:
+                // >= X => using X as start bound (could use START_OF too)
+                // = X => using X
+                return Composite.EOC.NONE;
+        }
+    }
+
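+    /**
+     * Builds the single composite bound for a multi-column (tuple) slice relation: the bare prefix when the
+     * slice has no bound on that side, otherwise as many leading components of the tuple value as are
+     * provided, with the end-of-component implied by the relation applied.
+     */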
+    private static List<Composite> buildMultiColumnSliceBound(Bound bound,
+                                                              List<ColumnDefinition> defs,
+                                                              MultiColumnRestriction.Slice slice,
+                                                              boolean isReversed,
+                                                              CBuilder builder,
+                                                              QueryOptions options) throws InvalidRequestException
     {
         Bound eocBound = isReversed ? Bound.reverse(bound) : bound;
 
-        Iterator<CFDefinition.Name> iter = names.iterator();
-        CFDefinition.Name firstName = iter.next();
+        Iterator<ColumnDefinition> iter = defs.iterator();
+        ColumnDefinition firstName = iter.next();
         // A hack to preserve pre-6875 behavior for tuple-notation slices where the comparator mixes ASCENDING
         // and DESCENDING orders.  This stores the bound for the first component; we will re-use it for all following
         // components, even if they don't match the first component's reversal/non-reversal.  Note that this does *not*
@@ -912,58 +916,84 @@
         Bound firstComponentBound = isReversed == isReversedType(firstName) ? bound : Bound.reverse(bound);
 
         if (!slice.hasBound(firstComponentBound))
-            return Collections.singletonList(builder.componentCount() > 0 && eocBound == Bound.END
-                    ? builder.buildAsEndOfRange()
-                    : builder.build());
-
-        List<ByteBuffer> vals = slice.componentBounds(firstComponentBound, variables);
-        builder.add(vals.get(firstName.position));
-
-        while(iter.hasNext())
         {
-            CFDefinition.Name name = iter.next();
-            if (name.position >= vals.size())
+            Composite prefix = builder.build();
+            return Collections.singletonList(builder.remainingCount() > 0 && eocBound == Bound.END
+                    ? prefix.end()
+                    : prefix);
+        }
+
+        List<ByteBuffer> vals = slice.componentBounds(firstComponentBound, options);
+
+        ByteBuffer v = vals.get(firstName.position());
+        if (v == null)
+            throw new InvalidRequestException("Invalid null value in condition for column " + firstName.name);
+        builder.add(v);
+
+        while (iter.hasNext())
+        {
+            ColumnDefinition def = iter.next();
+            if (def.position() >= vals.size())
                 break;
 
-            builder.add(vals.get(name.position));
+            v = vals.get(def.position());
+            if (v == null)
+                throw new InvalidRequestException("Invalid null value in condition for column " + def.name);
+            builder.add(v);
         }
         Relation.Type relType = slice.getRelation(eocBound, firstComponentBound);
-        return Collections.singletonList(builder.buildForRelation(relType));
+        return Collections.singletonList(builder.build().withEOC(eocForRelation(relType)));
     }
 
-    private List<ByteBuffer> buildMultiColumnInBound(Bound bound,
-                                                     MultiColumnRestriction.IN restriction,
-                                                     boolean isReversed,
-                                                     ColumnNameBuilder builder,
-                                                     List<ByteBuffer> variables) throws InvalidRequestException
+    private static List<Composite> buildMultiColumnInBound(Bound bound,
+                                                           List<ColumnDefinition> defs,
+                                                           MultiColumnRestriction.IN restriction,
+                                                           boolean isReversed,
+                                                           CBuilder builder,
+                                                           CType type,
+                                                           QueryOptions options) throws InvalidRequestException
     {
-        List<List<ByteBuffer>> splitInValues = restriction.splitValues(variables);
+        List<List<ByteBuffer>> splitInValues = restriction.splitValues(options);
         Bound eocBound = isReversed ? Bound.reverse(bound) : bound;
 
         // The IN query might not have listed the values in comparator order, so we need to re-sort
         // the bounds lists to make sure the slices work correctly (also, to avoid duplicates).
-        TreeSet<ByteBuffer> inValues = new TreeSet<>(isReversed ? cfDef.cfm.comparator.reverseComparator : cfDef.cfm.comparator);
+        TreeSet<Composite> inValues = new TreeSet<>(isReversed ? type.reverseComparator() : type);
         for (List<ByteBuffer> components : splitInValues)
         {
-            ColumnNameBuilder nameBuilder = builder.copy();
-            for (ByteBuffer component : components)
-                nameBuilder.add(component);
+            for (int i = 0; i < components.size(); i++)
+                if (components.get(i) == null)
+                    throw new InvalidRequestException("Invalid null value in condition for column " + defs.get(i));
 
-            inValues.add((eocBound == Bound.END && nameBuilder.remainingCount() > 0) ? nameBuilder.buildAsEndOfRange() : nameBuilder.build());
+            Composite prefix = builder.buildWith(components);
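+            // When the tuple doesn't name all clustering columns, use the end-of-prefix for the END bound so
+            // that it covers every name sharing that prefix.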
+            inValues.add(eocBound == Bound.END && builder.remainingCount() - components.size() > 0
+                         ? prefix.end()
+                         : prefix);
         }
         return new ArrayList<>(inValues);
     }
 
-    private List<ByteBuffer> buildMultiColumnEQBound(Bound bound, MultiColumnRestriction.EQ restriction, boolean isReversed, ColumnNameBuilder builder, List<ByteBuffer> variables) throws InvalidRequestException
+    private static List<Composite> buildMultiColumnEQBound(Bound bound,
+                                                           List<ColumnDefinition> defs,
+                                                           MultiColumnRestriction.EQ restriction,
+                                                           boolean isReversed,
+                                                           CBuilder builder,
+                                                           QueryOptions options) throws InvalidRequestException
     {
         Bound eocBound = isReversed ? Bound.reverse(bound) : bound;
-        for (ByteBuffer component : restriction.values(variables))
+        List<ByteBuffer> values = restriction.values(options);
+        for (int i = 0; i < values.size(); i++)
+        {
+            ByteBuffer component = values.get(i);
+            if (component == null)
+                throw new InvalidRequestException("Invalid null value in condition for column " + defs.get(i));
             builder.add(component);
+        }
 
-        ByteBuffer result = builder.componentCount() > 0 && eocBound == Bound.END
-                ? builder.buildAsEndOfRange()
-                : builder.build();
-        return Collections.singletonList(result);
+        Composite prefix = builder.build();
+        return Collections.singletonList(builder.remainingCount() > 0 && eocBound == Bound.END
+                                         ? prefix.end()
+                                         : prefix);
     }
 
     private static boolean isNullRestriction(Restriction r, Bound b)
@@ -971,45 +1001,45 @@
         return r == null || (r.isSlice() && !((Restriction.Slice)r).hasBound(b));
     }
 
-    private static ByteBuffer getSliceValue(Restriction r, Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+    private static ByteBuffer getSliceValue(Restriction r, Bound b, QueryOptions options) throws InvalidRequestException
     {
         Restriction.Slice slice = (Restriction.Slice)r;
         assert slice.hasBound(b);
-        ByteBuffer val = slice.bound(b, variables);
+        ByteBuffer val = slice.bound(b, options);
         if (val == null)
             throw new InvalidRequestException(String.format("Invalid null clustering key part %s", r));
         return val;
     }
 
-    private List<ByteBuffer> getRequestedBound(Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+    private List<Composite> getRequestedBound(Bound b, QueryOptions options) throws InvalidRequestException
     {
         assert isColumnRange();
-        return buildBound(b, cfDef.clusteringColumns(), columnRestrictions, isReversed, cfDef.getColumnNameBuilder(), variables);
+        return buildBound(b, cfm.clusteringColumns(), columnRestrictions, isReversed, cfm.comparator, options);
     }
 
-    public List<IndexExpression> getIndexExpressions(List<ByteBuffer> variables) throws InvalidRequestException
+    public List<IndexExpression> getValidatedIndexExpressions(QueryOptions options) throws InvalidRequestException
     {
-        if (!usesSecondaryIndexing || restrictedNames.isEmpty())
+        if (!usesSecondaryIndexing || restrictedColumns.isEmpty())
             return Collections.emptyList();
 
         List<IndexExpression> expressions = new ArrayList<IndexExpression>();
-        for (CFDefinition.Name name : restrictedNames)
+        for (ColumnDefinition def : restrictedColumns)
         {
             Restriction restriction;
-            switch (name.kind)
+            switch (def.kind)
             {
-                case KEY_ALIAS:
-                    restriction = keyRestrictions[name.position];
+                case PARTITION_KEY:
+                    restriction = keyRestrictions[def.position()];
                     break;
-                case COLUMN_ALIAS:
-                    restriction = columnRestrictions[name.position];
+                case CLUSTERING_COLUMN:
+                    restriction = columnRestrictions[def.position()];
                     break;
-                case COLUMN_METADATA:
+                case REGULAR:
                 case STATIC:
-                    restriction = metadataRestrictions.get(name);
+                    restriction = metadataRestrictions.get(def.name);
                     break;
                 default:
-                    // We don't allow restricting a VALUE_ALIAS for now in prepare.
+                    // We don't allow restricting a COMPACT_VALUE for now in prepare.
                     throw new AssertionError();
             }
 
@@ -1020,54 +1050,104 @@
                 {
                     if (slice.hasBound(b))
                     {
-                        ByteBuffer value = slice.bound(b, variables);
-                        validateIndexExpressionValue(value, name);
-                        IndexOperator op = slice.getIndexOperator(b);
+                        ByteBuffer value = validateIndexedValue(def, slice.bound(b, options));
+                        IndexExpression.Operator op = slice.getIndexOperator(b);
                         // If the underlying comparator for name is reversed, we need to reverse the IndexOperator: user operations
                         // always refer to the "forward" sorting even if the clustering order is reversed, but the 2ndary code does
                         // use the underlying comparator as is.
-                        if (name.type instanceof ReversedType)
+                        if (def.type instanceof ReversedType)
                             op = reverse(op);
-                        expressions.add(new IndexExpression(name.name.key, op, value));
+                        expressions.add(new IndexExpression(def.name.bytes, op, value));
                     }
                 }
             }
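+            // CONTAINS / CONTAINS KEY restrictions on collections translate to one index expression per restricted value or key.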
+            else if (restriction.isContains())
+            {
+                SingleColumnRestriction.Contains contains = (SingleColumnRestriction.Contains)restriction;
+                for (ByteBuffer value : contains.values(options))
+                {
+                    validateIndexedValue(def, value);
+                    expressions.add(new IndexExpression(def.name.bytes, IndexExpression.Operator.CONTAINS, value));
+                }
+                for (ByteBuffer key : contains.keys(options))
+                {
+                    validateIndexedValue(def, key);
+                    expressions.add(new IndexExpression(def.name.bytes, IndexExpression.Operator.CONTAINS_KEY, key));
+                }
+            }
             else
             {
-                List<ByteBuffer> values = restriction.values(variables);
+                List<ByteBuffer> values = restriction.values(options);
 
                 if (values.size() != 1)
                     throw new InvalidRequestException("IN restrictions are not supported on indexed columns");
 
-                ByteBuffer value = values.get(0);
-                validateIndexExpressionValue(value, name);
-                expressions.add(new IndexExpression(name.name.key, IndexOperator.EQ, value));
+                ByteBuffer value = validateIndexedValue(def, values.get(0));
+                expressions.add(new IndexExpression(def.name.bytes, IndexExpression.Operator.EQ, value));
             }
         }
+
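+        // Let the secondary index manager validate the index expressions before the query is executed.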
+        if (usesSecondaryIndexing)
+        {
+            ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(columnFamily());
+            SecondaryIndexManager secondaryIndexManager = cfs.indexManager;
+            secondaryIndexManager.validateIndexSearchersForQuery(expressions);
+        }
+
         return expressions;
     }
 
-    private void validateIndexExpressionValue(ByteBuffer value, CFDefinition.Name name) throws InvalidRequestException
+    private static ByteBuffer validateIndexedValue(ColumnDefinition def, ByteBuffer value) throws InvalidRequestException
     {
         if (value == null)
-            throw new InvalidRequestException(String.format("Unsupported null value for indexed column %s", name));
+            throw new InvalidRequestException(String.format("Unsupported null value for indexed column %s", def.name));
         if (value.remaining() > 0xFFFF)
             throw new InvalidRequestException("Index expression values may not be larger than 64K");
+        return value;
     }
 
-    private static IndexOperator reverse(IndexOperator op)
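+    /**
+     * Wraps a cell iterator to skip cells whose name matches a non-inclusive bound of the slice restriction,
+     * since strict bounds are not enforced by the slice filter itself for non-composite comparators.
+     */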
+    private Iterator<Cell> applySliceRestriction(final Iterator<Cell> cells, final QueryOptions options) throws InvalidRequestException
+    {
+        assert sliceRestriction != null;
+
+        final CellNameType type = cfm.comparator;
+        final CellName excludedStart = sliceRestriction.isInclusive(Bound.START) ? null : type.makeCellName(sliceRestriction.bound(Bound.START, options));
+        final CellName excludedEnd = sliceRestriction.isInclusive(Bound.END) ? null : type.makeCellName(sliceRestriction.bound(Bound.END, options));
+
+        return new AbstractIterator<Cell>()
+        {
+            protected Cell computeNext()
+            {
+                while (cells.hasNext())
+                {
+                    Cell c = cells.next();
+
+                    // For dynamic CF, the column could be out of the requested bounds (we don't support strict bounds
+                    // internally unless the comparator is composite), so filter it out here
+                    if ( (excludedStart != null && type.compare(c.name(), excludedStart) == 0)
+                      || (excludedEnd != null && type.compare(c.name(), excludedEnd) == 0) )
+                        continue;
+
+                    return c;
+                }
+                return endOfData();
+            }
+        };
+    }
+
+    private static IndexExpression.Operator reverse(IndexExpression.Operator op)
     {
         switch (op)
         {
-            case LT:  return IndexOperator.GT;
-            case LTE: return IndexOperator.GTE;
-            case GT:  return IndexOperator.LT;
-            case GTE: return IndexOperator.LTE;
+            case LT:  return IndexExpression.Operator.GT;
+            case LTE: return IndexExpression.Operator.GTE;
+            case GT:  return IndexExpression.Operator.LT;
+            case GTE: return IndexExpression.Operator.LTE;
             default: return op;
         }
     }
 
-    private ResultSet process(List<Row> rows, List<ByteBuffer> variables, int limit, long now) throws InvalidRequestException
+    private ResultSet process(List<Row> rows, QueryOptions options, int limit, long now) throws InvalidRequestException
     {
         Selection.ResultSetBuilder result = selection.resultSetBuilder(now);
         for (org.apache.cassandra.db.Row row : rows)
@@ -1076,7 +1156,7 @@
             if (row.cf == null)
                 continue;
 
-            processColumnFamily(row.key.key, row.cf, variables, now, result);
+            processColumnFamily(row.key.getKey(), row.cf, options, now, result);
         }
 
         ResultSet cqlRows = result.build();
@@ -1093,132 +1173,100 @@
     }
 
     // Used by ModificationStatement for CAS operations
-    void processColumnFamily(ByteBuffer key, ColumnFamily cf, List<ByteBuffer> variables, long now, Selection.ResultSetBuilder result)
+    void processColumnFamily(ByteBuffer key, ColumnFamily cf, QueryOptions options, long now, Selection.ResultSetBuilder result)
     throws InvalidRequestException
     {
-        ByteBuffer[] keyComponents = cfDef.hasCompositeKey
-                                   ? ((CompositeType)cfDef.cfm.getKeyValidator()).split(key)
-                                   : new ByteBuffer[]{ key };
-
-        if (parameters.isDistinct && !selectsStaticColumns)
+        CFMetaData cfm = cf.metadata();
+        ByteBuffer[] keyComponents = null;
+        if (cfm.getKeyValidator() instanceof CompositeType)
         {
-            if (!cf.hasOnlyTombstones(now))
-            {
-                result.newRow();
-                // selection.getColumns() will contain only the partition key components - all of them.
-                for (CFDefinition.Name name : selection.getColumns())
-                    result.add(keyComponents[name.position]);
-            }
-        }
-        else if (cfDef.isCompact)
-        {
-            // One cqlRow per column
-            for (Column c : cf)
-            {
-                if (c.isMarkedForDelete(now))
-                    continue;
-
-                ByteBuffer[] components = null;
-                if (cfDef.isComposite)
-                {
-                    components = ((CompositeType)cfDef.cfm.comparator).split(c.name());
-                }
-                else if (sliceRestriction != null)
-                {
-                    Comparator<ByteBuffer> comp = cfDef.cfm.comparator;
-                    // For dynamic CF, the column could be out of the requested bounds, filter here
-                    if (!sliceRestriction.isInclusive(Bound.START) && comp.compare(c.name(), sliceRestriction.bound(Bound.START, variables)) == 0)
-                        continue;
-                    if (!sliceRestriction.isInclusive(Bound.END) && comp.compare(c.name(), sliceRestriction.bound(Bound.END, variables)) == 0)
-                        continue;
-                }
-
-                result.newRow();
-                // Respect selection order
-                for (CFDefinition.Name name : selection.getColumns())
-                {
-                    switch (name.kind)
-                    {
-                        case KEY_ALIAS:
-                            result.add(keyComponents[name.position]);
-                            break;
-                        case COLUMN_ALIAS:
-                            ByteBuffer val = cfDef.isComposite
-                                           ? (name.position < components.length ? components[name.position] : null)
-                                           : c.name();
-                            result.add(val);
-                            break;
-                        case VALUE_ALIAS:
-                            result.add(c);
-                            break;
-                        case COLUMN_METADATA:
-                        case STATIC:
-                            // This should not happen for compact CF
-                            throw new AssertionError();
-                        default:
-                            throw new AssertionError();
-                    }
-                }
-            }
-        }
-        else if (cfDef.isComposite)
-        {
-            // Sparse case: group column in cqlRow when composite prefix is equal
-            CompositeType composite = (CompositeType)cfDef.cfm.comparator;
-
-            ColumnGroupMap.Builder builder = new ColumnGroupMap.Builder(composite, cfDef.hasCollections, now);
-
-            for (Column c : cf)
-            {
-                if (c.isMarkedForDelete(now))
-                    continue;
-
-                builder.add(c);
-            }
-
-            ColumnGroupMap staticGroup = null;
-            // Gather up static values first
-            if (!builder.isEmpty() && builder.firstGroup().isStatic)
-            {
-                staticGroup = builder.firstGroup();
-                builder.discardFirst();
-
-                // If there was static columns but there is no actual row, then provided the select was a full
-                // partition selection (i.e. not a 2ndary index search and there was no condition on clustering columns)
-                // then we want to include the static columns in the result set.
-                if (builder.isEmpty() && !usesSecondaryIndexing && hasNoClusteringColumnsRestriction() && hasValueForQuery(staticGroup))
-                {
-                    handleGroup(result, keyComponents, ColumnGroupMap.EMPTY, staticGroup);
-                    return;
-                }
-            }
-
-            for (ColumnGroupMap group : builder.groups())
-                handleGroup(result, keyComponents, group, staticGroup);
+            keyComponents = ((CompositeType)cfm.getKeyValidator()).split(key);
         }
         else
         {
-            if (cf.hasOnlyTombstones(now))
-                return;
+            keyComponents = new ByteBuffer[]{ key };
+        }
 
-            // Static case: One cqlRow for all columns
+        Iterator<Cell> cells = cf.getSortedColumns().iterator();
+        if (sliceRestriction != null)
+            cells = applySliceRestriction(cells, options);
+
+        CQL3Row.RowIterator iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(cells);
+
+        // If there are static columns but no non-static row, then provided the select was a full
+        // partition selection (i.e. not a 2ndary index search and there was no condition on clustering columns)
+        // then we want to include the static columns in the result set (and we're done).
+        CQL3Row staticRow = iter.getStaticRow();
+        if (staticRow != null && !iter.hasNext() && !usesSecondaryIndexing && hasNoClusteringColumnsRestriction())
+        {
             result.newRow();
-            for (CFDefinition.Name name : selection.getColumns())
+            for (ColumnDefinition def : selection.getColumns())
             {
-                if (name.kind == CFDefinition.Name.Kind.KEY_ALIAS)
-                    result.add(keyComponents[name.position]);
-                else
-                    result.add(cf.getColumn(name.name.key));
+                switch (def.kind)
+                {
+                    case PARTITION_KEY:
+                        result.add(keyComponents[def.position()]);
+                        break;
+                    case STATIC:
+                        addValue(result, def, staticRow, options);
+                        break;
+                    default:
+                        result.add((ByteBuffer)null);
+                }
+            }
+            return;
+        }
+
+        while (iter.hasNext())
+        {
+            CQL3Row cql3Row = iter.next();
+
+            // Respect requested order
+            result.newRow();
+            // Respect selection order
+            for (ColumnDefinition def : selection.getColumns())
+            {
+                switch (def.kind)
+                {
+                    case PARTITION_KEY:
+                        result.add(keyComponents[def.position()]);
+                        break;
+                    case CLUSTERING_COLUMN:
+                        result.add(cql3Row.getClusteringColumn(def.position()));
+                        break;
+                    case COMPACT_VALUE:
+                        result.add(cql3Row.getColumn(null));
+                        break;
+                    case REGULAR:
+                        addValue(result, def, cql3Row, options);
+                        break;
+                    case STATIC:
+                        addValue(result, def, staticRow, options);
+                        break;
+                }
             }
         }
     }
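
A note on the static-column special case handled above: when a partition holds static values but no clustering rows, and the whole partition was selected, a single result row is still produced, carrying the partition key and static columns and nulls everywhere else. The following is only an illustrative sketch of that row-shaping rule, using made-up column names and plain Java collections rather than Cassandra's internal types:

    import java.util.*;

    class StaticOnlyRowSketch
    {
        // Hypothetical layout: partition key columns are named pk0, pk1, ...; everything else is
        // looked up in the map of static values, and missing columns come back as null.
        static List<Object> staticOnlyRow(List<Object> partitionKey, Map<String, Object> staticValues, List<String> selection)
        {
            List<Object> row = new ArrayList<>();
            for (String column : selection)
            {
                if (column.startsWith("pk"))
                    row.add(partitionKey.get(Integer.parseInt(column.substring(2))));
                else if (staticValues.containsKey(column))
                    row.add(staticValues.get(column));   // static value shared by the whole partition
                else
                    row.add(null);                       // clustering/regular column: nothing to return
            }
            return row;
        }
    }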
 
-    private boolean hasValueForQuery(ColumnGroupMap staticGroup)
+    private static void addValue(Selection.ResultSetBuilder result, ColumnDefinition def, CQL3Row row, QueryOptions options)
     {
-        for (CFDefinition.Name name : Iterables.filter(selection.getColumns(), isStaticFilter))
-            if (staticGroup.hasValueFor(name.name.key))
-                return true;
-        return false;
+        if (row == null)
+        {
+            result.add((ByteBuffer)null);
+            return;
+        }
+
+        if (def.type.isCollection())
+        {
+            List<Cell> collection = row.getCollection(def.name);
+            ByteBuffer value = collection == null
+                             ? null
+                             : ((CollectionType)def.type).serializeForNativeProtocol(collection, options.getProtocolVersion());
+            result.add(value);
+            return;
+        }
+
+        result.add(row.getColumn(def.name));
     }
 
     private boolean hasNoClusteringColumnsRestriction()
@@ -1238,91 +1286,32 @@
     /**
      * Orders results when multiple keys are selected (using IN)
      */
-    private void orderResults(ResultSet cqlRows)
+    private void orderResults(ResultSet cqlRows) throws InvalidRequestException
     {
         if (cqlRows.size() == 0 || !needsPostQueryOrdering())
             return;
 
         assert orderingIndexes != null;
 
-        // optimization when only *one* order condition was given
-        // because there is no point of using composite comparator if there is only one order condition
-        if (parameters.orderings.size() == 1)
-        {
-            CFDefinition.Name ordering = cfDef.get(parameters.orderings.keySet().iterator().next());
-            Collections.sort(cqlRows.rows, new SingleColumnComparator(orderingIndexes.get(ordering), ordering.type));
-            return;
-        }
+        List<Integer> idToSort = new ArrayList<Integer>();
+        List<Comparator<ByteBuffer>> sorters = new ArrayList<Comparator<ByteBuffer>>();
 
-        // builds a 'composite' type for multi-column comparison from the comparators of the ordering components
-        // and passes collected position information and built composite comparator to CompositeComparator to do
-        // an actual comparison of the CQL rows.
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(parameters.orderings.size());
-        int[] positions = new int[parameters.orderings.size()];
-
-        int idx = 0;
         for (ColumnIdentifier identifier : parameters.orderings.keySet())
         {
-            CFDefinition.Name orderingColumn = cfDef.get(identifier);
-            types.add(orderingColumn.type);
-            positions[idx++] = orderingIndexes.get(orderingColumn);
+            ColumnDefinition orderingColumn = cfm.getColumnDefinition(identifier);
+            idToSort.add(orderingIndexes.get(orderingColumn.name));
+            sorters.add(orderingColumn.type);
         }
 
-        Collections.sort(cqlRows.rows, new CompositeComparator(types, positions));
+        Comparator<List<ByteBuffer>> comparator = idToSort.size() == 1
+                                                ? new SingleColumnComparator(idToSort.get(0), sorters.get(0))
+                                                : new CompositeComparator(sorters, idToSort);
+        Collections.sort(cqlRows.rows, comparator);
     }
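
orderResults now always sorts through a Comparator over the selected column positions: one comparator per ORDER BY column, composed when there are several. A self-contained sketch of the same idea, with hypothetical String cells standing in for ByteBuffers:

    import java.util.*;

    class PostQueryOrderingSketch
    {
        public static void main(String[] args)
        {
            // Each row is the list of selected values; we order by column 1, then column 2.
            List<List<String>> rows = new ArrayList<>(Arrays.asList(
                Arrays.asList("k2", "b", "2"),
                Arrays.asList("k1", "a", "9"),
                Arrays.asList("k3", "a", "1")));

            final List<Integer> positions = Arrays.asList(1, 2);

            Collections.sort(rows, new Comparator<List<String>>()
            {
                public int compare(List<String> a, List<String> b)
                {
                    for (int pos : positions)
                    {
                        int cmp = a.get(pos).compareTo(b.get(pos));
                        if (cmp != 0)
                            return cmp;
                    }
                    return 0;
                }
            });

            System.out.println(rows); // [[k3, a, 1], [k1, a, 9], [k2, b, 2]]
        }
    }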
 
-    private void handleGroup(Selection.ResultSetBuilder result,
-                             ByteBuffer[] keyComponents,
-                             ColumnGroupMap columns,
-                             ColumnGroupMap staticGroup) throws InvalidRequestException
+    private static boolean isReversedType(ColumnDefinition def)
     {
-        // Respect requested order
-        result.newRow();
-        for (CFDefinition.Name name : selection.getColumns())
-        {
-            switch (name.kind)
-            {
-                case KEY_ALIAS:
-                    result.add(keyComponents[name.position]);
-                    break;
-                case COLUMN_ALIAS:
-                    result.add(columns.getKeyComponent(name.position));
-                    break;
-                case VALUE_ALIAS:
-                    // This should not happen for SPARSE
-                    throw new AssertionError();
-                case COLUMN_METADATA:
-                    addValue(result, name, columns);
-                    break;
-                case STATIC:
-                    addValue(result, name, staticGroup);
-                    break;
-            }
-        }
-    }
-
-    private static void addValue(Selection.ResultSetBuilder result, CFDefinition.Name name, ColumnGroupMap group)
-    {
-        if (group == null)
-        {
-            result.add((ByteBuffer)null);
-            return;
-        }
-
-        if (name.type.isCollection())
-        {
-            List<Pair<ByteBuffer, Column>> collection = group.getCollection(name.name.key);
-            result.add(collection == null ? null : ((CollectionType)name.type).serialize(collection));
-        }
-        else
-        {
-            result.add(group.getSimple(name.name.key));
-        }
-    }
-
-    private static boolean isReversedType(CFDefinition.Name name)
-    {
-        return name.type instanceof ReversedType;
+        return def.type instanceof ReversedType;
     }
 
     private boolean columnFilterIsIdentity()
@@ -1346,18 +1335,18 @@
     private void validateDistinctSelection()
     throws InvalidRequestException
     {
-        Collection<CFDefinition.Name> requestedColumns = selection.getColumns();
-        for (CFDefinition.Name name : requestedColumns)
-            if (name.kind != CFDefinition.Name.Kind.KEY_ALIAS && name.kind != CFDefinition.Name.Kind.STATIC)
-                throw new InvalidRequestException(String.format("SELECT DISTINCT queries must only request partition key columns and/or static columns (not %s)", name));
+        Collection<ColumnDefinition> requestedColumns = selection.getColumns();
+        for (ColumnDefinition def : requestedColumns)
+            if (def.kind != ColumnDefinition.Kind.PARTITION_KEY && def.kind != ColumnDefinition.Kind.STATIC)
+                throw new InvalidRequestException(String.format("SELECT DISTINCT queries must only request partition key columns and/or static columns (not %s)", def.name));
 
         // If it's a key range, we require that all partition key columns are selected so we don't have to bother with post-query grouping.
         if (!isKeyRange)
             return;
 
-        for (CFDefinition.Name name : cfDef.partitionKeys())
-            if (!requestedColumns.contains(name))
-                throw new InvalidRequestException(String.format("SELECT DISTINCT queries must request all the partition key columns (missing %s)", name));
+        for (ColumnDefinition def : cfm.partitionKeyColumns())
+            if (!requestedColumns.contains(def))
+                throw new InvalidRequestException(String.format("SELECT DISTINCT queries must request all the partition key columns (missing %s)", def.name));
     }
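
validateDistinctSelection enforces two rules: a DISTINCT query may only select partition key and static columns, and when it spans a token range it must select every partition key column so no post-query grouping is needed. A quick illustration against a hypothetical table (all names made up):

    class SelectDistinctExamples
    {
        // Hypothetical table:
        //   CREATE TABLE ks.t (pk1 int, pk2 int, c int, s int STATIC, v int, PRIMARY KEY ((pk1, pk2), c));

        // Accepted: only partition key and static columns, and the full partition key is present.
        static final String OK      = "SELECT DISTINCT pk1, pk2, s FROM ks.t";
        // Rejected: v is a regular column.
        static final String REGULAR = "SELECT DISTINCT pk1, pk2, v FROM ks.t";
        // Rejected for a range query: pk2 is missing from the selection.
        static final String PARTIAL = "SELECT DISTINCT pk1 FROM ks.t";
    }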
 
     public static class RawStatement extends CFStatement
@@ -1379,9 +1368,6 @@
         public ParsedStatement.Prepared prepare() throws InvalidRequestException
         {
             CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
-
-            CFDefinition cfDef = cfm.getCfDef();
-
             VariableSpecifications boundNames = getBoundVariables();
 
             // Select clause
@@ -1389,10 +1375,10 @@
                 throw new InvalidRequestException("Only COUNT(*) and COUNT(1) operations are currently supported.");
 
             Selection selection = selectClause.isEmpty()
-                                ? Selection.wildcard(cfDef)
-                                : Selection.fromSelectors(cfDef, selectClause);
+                                ? Selection.wildcard(cfm)
+                                : Selection.fromSelectors(cfm, selectClause);
 
-            SelectStatement stmt = new SelectStatement(cfDef, boundNames.size(), parameters, selection, prepareLimit(boundNames));
+            SelectStatement stmt = new SelectStatement(cfm, boundNames.size(), parameters, selection, prepareLimit(boundNames));
 
             /*
              * WHERE clause. For a given entity, rules are:
@@ -1411,51 +1397,52 @@
                 if (relation.isMultiColumn())
                 {
                     MultiColumnRelation rel = (MultiColumnRelation) relation;
-                    List<CFDefinition.Name> names = new ArrayList<>(rel.getEntities().size());
+                    List<ColumnDefinition> names = new ArrayList<>(rel.getEntities().size());
                     for (ColumnIdentifier entity : rel.getEntities())
                     {
-                        boolean[] queriable = processRelationEntity(stmt, relation, entity, cfDef);
+                        ColumnDefinition def = cfm.getColumnDefinition(entity);
+                        boolean[] queriable = processRelationEntity(stmt, relation, entity, def);
                         hasQueriableIndex |= queriable[0];
                         hasQueriableClusteringColumnIndex |= queriable[1];
-                        Name name = cfDef.get(entity);
-                        names.add(name);
-                        hasMultiColumnRelations |= Kind.COLUMN_ALIAS.equals(name.kind);
+                        names.add(def);
+                        hasMultiColumnRelations |= ColumnDefinition.Kind.CLUSTERING_COLUMN.equals(def.kind);
                     }
                     updateRestrictionsForRelation(stmt, names, rel, boundNames);
                 }
                 else
                 {
                     SingleColumnRelation rel = (SingleColumnRelation) relation;
-                    boolean[] queriable = processRelationEntity(stmt, relation, rel.getEntity(), cfDef);
+                    ColumnIdentifier entity = rel.getEntity();
+                    ColumnDefinition def = cfm.getColumnDefinition(entity);
+                    boolean[] queriable = processRelationEntity(stmt, relation, entity, def);
                     hasQueriableIndex |= queriable[0];
                     hasQueriableClusteringColumnIndex |= queriable[1];
-                    Name name = cfDef.get(rel.getEntity());
-                    hasSingleColumnRelations |= Kind.COLUMN_ALIAS.equals(name.kind);
-                    updateRestrictionsForRelation(stmt, name, rel, boundNames);
+                    hasSingleColumnRelations |= ColumnDefinition.Kind.CLUSTERING_COLUMN.equals(def.kind);
+                    updateRestrictionsForRelation(stmt, def, rel, boundNames);
                 }
             }
             if (hasSingleColumnRelations && hasMultiColumnRelations)
                 throw new InvalidRequestException("Mixing single column relations and multi column relations on clustering columns is not allowed");
 
             // At this point, the select statement is fully constructed, but we still have a few things to validate
-            processPartitionKeyRestrictions(stmt, cfDef, hasQueriableIndex);
+            processPartitionKeyRestrictions(stmt, hasQueriableIndex, cfm);
 
             // All (or none) of the partition key columns have been specified;
             // hence there is no need to turn these restrictions into index expressions.
             if (!stmt.usesSecondaryIndexing)
-                stmt.restrictedNames.removeAll(cfDef.partitionKeys());
+                stmt.restrictedColumns.removeAll(cfm.partitionKeyColumns());
 
             if (stmt.selectsOnlyStaticColumns && stmt.hasClusteringColumnsRestriction())
                 throw new InvalidRequestException("Cannot restrict clustering columns when selecting only static columns");
 
-            processColumnRestrictions(stmt, cfDef, hasQueriableIndex);
+            processColumnRestrictions(stmt, hasQueriableIndex, cfm);
 
             // Covers indexes on the first clustering column (among others).
             if (stmt.isKeyRange && hasQueriableClusteringColumnIndex)
                 stmt.usesSecondaryIndexing = true;
 
             if (!stmt.usesSecondaryIndexing)
-                stmt.restrictedNames.removeAll(cfDef.clusteringColumns());
+                stmt.restrictedColumns.removeAll(cfm.clusteringColumns());
 
             // Even if usesSecondaryIndexing is false at this point, we'll still have to use one if
             // there is restrictions not covered by the PK.
@@ -1470,7 +1457,7 @@
                 validateSecondaryIndexSelections(stmt);
 
             if (!stmt.parameters.orderings.isEmpty())
-                processOrderingClause(stmt, cfDef);
+                processOrderingClause(stmt, cfm);
 
             checkNeedsFiltering(stmt);
 
@@ -1481,15 +1468,14 @@
         }
 
         /** Returns a pair of (hasQueriableIndex, hasQueriableClusteringColumnIndex) */
-        private boolean[] processRelationEntity(SelectStatement stmt, Relation relation, ColumnIdentifier entity, CFDefinition cfDef) throws InvalidRequestException
+        private boolean[] processRelationEntity(SelectStatement stmt, Relation relation, ColumnIdentifier entity, ColumnDefinition def) throws InvalidRequestException
         {
-            CFDefinition.Name name = cfDef.get(entity);
-            if (name == null)
+            if (def == null)
                 handleUnrecognizedEntity(entity, relation);
 
-            stmt.restrictedNames.add(name);
-            if (cfDef.cfm.getColumnDefinition(name.name.key).isIndexed() && relation.operator() == Relation.Type.EQ)
-                return new boolean[]{true, name.kind == CFDefinition.Name.Kind.COLUMN_ALIAS};
+            stmt.restrictedColumns.add(def);
+            if (def.isIndexed() && relation.operator().allowsIndexQuery())
+                return new boolean[]{true, def.kind == ColumnDefinition.Kind.CLUSTERING_COLUMN};
             return new boolean[]{false, false};
         }
 
@@ -1508,29 +1494,29 @@
             if (limit == null)
                 return null;
 
-            Term prepLimit = limit.prepare(limitReceiver());
+            Term prepLimit = limit.prepare(keyspace(), limitReceiver());
             prepLimit.collectMarkerSpecification(boundNames);
             return prepLimit;
         }
 
-        private void updateRestrictionsForRelation(SelectStatement stmt, List<CFDefinition.Name> names, MultiColumnRelation relation, VariableSpecifications boundNames) throws InvalidRequestException
+        private void updateRestrictionsForRelation(SelectStatement stmt, List<ColumnDefinition> defs, MultiColumnRelation relation, VariableSpecifications boundNames) throws InvalidRequestException
         {
-            List<CFDefinition.Name> restrictedColumns = new ArrayList<>();
-            Set<CFDefinition.Name> seen = new HashSet<>();
+            List<ColumnDefinition> restrictedColumns = new ArrayList<>();
+            Set<ColumnDefinition> seen = new HashSet<>();
 
             int previousPosition = -1;
-            for (CFDefinition.Name name : names)
+            for (ColumnDefinition def : defs)
             {
                 // ensure multi-column restriction only applies to clustering columns
-                if (name.kind != CFDefinition.Name.Kind.COLUMN_ALIAS)
-                    throw new InvalidRequestException(String.format("Multi-column relations can only be applied to clustering columns: %s", name));
+                if (def.kind != ColumnDefinition.Kind.CLUSTERING_COLUMN)
+                    throw new InvalidRequestException(String.format("Multi-column relations can only be applied to clustering columns: %s", def));
 
-                if (seen.contains(name))
-                    throw new InvalidRequestException(String.format("Column \"%s\" appeared twice in a relation: %s", name, relation));
-                seen.add(name);
+                if (seen.contains(def))
+                    throw new InvalidRequestException(String.format("Column \"%s\" appeared twice in a relation: %s", def, relation));
+                seen.add(def);
 
                 // check that no clustering columns were skipped
-                if (name.position != previousPosition + 1)
+                if (def.position() != previousPosition + 1)
                 {
                     if (previousPosition == -1)
                         throw new InvalidRequestException(String.format(
@@ -1542,29 +1528,27 @@
                 }
                 previousPosition++;
 
-                Restriction existing = getExistingRestriction(stmt, name);
+                Restriction existing = getExistingRestriction(stmt, def);
                 Relation.Type operator = relation.operator();
                 if (existing != null)
                 {
                     if (operator == Relation.Type.EQ || operator == Relation.Type.IN)
-                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by more than one relation if it is in an %s relation", name, relation.operator()));
+                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by more than one relation if it is in an %s relation", def, relation.operator()));
                     else if (!existing.isSlice())
-                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by an equality relation and an inequality relation", name));
+                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by an equality relation and an inequality relation", def));
                 }
-                restrictedColumns.add(name);
+                restrictedColumns.add(def);
             }
 
-            boolean onToken = false;
-
             switch (relation.operator())
             {
                 case EQ:
                 {
-                    Term t = relation.getValue().prepare(names);
+                    Term t = relation.getValue().prepare(keyspace(), defs);
                     t.collectMarkerSpecification(boundNames);
-                    Restriction restriction = new MultiColumnRestriction.EQ(t, onToken);
-                    for (CFDefinition.Name name : restrictedColumns)
-                        stmt.columnRestrictions[name.position] = restriction;
+                    Restriction restriction = new MultiColumnRestriction.EQ(t, false);
+                    for (ColumnDefinition def : restrictedColumns)
+                        stmt.columnRestrictions[def.position()] = restriction;
                     break;
                 }
                 case IN:
@@ -1578,7 +1562,7 @@
                         List<Term> terms = new ArrayList<>(inValues.size());
                         for (Term.MultiColumnRaw tuple : inValues)
                         {
-                            Term t = tuple.prepare(names);
+                            Term t = tuple.prepare(keyspace(), defs);
                             t.collectMarkerSpecification(boundNames);
                             terms.add(t);
                         }
@@ -1587,12 +1571,12 @@
                     else
                     {
                         Tuples.INRaw rawMarker = relation.getInMarker();
-                        AbstractMarker t = rawMarker.prepare(names);
+                        AbstractMarker t = rawMarker.prepare(keyspace(), defs);
                         t.collectMarkerSpecification(boundNames);
                         restriction = new MultiColumnRestriction.InWithMarker(t);
                     }
-                    for (CFDefinition.Name name : restrictedColumns)
-                        stmt.columnRestrictions[name.position] = restriction;
+                    for (ColumnDefinition def : restrictedColumns)
+                        stmt.columnRestrictions[def.position()] = restriction;
 
                     break;
                 }
@@ -1601,80 +1585,84 @@
                 case GT:
                 case GTE:
                 {
-                    Term t = relation.getValue().prepare(names);
+                    Term t = relation.getValue().prepare(keyspace(), defs);
                     t.collectMarkerSpecification(boundNames);
-                    for (CFDefinition.Name name : names)
+                    for (ColumnDefinition def : defs)
                     {
-                        Restriction.Slice restriction = (Restriction.Slice)getExistingRestriction(stmt, name);
+                        Restriction.Slice restriction = (Restriction.Slice)getExistingRestriction(stmt, def);
                         if (restriction == null)
-                            restriction = new MultiColumnRestriction.Slice(onToken);
+                            restriction = new MultiColumnRestriction.Slice(false);
                         else if (!restriction.isMultiColumn())
-                            throw new InvalidRequestException(String.format("Column \"%s\" cannot have both tuple-notation inequalities and single-column inequalities: %s", name, relation));
-                        restriction.setBound(relation.operator(), t);
-                        stmt.columnRestrictions[name.position] = restriction;
+                            throw new InvalidRequestException(String.format("Column \"%s\" cannot have both tuple-notation inequalities and single-column inequalities: %s", def.name, relation));
+                        restriction.setBound(def.name, relation.operator(), t);
+                        stmt.columnRestrictions[def.position()] = restriction;
                     }
+                    break;
                 }
+                case NEQ:
+                    throw new InvalidRequestException(String.format("Unsupported \"!=\" relation: %s", relation));
             }
         }
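
The checks above confine multi-column (tuple) relations to clustering columns, reject duplicated columns, and require the columns to follow the clustering order without gaps. Illustrated with a hypothetical table:

    class MultiColumnRelationExamples
    {
        // Hypothetical table:
        //   CREATE TABLE ks.t (pk int, c1 int, c2 int, c3 int, v int, PRIMARY KEY (pk, c1, c2, c3));

        // Accepted: consecutive clustering columns, starting from the first one.
        static final String OK      = "SELECT * FROM ks.t WHERE pk = 0 AND (c1, c2) > (1, 2)";
        // Rejected: c1 is skipped, so the tuple does not follow the clustering order.
        static final String SKIPPED = "SELECT * FROM ks.t WHERE pk = 0 AND (c2, c3) > (1, 2)";
        // Rejected: v is a regular column, and tuple relations only apply to clustering columns.
        static final String REGULAR = "SELECT * FROM ks.t WHERE pk = 0 AND (c1, v) > (1, 2)";
    }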
 
-        private Restriction getExistingRestriction(SelectStatement stmt, CFDefinition.Name name)
+        private Restriction getExistingRestriction(SelectStatement stmt, ColumnDefinition def)
         {
-            switch (name.kind)
+            switch (def.kind)
             {
-                case KEY_ALIAS:
-                    return stmt.keyRestrictions[name.position];
-                case COLUMN_ALIAS:
-                    return stmt.columnRestrictions[name.position];
-                case VALUE_ALIAS:
-                    return null;
+                case PARTITION_KEY:
+                    return stmt.keyRestrictions[def.position()];
+                case CLUSTERING_COLUMN:
+                    return stmt.columnRestrictions[def.position()];
+                case REGULAR:
+                case STATIC:
+                    return stmt.metadataRestrictions.get(def.name);
                 default:
-                    return stmt.metadataRestrictions.get(name);
+                    throw new AssertionError();
             }
         }
 
-        private void updateRestrictionsForRelation(SelectStatement stmt, CFDefinition.Name name, SingleColumnRelation relation, VariableSpecifications names) throws InvalidRequestException
+        private void updateRestrictionsForRelation(SelectStatement stmt, ColumnDefinition def, SingleColumnRelation relation, VariableSpecifications names) throws InvalidRequestException
         {
-            switch (name.kind)
+            switch (def.kind)
             {
-                case KEY_ALIAS:
-                    stmt.keyRestrictions[name.position] = updateSingleColumnRestriction(name, stmt.keyRestrictions[name.position], relation, names);
+                case PARTITION_KEY:
+                    stmt.keyRestrictions[def.position()] = updateSingleColumnRestriction(def, stmt.keyRestrictions[def.position()], relation, names);
                     break;
-                case COLUMN_ALIAS:
-                    stmt.columnRestrictions[name.position] = updateSingleColumnRestriction(name, stmt.columnRestrictions[name.position], relation, names);
+                case CLUSTERING_COLUMN:
+                    stmt.columnRestrictions[def.position()] = updateSingleColumnRestriction(def, stmt.columnRestrictions[def.position()], relation, names);
                     break;
-                case VALUE_ALIAS:
-                    throw new InvalidRequestException(String.format("Predicates on the non-primary-key column (%s) of a COMPACT table are not yet supported", name.name));
-                case COLUMN_METADATA:
+                case COMPACT_VALUE:
+                    throw new InvalidRequestException(String.format("Predicates on the non-primary-key column (%s) of a COMPACT table are not yet supported", def.name));
+                case REGULAR:
                 case STATIC:
                     // We only allow IN on the row key and last clustering key so far, never on non-PK columns, even if there's an index
-                    Restriction r = updateSingleColumnRestriction(name, stmt.metadataRestrictions.get(name), relation, names);
+                    Restriction r = updateSingleColumnRestriction(def, stmt.metadataRestrictions.get(def.name), relation, names);
                     if (r.isIN() && !((Restriction.IN)r).canHaveOnlyOneValue())
                         // Note: for backward compatibility reasons, we consider an IN of 1 value the same as an EQ, so we let that slide.
-                        throw new InvalidRequestException(String.format("IN predicates on non-primary-key columns (%s) is not yet supported", name));
-                    stmt.metadataRestrictions.put(name, r);
+                        throw new InvalidRequestException(String.format("IN predicates on non-primary-key columns (%s) are not yet supported", def.name));
+                    stmt.metadataRestrictions.put(def.name, r);
                     break;
             }
         }
 
-        Restriction updateSingleColumnRestriction(CFDefinition.Name name, Restriction existingRestriction, SingleColumnRelation newRel, VariableSpecifications boundNames) throws InvalidRequestException
+        Restriction updateSingleColumnRestriction(ColumnDefinition def, Restriction existingRestriction, SingleColumnRelation newRel, VariableSpecifications boundNames) throws InvalidRequestException
         {
-            ColumnSpecification receiver = name;
+            ColumnSpecification receiver = def;
             if (newRel.onToken)
             {
-                if (name.kind != CFDefinition.Name.Kind.KEY_ALIAS)
-                    throw new InvalidRequestException(String.format("The token() function is only supported on the partition key, found on %s", name));
+                if (def.kind != ColumnDefinition.Kind.PARTITION_KEY)
+                    throw new InvalidRequestException(String.format("The token() function is only supported on the partition key, found on %s", def.name));
 
-                receiver = new ColumnSpecification(name.ksName,
-                                                   name.cfName,
+                receiver = new ColumnSpecification(def.ksName,
+                                                   def.cfName,
                                                    new ColumnIdentifier("partition key token", true),
                                                    StorageService.getPartitioner().getTokenValidator());
             }
 
             // We don't support relations against entire collections, like "numbers = {1, 2, 3}"
-            if (receiver.type.isCollection())
+            if (receiver.type.isCollection() && !(newRel.operator().equals(Relation.Type.CONTAINS_KEY) || newRel.operator() == Relation.Type.CONTAINS))
             {
                 throw new InvalidRequestException(String.format("Collection column '%s' (%s) cannot be restricted by a '%s' relation",
-                                                                name, receiver.type.asCQL3Type(), newRel.operator()));
+                                                                def.name, receiver.type.asCQL3Type(), newRel.operator()));
             }
 
             switch (newRel.operator())
@@ -1682,62 +1670,85 @@
                 case EQ:
                 {
                     if (existingRestriction != null)
-                        throw new InvalidRequestException(String.format("%s cannot be restricted by more than one relation if it includes an Equal", name));
-                    Term t = newRel.getValue().prepare(receiver);
+                        throw new InvalidRequestException(String.format("%s cannot be restricted by more than one relation if it includes an Equal", def.name));
+                    Term t = newRel.getValue().prepare(keyspace(), receiver);
                     t.collectMarkerSpecification(boundNames);
                     existingRestriction = new SingleColumnRestriction.EQ(t, newRel.onToken);
                 }
                 break;
                 case IN:
                     if (existingRestriction != null)
-                        throw new InvalidRequestException(String.format("%s cannot be restricted by more than one relation if it includes a IN", name));
+                        throw new InvalidRequestException(String.format("%s cannot be restricted by more than one relation if it includes an IN", def.name));
 
                     if (newRel.getInValues() == null)
                     {
                         // Means we have a "SELECT ... IN ?"
                         assert newRel.getValue() != null;
-                        Term t = newRel.getValue().prepare(receiver);
+                        Term t = newRel.getValue().prepare(keyspace(), receiver);
                         t.collectMarkerSpecification(boundNames);
                         existingRestriction = new SingleColumnRestriction.InWithMarker((Lists.Marker)t);
                     }
                     else
                     {
-                        List<Term> inValues = new ArrayList<Term>(newRel.getInValues().size());
+                        List<Term> inValues = new ArrayList<>(newRel.getInValues().size());
                         for (Term.Raw raw : newRel.getInValues())
                         {
-                            Term t = raw.prepare(receiver);
+                            Term t = raw.prepare(keyspace(), receiver);
                             t.collectMarkerSpecification(boundNames);
                             inValues.add(t);
                         }
                         existingRestriction = new SingleColumnRestriction.InWithValues(inValues);
                     }
                     break;
+                case NEQ:
+                    throw new InvalidRequestException(String.format("Unsupported \"!=\" relation on column \"%s\"", def.name));
                 case GT:
                 case GTE:
                 case LT:
                 case LTE:
+                    {
+                        if (existingRestriction == null)
+                            existingRestriction = new SingleColumnRestriction.Slice(newRel.onToken);
+                        else if (!existingRestriction.isSlice())
+                            throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by both an equality and an inequality relation", def.name));
+                        else if (existingRestriction.isMultiColumn())
+                            throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by both a tuple notation inequality and a single column inequality (%s)", def.name, newRel));
+                        else if (existingRestriction.isOnToken() != newRel.onToken)
+                            // For partition keys, we shouldn't have slice restrictions without token(). And while this is rejected later by
+                            // processPartitionKeyRestrictions, we shouldn't replace the existing restriction with the new one if the old one was using token()
+                            // and the new one isn't, since that would bypass that later test.
+                            throw new InvalidRequestException("Only EQ and IN relations are supported on the partition key (unless you use the token() function)");
+
+                        Term t = newRel.getValue().prepare(keyspace(), receiver);
+                        t.collectMarkerSpecification(boundNames);
+                        ((SingleColumnRestriction.Slice)existingRestriction).setBound(def.name, newRel.operator(), t);
+                    }
+                    break;
+                case CONTAINS_KEY:
+                    if (!(receiver.type instanceof MapType))
+                        throw new InvalidRequestException(String.format("Cannot use CONTAINS_KEY on non-map column %s", def.name));
+                    // Fallthrough on purpose
+                case CONTAINS:
                 {
+                    if (!receiver.type.isCollection())
+                        throw new InvalidRequestException(String.format("Cannot use %s relation on non-collection column %s", newRel.operator(), def.name));
+
                     if (existingRestriction == null)
-                        existingRestriction = new SingleColumnRestriction.Slice(newRel.onToken);
-                    else if (!existingRestriction.isSlice())
-                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by both an equality and an inequality relation", name));
-                    else if (existingRestriction.isOnToken() != newRel.onToken)
-                        // For partition keys, we shouldn't have slice restrictions without token(). And while this is rejected later by
-                        // processPartitionKeysRestrictions, we shouldn't update the existing restriction by the new one if the old one was using token()
-                        // and the new one isn't since that would bypass that later test.
-                        throw new InvalidRequestException("Only EQ and IN relation are supported on the partition key (unless you use the token() function)");
-                    else if (existingRestriction.isMultiColumn())
-                        throw new InvalidRequestException(String.format("Column \"%s\" cannot be restricted by both a tuple notation inequality and a single column inequality (%s)", name, newRel));
-                    Term t = newRel.getValue().prepare(receiver);
+                        existingRestriction = new SingleColumnRestriction.Contains();
+                    else if (!existingRestriction.isContains())
+                        throw new InvalidRequestException(String.format("Collection column %s can only be restricted by CONTAINS or CONTAINS KEY", def.name));
+                    boolean isKey = newRel.operator() == Relation.Type.CONTAINS_KEY;
+                    receiver = makeCollectionReceiver(receiver, isKey);
+                    Term t = newRel.getValue().prepare(keyspace(), receiver);
                     t.collectMarkerSpecification(boundNames);
-                    ((SingleColumnRestriction.Slice)existingRestriction).setBound(newRel.operator(), t);
+                    ((SingleColumnRestriction.Contains)existingRestriction).add(t, isKey);
+                    break;
                 }
-                break;
             }
             return existingRestriction;
         }
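
The CONTAINS and CONTAINS KEY branches added above accept a collection column and re-type the bound value through makeCollectionReceiver (element type for lists and sets, key or value type for maps). A sketch of the query shapes involved, on a hypothetical indexed table:

    class ContainsExamples
    {
        // Hypothetical table, with secondary indexes assumed on both collections:
        //   CREATE TABLE ks.users (id int PRIMARY KEY, tags set<text>, props map<text, int>);

        // CONTAINS works on any collection; the bound value takes the element (or map value) type.
        static final String SET_CONTAINS     = "SELECT * FROM ks.users WHERE tags CONTAINS 'cassandra'";
        static final String MAP_CONTAINS     = "SELECT * FROM ks.users WHERE props CONTAINS 42";
        // CONTAINS KEY is only valid on maps; the bound value takes the key type.
        static final String MAP_CONTAINS_KEY = "SELECT * FROM ks.users WHERE props CONTAINS KEY 'age'";
        // Rejected by the CONTAINS_KEY check: the column is not a map.
        static final String BAD              = "SELECT * FROM ks.users WHERE tags CONTAINS KEY 'x'";
    }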
 
-        private void processPartitionKeyRestrictions(SelectStatement stmt, CFDefinition cfDef, boolean hasQueriableIndex) throws InvalidRequestException
+        private void processPartitionKeyRestrictions(SelectStatement stmt, boolean hasQueriableIndex, CFMetaData cfm) throws InvalidRequestException
         {
             // If there is a queriable index, no special condition are required on the other restrictions.
             // But we still need to know 2 things:
@@ -1746,12 +1757,12 @@
             // If a component of the partition key is restricted by a relation, all preceding
             // components must have a EQ. Only the last partition key component can be in IN relation.
             boolean canRestrictFurtherComponents = true;
-            CFDefinition.Name previous = null;
+            ColumnDefinition previous = null;
             stmt.keyIsInRelation = false;
-            Iterator<CFDefinition.Name> iter = cfDef.partitionKeys().iterator();
+            Iterator<ColumnDefinition> iter = cfm.partitionKeyColumns().iterator();
             for (int i = 0; i < stmt.keyRestrictions.length; i++)
             {
-                CFDefinition.Name cname = iter.next();
+                ColumnDefinition cdef = iter.next();
                 Restriction restriction = stmt.keyRestrictions[i];
 
                 if (restriction == null)
@@ -1768,7 +1779,7 @@
                             stmt.isKeyRange = true;
                             break;
                         }
-                        throw new InvalidRequestException(String.format("Partition key part %s must be restricted since preceding part is", cname));
+                        throw new InvalidRequestException(String.format("Partition key part %s must be restricted since preceding part is", cdef.name));
                     }
 
                     stmt.isKeyRange = true;
@@ -1783,7 +1794,7 @@
                     }
                     throw new InvalidRequestException(String.format(
                             "Partitioning column \"%s\" cannot be restricted because the preceding column (\"%s\") is " +
-                            "either not restricted or is restricted by a non-EQ relation", cname, previous));
+                            "either not restricted or is restricted by a non-EQ relation", cdef.name, previous));
                 }
                 else if (restriction.isOnToken())
                 {
@@ -1801,7 +1812,7 @@
                     {
                         // We only support IN for the last name so far
                         if (i != stmt.keyRestrictions.length - 1)
-                            throw new InvalidRequestException(String.format("Partition KEY part %s cannot be restricted by IN relation (only the last part of the partition key can)", cname));
+                            throw new InvalidRequestException(String.format("Partition KEY part %s cannot be restricted by IN relation (only the last part of the partition key can)", cdef.name));
                         stmt.keyIsInRelation = true;
                     }
                 }
@@ -1813,43 +1824,43 @@
                     // index with filtering, we'll need to handle it though.
                     throw new InvalidRequestException("Only EQ and IN relations are supported on the partition key (unless you use the token() function)");
                 }
-                previous = cname;
+                previous = cdef;
             }
 
             if (stmt.onToken)
-                checkTokenFunctionArgumentsOrder(cfDef);
+                checkTokenFunctionArgumentsOrder(cfm);
         }
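
processPartitionKeyRestrictions encodes the partition key rules spelled out in the comments above: every component before a restricted one must use EQ, only the last component may use IN, and anything else falls back to token() ranges or a secondary index. For a composite partition key (hypothetical names):

    class PartitionKeyRestrictionExamples
    {
        // Hypothetical table:
        //   CREATE TABLE ks.t (pk1 int, pk2 int, c int, v int, PRIMARY KEY ((pk1, pk2), c));

        // Accepted: every component restricted, IN only on the last one.
        static final String OK      = "SELECT * FROM ks.t WHERE pk1 = 0 AND pk2 IN (1, 2)";
        // Rejected (without an index): pk2 cannot be restricted while the preceding pk1 is not.
        static final String MISSING = "SELECT * FROM ks.t WHERE pk2 = 1";
        // Rejected: IN is only supported on the last partition key component.
        static final String BAD_IN  = "SELECT * FROM ks.t WHERE pk1 IN (0, 1) AND pk2 = 2";
        // Accepted: token() turns the restriction into a range over the partitioner's tokens.
        static final String TOKEN   = "SELECT * FROM ks.t WHERE token(pk1, pk2) > token(0, 0)";
    }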
 
         /**
          * Checks that the column identifiers used as argument for the token function have been specified in the
          * partition key order.
-         * @param cfDef the Column Family Definition
+         * @param cfm the Column Family MetaData
          * @throws InvalidRequestException if the arguments have not been provided in the proper order.
          */
-        private void checkTokenFunctionArgumentsOrder(CFDefinition cfDef) throws InvalidRequestException
+        private void checkTokenFunctionArgumentsOrder(CFMetaData cfm) throws InvalidRequestException
         {
-            Iterator<Name> iter = Iterators.cycle(cfDef.partitionKeys());
+            Iterator<ColumnDefinition> iter = Iterators.cycle(cfm.partitionKeyColumns());
             for (Relation relation : whereClause)
             {
                 SingleColumnRelation singleColumnRelation = (SingleColumnRelation) relation;
-                if (singleColumnRelation.onToken && !cfDef.get(singleColumnRelation.getEntity()).equals(iter.next()))
+                if (singleColumnRelation.onToken && !cfm.getColumnDefinition(singleColumnRelation.getEntity()).equals(iter.next()))
                     throw new InvalidRequestException(String.format("The token function arguments must be in the partition key order: %s",
-                                                                    Joiner.on(',').join(cfDef.partitionKeys())));
+                                                                    Joiner.on(',').join(cfm.partitionKeyColumns())));
             }
         }
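
checkTokenFunctionArgumentsOrder only verifies that the columns named inside token() follow the declared partition key order; using the same hypothetical table as above:

    class TokenOrderExamples
    {
        // Accepted: token() lists the partition key columns in declared order (pk1, pk2).
        static final String OK  = "SELECT * FROM ks.t WHERE token(pk1, pk2) > token(0, 0)";
        // Rejected: the arguments do not follow the partition key order.
        static final String BAD = "SELECT * FROM ks.t WHERE token(pk2, pk1) > token(0, 0)";
    }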
 
-        private void processColumnRestrictions(SelectStatement stmt, CFDefinition cfDef, boolean hasQueriableIndex) throws InvalidRequestException
+        private void processColumnRestrictions(SelectStatement stmt, boolean hasQueriableIndex, CFMetaData cfm) throws InvalidRequestException
         {
             // If a clustering key column is restricted by a non-EQ relation, all preceding
             // columns must have a EQ, and all following must have no restriction. Unless
             // the column is indexed that is.
             boolean canRestrictFurtherComponents = true;
-            CFDefinition.Name previous = null;
+            ColumnDefinition previous = null;
             boolean previousIsSlice = false;
-            Iterator<CFDefinition.Name> iter = cfDef.clusteringColumns().iterator();
+            Iterator<ColumnDefinition> iter = cfm.clusteringColumns().iterator();
             for (int i = 0; i < stmt.columnRestrictions.length; i++)
             {
-                CFDefinition.Name cname = iter.next();
+                ColumnDefinition cdef = iter.next();
                 Restriction restriction = stmt.columnRestrictions[i];
 
                 if (restriction == null)
@@ -1872,28 +1883,28 @@
                             break;
                         }
                         throw new InvalidRequestException(String.format(
-                                "PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is either not restricted or by a non-EQ relation)", cname, previous));
+                                "PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is either not restricted or by a non-EQ relation)", cdef.name, previous));
                     }
                 }
                 else if (restriction.isSlice())
                 {
-                    previousIsSlice = true;
                     canRestrictFurtherComponents = false;
+                    previousIsSlice = true;
                     Restriction.Slice slice = (Restriction.Slice)restriction;
                     // For non-composite slices, we don't support internally the difference between exclusive and
                     // inclusive bounds, so we deal with it manually.
-                    if (!cfDef.isComposite && (!slice.isInclusive(Bound.START) || !slice.isInclusive(Bound.END)))
+                    if (!cfm.comparator.isCompound() && (!slice.isInclusive(Bound.START) || !slice.isInclusive(Bound.END)))
                         stmt.sliceRestriction = slice;
                 }
                 else if (restriction.isIN())
                 {
                     if (!restriction.isMultiColumn() && i != stmt.columnRestrictions.length - 1)
-                        throw new InvalidRequestException(String.format("Clustering column \"%s\" cannot be restricted by an IN relation", cname));
-                    if (stmt.selectACollection())
-                        throw new InvalidRequestException(String.format("Cannot restrict column \"%s\" by IN relation as a collection is selected by the query", cname));
+                        throw new InvalidRequestException(String.format("Clustering column \"%s\" cannot be restricted by an IN relation", cdef.name));
+                    else if (stmt.selectACollection())
+                        throw new InvalidRequestException(String.format("Cannot restrict column \"%s\" by IN relation as a collection is selected by the query", cdef.name));
                 }
 
-                previous = cname;
+                previous = cdef;
             }
         }
 
@@ -1905,7 +1916,7 @@
             // the static parts. But 1) we don't have an easy way to do that with 2i and 2) since we don't support index on static columns
             // so far, 2i means that you've restricted a non static column, so the query is somewhat non-sensical.
             if (stmt.selectsOnlyStaticColumns)
-                throw new InvalidRequestException("Queries using 2ndary indexes don't support selecting only static columns");
+                throw new InvalidRequestException("Queries using 2ndary indexes don't support selecting only static columns");
         }
 
         private void verifyOrderingIsAllowed(SelectStatement stmt) throws InvalidRequestException
@@ -1925,74 +1936,50 @@
                 throw new InvalidRequestException(String.format("Order by on unknown column %s", column));
         }
 
-        private void processOrderingClause(SelectStatement stmt, CFDefinition cfDef) throws InvalidRequestException
+        private void processOrderingClause(SelectStatement stmt, CFMetaData cfm) throws InvalidRequestException
         {
             verifyOrderingIsAllowed(stmt);
 
-            // If we order an IN query, we'll have to do a manual sort post-query. Currently, this sorting requires that we
-            // have queried the column on which we sort (TODO: we should update it to add the column on which we sort to the one
-            // queried automatically, and then removing it from the resultSet afterwards if needed)
+            // If we order post-query (see orderResults), the sorted column needs to be in the ResultSet for sorting, even if we don't
+            // ultimately ship them to the client (CASSANDRA-4911).
             if (stmt.keyIsInRelation)
             {
-                stmt.orderingIndexes = new HashMap<CFDefinition.Name, Integer>();
+                stmt.orderingIndexes = new HashMap<>();
                 for (ColumnIdentifier column : stmt.parameters.orderings.keySet())
                 {
-                    final CFDefinition.Name name = cfDef.get(column);
-                    if (name == null)
+                    final ColumnDefinition def = cfm.getColumnDefinition(column);
+                    if (def == null)
                         handleUnrecognizedOrderingColumn(column);
 
-                    if (selectClause.isEmpty()) // wildcard
-                    {
-                        stmt.orderingIndexes.put(name, Iterables.indexOf(cfDef, new Predicate<CFDefinition.Name>()
-                        {
-                            public boolean apply(CFDefinition.Name n)
-                            {
-                                return name.equals(n);
-                            }
-                        }));
-                    }
-                    else
-                    {
-                        boolean hasColumn = false;
-                        for (int i = 0; i < selectClause.size(); i++)
-                        {
-                            RawSelector selector = selectClause.get(i);
-                            if (name.name.equals(selector.selectable))
-                            {
-                                stmt.orderingIndexes.put(name, i);
-                                hasColumn = true;
-                                break;
-                            }
-                        }
-
-                        if (!hasColumn)
-                            throw new InvalidRequestException("ORDER BY could not be used on columns missing in select clause.");
-                    }
+                    int index = indexOf(def, stmt.selection);
+                    if (index < 0)
+                        index = stmt.selection.addColumnForOrdering(def);
+                    stmt.orderingIndexes.put(def.name, index);
                 }
             }
-            stmt.isReversed = isReversed(stmt, cfDef);
+            stmt.isReversed = isReversed(stmt, cfm);
         }
 
-        private boolean isReversed(SelectStatement stmt, CFDefinition cfDef) throws InvalidRequestException
+        private boolean isReversed(SelectStatement stmt, CFMetaData cfm) throws InvalidRequestException
         {
-            Boolean[] reversedMap = new Boolean[cfDef.clusteringColumnsCount()];
+            Boolean[] reversedMap = new Boolean[cfm.clusteringColumns().size()];
             int i = 0;
             for (Map.Entry<ColumnIdentifier, Boolean> entry : stmt.parameters.orderings.entrySet())
             {
                 ColumnIdentifier column = entry.getKey();
                 boolean reversed = entry.getValue();
 
-                CFDefinition.Name name = cfDef.get(column);
-                if (name == null)
+                ColumnDefinition def = cfm.getColumnDefinition(column);
+                if (def == null)
                     handleUnrecognizedOrderingColumn(column);
 
-                if (name.kind != CFDefinition.Name.Kind.COLUMN_ALIAS)
+                if (def.kind != ColumnDefinition.Kind.CLUSTERING_COLUMN)
                     throw new InvalidRequestException(String.format("Order by is currently only supported on the clustered columns of the PRIMARY KEY, got %s", column));
 
-                if (i++ != name.position)
+                if (i++ != def.position())
                    throw new InvalidRequestException(String.format("Order by currently only supports the ordering of columns following their declared order in the PRIMARY KEY"));
 
-                reversedMap[name.position] = (reversed != isReversedType(name));
+                reversedMap[def.position()] = (reversed != isReversedType(def));
             }
 
             // Check that all booleans in reversedMap, if set, agree
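
processOrderingClause and isReversed together enforce that post-query ordering (needed when the partition key uses IN) can find the ordering columns in the result set, adding them to the internal selection when the client did not request them (CASSANDRA-4911), and that ORDER BY names clustering columns in their declared PRIMARY KEY order. Roughly, with hypothetical names:

    class OrderByExamples
    {
        // Hypothetical table:
        //   CREATE TABLE ks.t (pk int, c1 int, c2 int, v int, PRIMARY KEY (pk, c1, c2));

        // Accepted: clustering columns in declared order; because pk uses IN, rows are re-sorted after
        // the query, and c1/c2 are added to the internal selection if the client did not ask for them.
        static final String OK      = "SELECT v FROM ks.t WHERE pk IN (1, 2) ORDER BY c1 ASC, c2 ASC";
        // Rejected: ORDER BY is only supported on clustering columns.
        static final String REGULAR = "SELECT v FROM ks.t WHERE pk = 1 ORDER BY v ASC";
        // Rejected: c2 cannot be ordered on without c1; the declared clustering order must be followed.
        static final String SKIPPED = "SELECT v FROM ks.t WHERE pk = 1 ORDER BY c2 ASC";
    }
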
@@ -2024,7 +2011,7 @@
                 // We will potentially filter data if either:
                 //  - Have more than one IndexExpression
                 //  - Have no index expression and the column filter is not the identity
-                if (stmt.restrictedNames.size() > 1 || (stmt.restrictedNames.isEmpty() && !stmt.columnFilterIsIdentity()))
+                if (stmt.restrictedColumns.size() > 1 || (stmt.restrictedColumns.isEmpty() && !stmt.columnFilterIsIdentity()))
                     throw new InvalidRequestException("Cannot execute this query as it might involve data filtering and " +
                                                       "thus may have unpredictable performance. If you want to execute " +
                                                       "this query despite the performance unpredictability, use ALLOW FILTERING");
@@ -2042,21 +2029,37 @@
             // than answering with something that is wrong.
             if (stmt.sliceRestriction != null && stmt.isKeyRange && limit != null)
             {
-                SingleColumnRelation rel = findInclusiveClusteringRelationForCompact(stmt.cfDef);
+                SingleColumnRelation rel = findInclusiveClusteringRelationForCompact(stmt.cfm);
                 throw new InvalidRequestException(String.format("The query requests a restriction of rows with a strict bound (%s) over a range of partitions. "
                                                               + "This is not supported by the underlying storage engine for COMPACT tables if a LIMIT is provided. "
                                                               + "Please either make the condition non strict (%s) or remove the user LIMIT", rel, rel.withNonStrictOperator()));
             }
         }
 
-        private SingleColumnRelation findInclusiveClusteringRelationForCompact(CFDefinition cfDef)
+        private int indexOf(ColumnDefinition def, Selection selection)
+        {
+            return indexOf(def, selection.getColumns().iterator());
+        }
+
+        private int indexOf(final ColumnDefinition def, Iterator<ColumnDefinition> defs)
+        {
+            return Iterators.indexOf(defs, new Predicate<ColumnDefinition>()
+                                           {
+                                               public boolean apply(ColumnDefinition n)
+                                               {
+                                                   return def.name.equals(n.name);
+                                               }
+                                           });
+        }
+
+        private SingleColumnRelation findInclusiveClusteringRelationForCompact(CFMetaData cfm)
         {
             for (Relation r : whereClause)
             {
                 // We only call this when sliceRestriction != null, i.e. for compact tables with a non-composite comparator,
                 // so it can't be a MultiColumnRelation.
                 SingleColumnRelation rel = (SingleColumnRelation)r;
-                if (cfDef.get(rel.getEntity()).kind == CFDefinition.Name.Kind.COLUMN_ALIAS
+                if (cfm.getColumnDefinition(rel.getEntity()).kind == ColumnDefinition.Kind.CLUSTERING_COLUMN
                     && (rel.operator() == Relation.Type.GT || rel.operator() == Relation.Type.LT))
                     return rel;
             }
@@ -2081,6 +2084,23 @@
             return new ColumnSpecification(keyspace(), columnFamily(), new ColumnIdentifier("[limit]", true), Int32Type.instance);
         }
 
+        private static ColumnSpecification makeCollectionReceiver(ColumnSpecification collection, boolean isKey)
+        {
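+            // A note on intent: for a list or set column this returns the element spec, and for a map column
+            // the key or value spec depending on isKey; the result is the receiver a bind value must match
+            // (e.g. the term of a CONTAINS or CONTAINS KEY relation).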
+            assert collection.type.isCollection();
+            switch (((CollectionType)collection.type).kind)
+            {
+                case LIST:
+                    assert !isKey;
+                    return Lists.valueSpecOf(collection);
+                case SET:
+                    assert !isKey;
+                    return Sets.valueSpecOf(collection);
+                case MAP:
+                    return isKey ? Maps.keySpecOf(collection) : Maps.valueSpecOf(collection);
+            }
+            throw new AssertionError();
+        }
+
         @Override
         public String toString()
         {
@@ -2122,9 +2142,9 @@
     private static class SingleColumnComparator implements Comparator<List<ByteBuffer>>
     {
         private final int index;
-        private final AbstractType<?> comparator;
+        private final Comparator<ByteBuffer> comparator;
 
-        public SingleColumnComparator(int columnIndex, AbstractType<?> orderer)
+        public SingleColumnComparator(int columnIndex, Comparator<ByteBuffer> orderer)
         {
             index = columnIndex;
             comparator = orderer;
@@ -2141,10 +2161,10 @@
      */
     private static class CompositeComparator implements Comparator<List<ByteBuffer>>
     {
-        private final List<AbstractType<?>> orderTypes;
-        private final int[] positions;
+        private final List<Comparator<ByteBuffer>> orderTypes;
+        private final List<Integer> positions;
 
-        private CompositeComparator(List<AbstractType<?>> orderTypes, int[] positions)
+        private CompositeComparator(List<Comparator<ByteBuffer>> orderTypes, List<Integer> positions)
         {
             this.orderTypes = orderTypes;
             this.positions = positions;
@@ -2152,10 +2172,10 @@
 
         public int compare(List<ByteBuffer> a, List<ByteBuffer> b)
         {
-            for (int i = 0; i < positions.length; i++)
+            for (int i = 0; i < positions.size(); i++)
             {
-                AbstractType<?> type = orderTypes.get(i);
-                int columnPos = positions[i];
+                Comparator<ByteBuffer> type = orderTypes.get(i);
+                int columnPos = positions.get(i);
 
                 ByteBuffer aValue = a.get(columnPos);
                 ByteBuffer bValue = b.get(columnPos);
diff --git a/src/java/org/apache/cassandra/cql3/statements/Selectable.java b/src/java/org/apache/cassandra/cql3/statements/Selectable.java
index 9f25542..448301c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/Selectable.java
+++ b/src/java/org/apache/cassandra/cql3/statements/Selectable.java
@@ -66,4 +66,22 @@
             return sb.append(")").toString();
         }
     }
+
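+    // A "column.field" selection on a user-defined type column, e.g. "SELECT addr.city FROM users"
+    // (the column and field names here are purely illustrative).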
+    public static class WithFieldSelection implements Selectable
+    {
+        public final Selectable selected;
+        public final ColumnIdentifier field;
+
+        public WithFieldSelection(Selectable selected, ColumnIdentifier field)
+        {
+            this.selected = selected;
+            this.field = field;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s.%s", selected, field);
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/Selection.java b/src/java/org/apache/cassandra/cql3/statements/Selection.java
index 37ab384..7893b4d 100644
--- a/src/java/org/apache/cassandra/cql3/statements/Selection.java
+++ b/src/java/org/apache/cassandra/cql3/statements/Selection.java
@@ -22,30 +22,36 @@
 import java.util.Collection;
 import java.util.List;
 
+import com.google.common.collect.Iterators;
+
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.functions.Functions;
-import org.apache.cassandra.db.CounterColumn;
-import org.apache.cassandra.db.ExpiringColumn;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.CounterCell;
+import org.apache.cassandra.db.ExpiringCell;
 import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public abstract class Selection
 {
-    private final Collection<CFDefinition.Name> columns;
-    private final List<ColumnSpecification> metadata;
+    private final Collection<ColumnDefinition> columns;
+    private final ResultSet.Metadata metadata;
     private final boolean collectTimestamps;
     private final boolean collectTTLs;
 
-    protected Selection(Collection<CFDefinition.Name> columns, List<ColumnSpecification> metadata, boolean collectTimestamps, boolean collectTTLs)
+    protected Selection(Collection<ColumnDefinition> columns, List<ColumnSpecification> metadata, boolean collectTimestamps, boolean collectTTLs)
     {
         this.columns = columns;
-        this.metadata = metadata;
+        this.metadata = new ResultSet.Metadata(metadata);
         this.collectTimestamps = collectTimestamps;
         this.collectTTLs = collectTTLs;
     }
@@ -58,22 +64,28 @@
 
     public ResultSet.Metadata getResultMetadata()
     {
-        return new ResultSet.Metadata(metadata);
+        return metadata;
     }
 
-    public static Selection wildcard(CFDefinition cfDef)
+    public static Selection wildcard(CFMetaData cfm)
     {
-        List<CFDefinition.Name> all = new ArrayList<CFDefinition.Name>();
-        for (CFDefinition.Name name : cfDef)
-            all.add(name);
+        List<ColumnDefinition> all = new ArrayList<ColumnDefinition>(cfm.allColumns().size());
+        Iterators.addAll(all, cfm.allColumnsInSelectOrder());
         return new SimpleSelection(all, true);
     }
 
-    public static Selection forColumns(Collection<CFDefinition.Name> columns)
+    public static Selection forColumns(Collection<ColumnDefinition> columns)
     {
         return new SimpleSelection(columns, false);
     }
 
+    public int addColumnForOrdering(ColumnDefinition c)
+    {
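+        // Called when the query orders by a column that was not selected: the column is added so its value
+        // is available for post-query sorting, but it is marked non-serialized so it is never returned to
+        // the client. Returns the index of the added column within the selection.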
+        columns.add(c);
+        metadata.addNonSerializedColumn(c);
+        return columns.size() - 1;
+    }
+
     private static boolean isUsingFunction(List<RawSelector> rawSelectors)
     {
         for (RawSelector rawSelector : rawSelectors)
@@ -84,70 +96,98 @@
         return false;
     }
 
-    private static int addAndGetIndex(CFDefinition.Name name, List<CFDefinition.Name> l)
+    private static int addAndGetIndex(ColumnDefinition def, List<ColumnDefinition> l)
     {
-        int idx = l.indexOf(name);
+        int idx = l.indexOf(def);
         if (idx < 0)
         {
             idx = l.size();
-            l.add(name);
+            l.add(def);
         }
         return idx;
     }
 
-    private static Selector makeSelector(CFDefinition cfDef, RawSelector raw, List<CFDefinition.Name> names, List<ColumnSpecification> metadata) throws InvalidRequestException
+    private static Selector makeSelector(CFMetaData cfm, RawSelector raw, List<ColumnDefinition> defs, List<ColumnSpecification> metadata) throws InvalidRequestException
     {
         if (raw.selectable instanceof ColumnIdentifier)
         {
-            CFDefinition.Name name = cfDef.get((ColumnIdentifier)raw.selectable);
-            if (name == null)
+            ColumnDefinition def = cfm.getColumnDefinition((ColumnIdentifier)raw.selectable);
+            if (def == null)
                 throw new InvalidRequestException(String.format("Undefined name %s in selection clause", raw.selectable));
             if (metadata != null)
-                metadata.add(raw.alias == null ? name : makeAliasSpec(cfDef, name.type, raw.alias));
-            return new SimpleSelector(name.toString(), addAndGetIndex(name, names), name.type);
+                metadata.add(raw.alias == null ? def : makeAliasSpec(cfm, def.type, raw.alias));
+            return new SimpleSelector(def.name.toString(), addAndGetIndex(def, defs), def.type);
         }
         else if (raw.selectable instanceof Selectable.WritetimeOrTTL)
         {
             Selectable.WritetimeOrTTL tot = (Selectable.WritetimeOrTTL)raw.selectable;
-            CFDefinition.Name name = cfDef.get(tot.id);
-            if (name == null)
+            ColumnDefinition def = cfm.getColumnDefinition(tot.id);
+            if (def == null)
                 throw new InvalidRequestException(String.format("Undefined name %s in selection clause", tot.id));
-            if (name.isPrimaryKeyColumn())
-                throw new InvalidRequestException(String.format("Cannot use selection function %s on PRIMARY KEY part %s", tot.isWritetime ? "writeTime" : "ttl", name));
-            if (name.type.isCollection())
+            if (def.isPrimaryKeyColumn())
+                throw new InvalidRequestException(String.format("Cannot use selection function %s on PRIMARY KEY part %s", tot.isWritetime ? "writeTime" : "ttl", def.name));
+            if (def.type.isCollection())
                 throw new InvalidRequestException(String.format("Cannot use selection function %s on collections", tot.isWritetime ? "writeTime" : "ttl"));
 
             if (metadata != null)
-                metadata.add(makeWritetimeOrTTLSpec(cfDef, tot, raw.alias));
-            return new WritetimeOrTTLSelector(name.toString(), addAndGetIndex(name, names), tot.isWritetime);
+                metadata.add(makeWritetimeOrTTLSpec(cfm, tot, raw.alias));
+            return new WritetimeOrTTLSelector(def.name.toString(), addAndGetIndex(def, defs), tot.isWritetime);
+        }
+        else if (raw.selectable instanceof Selectable.WithFieldSelection)
+        {
+            Selectable.WithFieldSelection withField = (Selectable.WithFieldSelection)raw.selectable;
+            Selector selected = makeSelector(cfm, new RawSelector(withField.selected, null), defs, null);
+            AbstractType<?> type = selected.getType();
+            if (!(type instanceof UserType))
+                throw new InvalidRequestException(String.format("Invalid field selection: %s of type %s is not a user type", withField.selected, type.asCQL3Type()));
+
+            UserType ut = (UserType)type;
+            for (int i = 0; i < ut.size(); i++)
+            {
+                if (!ut.fieldName(i).equals(withField.field.bytes))
+                    continue;
+
+                if (metadata != null)
+                    metadata.add(makeFieldSelectSpec(cfm, withField, ut.fieldType(i), raw.alias));
+                return new FieldSelector(ut, i, selected);
+            }
+            throw new InvalidRequestException(String.format("%s of type %s has no field %s", withField.selected, type.asCQL3Type(), withField.field));
         }
         else
         {
             Selectable.WithFunction withFun = (Selectable.WithFunction)raw.selectable;
             List<Selector> args = new ArrayList<Selector>(withFun.args.size());
             for (Selectable rawArg : withFun.args)
-                args.add(makeSelector(cfDef, new RawSelector(rawArg, null), names, null));
+                args.add(makeSelector(cfm, new RawSelector(rawArg, null), defs, null));
 
-            AbstractType<?> returnType = Functions.getReturnType(withFun.functionName, cfDef.cfm.ksName, cfDef.cfm.cfName);
+            AbstractType<?> returnType = Functions.getReturnType(withFun.functionName, cfm.ksName, cfm.cfName);
             if (returnType == null)
                 throw new InvalidRequestException(String.format("Unknown function '%s'", withFun.functionName));
-            ColumnSpecification spec = makeFunctionSpec(cfDef, withFun, returnType, raw.alias);
-            Function fun = Functions.get(withFun.functionName, args, spec);
+            ColumnSpecification spec = makeFunctionSpec(cfm, withFun, returnType, raw.alias);
+            Function fun = Functions.get(cfm.ksName, withFun.functionName, args, spec);
             if (metadata != null)
                 metadata.add(spec);
             return new FunctionSelector(fun, args);
         }
     }
 
-    private static ColumnSpecification makeWritetimeOrTTLSpec(CFDefinition cfDef, Selectable.WritetimeOrTTL tot, ColumnIdentifier alias)
+    private static ColumnSpecification makeWritetimeOrTTLSpec(CFMetaData cfm, Selectable.WritetimeOrTTL tot, ColumnIdentifier alias)
     {
-        return new ColumnSpecification(cfDef.cfm.ksName,
-                                       cfDef.cfm.cfName,
+        return new ColumnSpecification(cfm.ksName,
+                                       cfm.cfName,
                                        alias == null ? new ColumnIdentifier(tot.toString(), true) : alias,
                                        tot.isWritetime ? LongType.instance : Int32Type.instance);
     }
 
-    private static ColumnSpecification makeFunctionSpec(CFDefinition cfDef,
+    private static ColumnSpecification makeFieldSelectSpec(CFMetaData cfm, Selectable.WithFieldSelection s, AbstractType<?> type, ColumnIdentifier alias)
+    {
+        return new ColumnSpecification(cfm.ksName,
+                                       cfm.cfName,
+                                       alias == null ? new ColumnIdentifier(s.toString(), true) : alias,
+                                       type);
+    }
+
+    private static ColumnSpecification makeFunctionSpec(CFMetaData cfm,
                                                         Selectable.WithFunction fun,
                                                         AbstractType<?> returnType,
                                                         ColumnIdentifier alias) throws InvalidRequestException
@@ -155,31 +195,31 @@
         if (returnType == null)
             throw new InvalidRequestException(String.format("Unknown function %s called in selection clause", fun.functionName));
 
-        return new ColumnSpecification(cfDef.cfm.ksName,
-                                       cfDef.cfm.cfName,
+        return new ColumnSpecification(cfm.ksName,
+                                       cfm.cfName,
                                        alias == null ? new ColumnIdentifier(fun.toString(), true) : alias,
                                        returnType);
     }
 
-    private static ColumnSpecification makeAliasSpec(CFDefinition cfDef, AbstractType<?> type, ColumnIdentifier alias)
+    private static ColumnSpecification makeAliasSpec(CFMetaData cfm, AbstractType<?> type, ColumnIdentifier alias)
     {
-        return new ColumnSpecification(cfDef.cfm.ksName, cfDef.cfm.cfName, alias, type);
+        return new ColumnSpecification(cfm.ksName, cfm.cfName, alias, type);
     }
 
-    public static Selection fromSelectors(CFDefinition cfDef, List<RawSelector> rawSelectors) throws InvalidRequestException
+    public static Selection fromSelectors(CFMetaData cfm, List<RawSelector> rawSelectors) throws InvalidRequestException
     {
         boolean usesFunction = isUsingFunction(rawSelectors);
 
         if (usesFunction)
         {
-            List<CFDefinition.Name> names = new ArrayList<CFDefinition.Name>();
+            List<ColumnDefinition> defs = new ArrayList<ColumnDefinition>();
             List<ColumnSpecification> metadata = new ArrayList<ColumnSpecification>(rawSelectors.size());
             List<Selector> selectors = new ArrayList<Selector>(rawSelectors.size());
             boolean collectTimestamps = false;
             boolean collectTTLs = false;
             for (RawSelector rawSelector : rawSelectors)
             {
-                Selector selector = makeSelector(cfDef, rawSelector, names, metadata);
+                Selector selector = makeSelector(cfm, rawSelector, defs, metadata);
                 selectors.add(selector);
                 if (selector instanceof WritetimeOrTTLSelector)
                 {
@@ -187,45 +227,31 @@
                     collectTTLs |= !((WritetimeOrTTLSelector)selector).isWritetime;
                 }
             }
-            return new SelectionWithFunctions(names, metadata, selectors, collectTimestamps, collectTTLs);
+            return new SelectionWithFunctions(defs, metadata, selectors, collectTimestamps, collectTTLs);
         }
         else
         {
-            List<CFDefinition.Name> names = new ArrayList<CFDefinition.Name>(rawSelectors.size());
+            List<ColumnDefinition> defs = new ArrayList<ColumnDefinition>(rawSelectors.size());
             List<ColumnSpecification> metadata = new ArrayList<ColumnSpecification>(rawSelectors.size());
             for (RawSelector rawSelector : rawSelectors)
             {
                 assert rawSelector.selectable instanceof ColumnIdentifier;
-                CFDefinition.Name name = cfDef.get((ColumnIdentifier)rawSelector.selectable);
-                if (name == null)
+                ColumnDefinition def = cfm.getColumnDefinition((ColumnIdentifier)rawSelector.selectable);
+                if (def == null)
                     throw new InvalidRequestException(String.format("Undefined name %s in selection clause", rawSelector.selectable));
-                names.add(name);
-                metadata.add(rawSelector.alias == null ? name : makeAliasSpec(cfDef, name.type, rawSelector.alias));
+                defs.add(def);
+                metadata.add(rawSelector.alias == null ? def : makeAliasSpec(cfm, def.type, rawSelector.alias));
             }
-            return new SimpleSelection(names, metadata, false);
+            return new SimpleSelection(defs, metadata, false);
         }
     }
 
     protected abstract List<ByteBuffer> handleRow(ResultSetBuilder rs) throws InvalidRequestException;
 
     /**
-     * @return the list of CQL3 "regular" (the "COLUMN_METADATA" ones) column names to fetch.
-     */
-    public List<ColumnIdentifier> regularAndStaticColumnsToFetch()
-    {
-        List<ColumnIdentifier> toFetch = new ArrayList<ColumnIdentifier>();
-        for (CFDefinition.Name name : columns)
-        {
-            if (name.kind == CFDefinition.Name.Kind.COLUMN_METADATA || name.kind == CFDefinition.Name.Kind.STATIC)
-                toFetch.add(name.name);
-        }
-        return toFetch;
-    }
-
-    /**
      * @return the list of CQL3 columns value this SelectionClause needs.
      */
-    public Collection<CFDefinition.Name> getColumns()
+    public Collection<ColumnDefinition> getColumns()
     {
         return columns;
     }
@@ -235,9 +261,9 @@
         return new ResultSetBuilder(now);
     }
 
-    private static ByteBuffer value(Column c)
+    private static ByteBuffer value(Cell c)
     {
-        return (c instanceof CounterColumn)
+        return (c instanceof CounterCell)
             ? ByteBufferUtil.bytes(CounterContext.instance().total(c.value()))
             : c.value();
     }
@@ -261,7 +287,7 @@
 
         private ResultSetBuilder(long now)
         {
-            this.resultSet = new ResultSet(metadata);
+            this.resultSet = new ResultSet(getResultMetadata().copy(), new ArrayList<List<ByteBuffer>>());
             this.timestamps = collectTimestamps ? new long[columns.size()] : null;
             this.ttls = collectTTLs ? new int[columns.size()] : null;
             this.now = now;
@@ -272,7 +298,7 @@
             current.add(v);
         }
 
-        public void add(Column c)
+        public void add(Cell c)
         {
             current.add(isDead(c) ? null : value(c));
             if (timestamps != null)
@@ -282,15 +308,15 @@
             if (ttls != null)
             {
                 int ttl = -1;
-                if (!isDead(c) && c instanceof ExpiringColumn)
+                if (!isDead(c) && c instanceof ExpiringCell)
                     ttl = c.getLocalDeletionTime() - (int) (now / 1000);
                 ttls[current.size() - 1] = ttl;
             }
         }
 
-        private boolean isDead(Column c)
+        private boolean isDead(Cell c)
         {
-            return c == null || c.isMarkedForDelete(now);
+            return c == null || !c.isLive(now);
         }
 
         public void newRow() throws InvalidRequestException
@@ -316,12 +342,12 @@
     {
         private final boolean isWildcard;
 
-        public SimpleSelection(Collection<CFDefinition.Name> columns, boolean isWildcard)
+        public SimpleSelection(Collection<ColumnDefinition> columns, boolean isWildcard)
         {
             this(columns, new ArrayList<ColumnSpecification>(columns), isWildcard);
         }
 
-        public SimpleSelection(Collection<CFDefinition.Name> columns, List<ColumnSpecification> metadata, boolean isWildcard)
+        public SimpleSelection(Collection<ColumnDefinition> columns, List<ColumnSpecification> metadata, boolean isWildcard)
         {
             /*
              * In theory, even a simple selection could include the same column multiple times, so we
@@ -344,12 +370,18 @@
         }
     }
 
-    private interface Selector extends AssignementTestable
+    private static abstract class Selector implements AssignementTestable
     {
-        public ByteBuffer compute(ResultSetBuilder rs) throws InvalidRequestException;
+        public abstract ByteBuffer compute(ResultSetBuilder rs) throws InvalidRequestException;
+        public abstract AbstractType<?> getType();
+
+        public boolean isAssignableTo(String keyspace, ColumnSpecification receiver)
+        {
+            return receiver.type.isValueCompatibleWith(getType());
+        }
     }
 
-    private static class SimpleSelector implements Selector
+    private static class SimpleSelector extends Selector
     {
         private final String columnName;
         private final int idx;
@@ -367,9 +399,9 @@
             return rs.current.get(idx);
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public AbstractType<?> getType()
         {
-            return receiver.type.isValueCompatibleWith(type);
+            return type;
         }
 
         @Override
@@ -379,7 +411,7 @@
         }
     }
 
-    private static class FunctionSelector implements Selector
+    private static class FunctionSelector extends Selector
     {
         private final Function fun;
         private final List<Selector> argSelectors;
@@ -399,9 +431,9 @@
             return fun.execute(args);
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public AbstractType<?> getType()
         {
-            return receiver.type.isValueCompatibleWith(fun.returnType());
+            return fun.returnType();
         }
 
         @Override
@@ -419,7 +451,41 @@
         }
     }
 
-    private static class WritetimeOrTTLSelector implements Selector
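+    // Selects a single field out of a user-defined type value: the UDT cell is split into its serialized
+    // components and the component at the field's index is returned (or null if the stored value has fewer
+    // components).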
+    private static class FieldSelector extends Selector
+    {
+        private final UserType type;
+        private final int field;
+        private final Selector selected;
+
+        public FieldSelector(UserType type, int field, Selector selected)
+        {
+            this.type = type;
+            this.field = field;
+            this.selected = selected;
+        }
+
+        public ByteBuffer compute(ResultSetBuilder rs) throws InvalidRequestException
+        {
+            ByteBuffer value = selected.compute(rs);
+            if (value == null)
+                return null;
+            ByteBuffer[] buffers = type.split(value);
+            return field < buffers.length ? buffers[field] : null;
+        }
+
+        public AbstractType<?> getType()
+        {
+            return type.fieldType(field);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s.%s", selected, UTF8Type.instance.getString(type.fieldName(field)));
+        }
+    }
+
+    private static class WritetimeOrTTLSelector extends Selector
     {
         private final String columnName;
         private final int idx;
@@ -444,9 +510,9 @@
             return ttl > 0 ? ByteBufferUtil.bytes(ttl) : null;
         }
 
-        public boolean isAssignableTo(ColumnSpecification receiver)
+        public AbstractType<?> getType()
         {
-            return receiver.type.isValueCompatibleWith(isWritetime ? LongType.instance : Int32Type.instance);
+            return isWritetime ? LongType.instance : Int32Type.instance;
         }
 
         @Override
@@ -460,7 +526,7 @@
     {
         private final List<Selector> selectors;
 
-        public SelectionWithFunctions(Collection<CFDefinition.Name> columns, List<ColumnSpecification> metadata, List<Selector> selectors, boolean collectTimestamps, boolean collectTTLs)
+        public SelectionWithFunctions(Collection<ColumnDefinition> columns, List<ColumnSpecification> metadata, List<Selector> selectors, boolean collectTimestamps, boolean collectTTLs)
         {
             super(columns, metadata, collectTimestamps, collectTTLs);
             this.selectors = selectors;
diff --git a/src/java/org/apache/cassandra/cql3/statements/SingleColumnRestriction.java b/src/java/org/apache/cassandra/cql3/statements/SingleColumnRestriction.java
index 2e63272..bc77357 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SingleColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SingleColumnRestriction.java
@@ -18,8 +18,8 @@
 package org.apache.cassandra.cql3.statements;
 
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.IndexExpression;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.thrift.IndexOperator;
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
@@ -44,9 +44,9 @@
             this.onToken = onToken;
         }
 
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
-            return Collections.singletonList(value.bindAndGet(variables));
+            return Collections.singletonList(value.bindAndGet(options));
         }
 
         public boolean isSlice()
@@ -64,6 +64,11 @@
             return false;
         }
 
+        public boolean isContains()
+        {
+            return false;
+        }
+
         public boolean isOnToken()
         {
             return onToken;
@@ -85,11 +90,11 @@
             this.values = values;
         }
 
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
             List<ByteBuffer> buffers = new ArrayList<>(values.size());
             for (Term value : values)
-                buffers.add(value.bindAndGet(variables));
+                buffers.add(value.bindAndGet(options));
             return buffers;
         }
 
@@ -113,6 +118,11 @@
             return true;
         }
 
+        public boolean isContains()
+        {
+            return false;
+        }
+
         public boolean isOnToken()
         {
             return false;
@@ -134,9 +144,9 @@
             this.marker = marker;
         }
 
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
-            Term.MultiItemTerminal lval = (Term.MultiItemTerminal)marker.bind(variables);
+            Term.MultiItemTerminal lval = (Term.MultiItemTerminal)marker.bind(options);
             if (lval == null)
                 throw new InvalidRequestException("Invalid null value for IN restriction");
             return lval.getElements();
@@ -162,6 +172,11 @@
             return true;
         }
 
+        public boolean isContains()
+        {
+            return false;
+        }
+
         public boolean isOnToken()
         {
             return false;
@@ -202,7 +217,12 @@
             return false;
         }
 
-        public List<ByteBuffer> values(List<ByteBuffer> variables) throws InvalidRequestException
+        public boolean isContains()
+        {
+            return false;
+        }
+
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
             throw new UnsupportedOperationException();
         }
@@ -218,9 +238,9 @@
             return bounds[b.idx] != null;
         }
 
-        public ByteBuffer bound(Bound b, List<ByteBuffer> variables) throws InvalidRequestException
+        public ByteBuffer bound(Bound b, QueryOptions options) throws InvalidRequestException
         {
-            return bounds[b.idx].bindAndGet(variables);
+            return bounds[b.idx].bindAndGet(options);
         }
 
         /** Returns true if the start or end bound (depending on the argument) is inclusive, false otherwise */
@@ -241,19 +261,19 @@
             throw new AssertionError();
         }
 
-        public IndexOperator getIndexOperator(Bound b)
+        public IndexExpression.Operator getIndexOperator(Bound b)
         {
             switch (b)
             {
                 case START:
-                    return boundInclusive[b.idx] ? IndexOperator.GTE : IndexOperator.GT;
+                    return boundInclusive[b.idx] ? IndexExpression.Operator.GTE : IndexExpression.Operator.GT;
                 case END:
-                    return boundInclusive[b.idx] ? IndexOperator.LTE : IndexOperator.LT;
+                    return boundInclusive[b.idx] ? IndexExpression.Operator.LTE : IndexExpression.Operator.LT;
             }
             throw new AssertionError();
         }
 
-        public void setBound(Relation.Type type, Term t) throws InvalidRequestException
+        public void setBound(ColumnIdentifier name, Relation.Type type, Term t) throws InvalidRequestException
         {
             Bound b;
             boolean inclusive;
@@ -281,7 +301,7 @@
 
             if (bounds[b.idx] != null)
                 throw new InvalidRequestException(String.format(
-                        "More than one restriction was found for the %s bound", b.name().toLowerCase()));
+                        "More than one restriction was found for the %s bound on %s", b.name().toLowerCase(), name));
 
             bounds[b.idx] = t;
             boundInclusive[b.idx] = inclusive;
@@ -297,4 +317,97 @@
                                  onToken ? "*" : "");
         }
     }
+
+    // This holds both CONTAINS and CONTAINS_KEY restrictions because we might want to have both of them.
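+    // For example, "WHERE m CONTAINS 'v' AND m CONTAINS KEY 'k'" on an indexed map column m ends up as a
+    // single Contains restriction with one entry in 'values' and one entry in 'keys'.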
+    public static class Contains extends SingleColumnRestriction
+    {
+        private List<Term> values; // for CONTAINS
+        private List<Term> keys;   // for CONTAINS_KEY
+
+        public boolean hasContains()
+        {
+            return values != null;
+        }
+
+        public boolean hasContainsKey()
+        {
+            return keys != null;
+        }
+
+        public void add(Term t, boolean isKey)
+        {
+            if (isKey)
+                addKey(t);
+            else
+                addValue(t);
+        }
+
+        public void addValue(Term t)
+        {
+            if (values == null)
+                values = new ArrayList<>();
+            values.add(t);
+        }
+
+        public void addKey(Term t)
+        {
+            if (keys == null)
+                keys = new ArrayList<>();
+            keys.add(t);
+        }
+
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
+        {
+            if (values == null)
+                return Collections.emptyList();
+
+            List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(values.size());
+            for (Term value : values)
+                buffers.add(value.bindAndGet(options));
+            return buffers;
+        }
+
+        public List<ByteBuffer> keys(QueryOptions options) throws InvalidRequestException
+        {
+            if (keys == null)
+                return Collections.emptyList();
+
+            List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(keys.size());
+            for (Term value : keys)
+                buffers.add(value.bindAndGet(options));
+            return buffers;
+        }
+
+        public boolean isSlice()
+        {
+            return false;
+        }
+
+        public boolean isEQ()
+        {
+            return false;
+        }
+
+        public boolean isIN()
+        {
+            return false;
+        }
+
+        public boolean isContains()
+        {
+            return true;
+        }
+
+        public boolean isOnToken()
+        {
+            return false;
+        }
+
+
+        @Override
+        public String toString()
+        {
+            return String.format("CONTAINS(values=%s, keys=%s)", values, keys);
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
index 8453a76..8eeab71 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
@@ -22,7 +22,9 @@
 
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
@@ -33,11 +35,11 @@
  */
 public class UpdateStatement extends ModificationStatement
 {
-    private static final Operation setToEmptyOperation = new Constants.Setter(null, new Constants.Value(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+    private static final Constants.Value EMPTY = new Constants.Value(ByteBufferUtil.EMPTY_BYTE_BUFFER);
 
-    private UpdateStatement(StatementType type, CFMetaData cfm, Attributes attrs)
+    private UpdateStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
     {
-        super(type, cfm, attrs);
+        super(type, boundTerms, cfm, attrs);
     }
 
     public boolean requireFullClusteringKey()
@@ -45,11 +47,9 @@
         return true;
     }
 
-    public void addUpdateForKey(ColumnFamily cf, ByteBuffer key, ColumnNameBuilder builder, UpdateParameters params)
+    public void addUpdateForKey(ColumnFamily cf, ByteBuffer key, Composite prefix, UpdateParameters params)
     throws InvalidRequestException
     {
-        CFDefinition cfDef = cfm.getCfDef();
-
         // Inserting the CQL row marker (see #4361)
         // We always need to insert a marker for INSERT, because of the following situation:
         //   CREATE TABLE t ( k int PRIMARY KEY, c text );
@@ -63,40 +63,38 @@
         // clause which is unintuitive (#6782)
         //
         // We never insert markers for Super CF as this would confuse the thrift side.
-        if (type == StatementType.INSERT && cfDef.isComposite && !cfDef.isCompact && !cfm.isSuper())
-        {
-            ByteBuffer name = builder.copy().add(ByteBufferUtil.EMPTY_BYTE_BUFFER).build();
-            cf.addColumn(params.makeColumn(name, ByteBufferUtil.EMPTY_BYTE_BUFFER));
-        }
+        if (type == StatementType.INSERT && cfm.isCQL3Table() && !prefix.isStatic())
+            cf.addColumn(params.makeColumn(cfm.comparator.rowMarker(prefix), ByteBufferUtil.EMPTY_BYTE_BUFFER));
 
         List<Operation> updates = getOperations();
 
-        if (cfDef.isCompact)
+        if (cfm.comparator.isDense())
         {
-            if (builder.componentCount() == 0)
-                throw new InvalidRequestException(String.format("Missing PRIMARY KEY part %s", cfDef.clusteringColumns().iterator().next()));
+            if (prefix.isEmpty())
+                throw new InvalidRequestException(String.format("Missing PRIMARY KEY part %s", cfm.clusteringColumns().iterator().next()));
 
-            if (cfDef.compactValue() == null)
+            // An empty name for the compact value is what we use to recognize the case where there is no column
+            // outside the PK, see CreateStatement.
+            if (!cfm.compactValueColumn().name.bytes.hasRemaining())
             {
-                // compact + no compact value implies there is no column outside the PK. So no operation could
-                // have passed through validation
+                // There is no column outside the PK. So no operation could have passed through validation
                 assert updates.isEmpty();
-                setToEmptyOperation.execute(key, cf, builder.copy(), params);
+                new Constants.Setter(cfm.compactValueColumn(), EMPTY).execute(key, cf, prefix, params);
             }
             else
             {
-                // compact means we don't have a row marker, so don't accept to set only the PK. See CASSANDRA-5648.
+                // dense means we don't have a row marker, so we don't allow setting only the PK. See CASSANDRA-5648.
                 if (updates.isEmpty())
-                    throw new InvalidRequestException(String.format("Column %s is mandatory for this COMPACT STORAGE table", cfDef.compactValue()));
+                    throw new InvalidRequestException(String.format("Column %s is mandatory for this COMPACT STORAGE table", cfm.compactValueColumn().name));
 
                 for (Operation update : updates)
-                    update.execute(key, cf, builder.copy(), params);
+                    update.execute(key, cf, prefix, params);
             }
         }
         else
         {
             for (Operation update : updates)
-                update.execute(key, cf, builder.copy(), params);
+                update.execute(key, cf, prefix, params);
         }
     }
 
@@ -123,9 +121,9 @@
             this.columnValues = columnValues;
         }
 
-        protected ModificationStatement prepareInternal(CFDefinition cfDef, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
         {
-            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.INSERT, cfDef.cfm, attrs);
+            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.INSERT, boundNames.size(), cfm, attrs);
 
             // Created from an INSERT
             if (stmt.isCounter())
@@ -137,28 +135,26 @@
 
             for (int i = 0; i < columnNames.size(); i++)
             {
-                CFDefinition.Name name = cfDef.get(columnNames.get(i));
-                if (name == null)
+                ColumnDefinition def = cfm.getColumnDefinition(columnNames.get(i));
+                if (def == null)
                     throw new InvalidRequestException(String.format("Unknown identifier %s", columnNames.get(i)));
 
                 for (int j = 0; j < i; j++)
-                    if (name.name.equals(columnNames.get(j)))
-                        throw new InvalidRequestException(String.format("Multiple definitions found for column %s", name));
+                    if (def.name.equals(columnNames.get(j)))
+                        throw new InvalidRequestException(String.format("Multiple definitions found for column %s", def.name));
 
                 Term.Raw value = columnValues.get(i);
 
-                switch (name.kind)
+                switch (def.kind)
                 {
-                    case KEY_ALIAS:
-                    case COLUMN_ALIAS:
-                        Term t = value.prepare(name);
+                    case PARTITION_KEY:
+                    case CLUSTERING_COLUMN:
+                        Term t = value.prepare(keyspace(), def);
                         t.collectMarkerSpecification(boundNames);
-                        stmt.addKeyValue(name, t);
+                        stmt.addKeyValue(def, t);
                         break;
-                    case VALUE_ALIAS:
-                    case COLUMN_METADATA:
-                    case STATIC:
-                        Operation operation = new Operation.SetValue(value).prepare(name);
+                    default:
+                        Operation operation = new Operation.SetValue(value).prepare(keyspace(), def);
                         operation.collectMarkerSpecification(boundNames);
                         stmt.addOperation(operation);
                         break;
@@ -194,27 +190,25 @@
             this.whereClause = whereClause;
         }
 
-        protected ModificationStatement prepareInternal(CFDefinition cfDef, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
         {
-            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.UPDATE, cfDef.cfm, attrs);
+            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.UPDATE, boundNames.size(), cfm, attrs);
 
             for (Pair<ColumnIdentifier, Operation.RawUpdate> entry : updates)
             {
-                CFDefinition.Name name = cfDef.get(entry.left);
-                if (name == null)
+                ColumnDefinition def = cfm.getColumnDefinition(entry.left);
+                if (def == null)
                     throw new InvalidRequestException(String.format("Unknown identifier %s", entry.left));
 
-                Operation operation = entry.right.prepare(name);
+                Operation operation = entry.right.prepare(keyspace(), def);
                 operation.collectMarkerSpecification(boundNames);
 
-                switch (name.kind)
+                switch (def.kind)
                 {
-                    case KEY_ALIAS:
-                    case COLUMN_ALIAS:
+                    case PARTITION_KEY:
+                    case CLUSTERING_COLUMN:
                         throw new InvalidRequestException(String.format("PRIMARY KEY part %s found in SET part", entry.left));
-                    case VALUE_ALIAS:
-                    case COLUMN_METADATA:
-                    case STATIC:
+                    default:
                         stmt.addOperation(operation);
                         break;
                 }
diff --git a/src/java/org/apache/cassandra/db/AbstractCell.java b/src/java/org/apache/cassandra/db/AbstractCell.java
new file mode 100644
index 0000000..f27871f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AbstractCell.java
@@ -0,0 +1,235 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.DataInput;
+import java.io.IOError;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+
+public abstract class AbstractCell implements Cell
+{
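+    // Lazily deserializes OnDiskAtoms from the given input: each step reads one atom with the comparator's
+    // on-disk serializer and the iteration ends when the serializer returns null.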
+    public static Iterator<OnDiskAtom> onDiskIterator(final DataInput in,
+                                                      final ColumnSerializer.Flag flag,
+                                                      final int expireBefore,
+                                                      final Descriptor.Version version,
+                                                      final CellNameType type)
+    {
+        return new AbstractIterator<OnDiskAtom>()
+        {
+            protected OnDiskAtom computeNext()
+            {
+                OnDiskAtom atom;
+                try
+                {
+                    atom = type.onDiskAtomSerializer().deserializeFromSSTable(in, flag, expireBefore, version);
+                }
+                catch (IOException e)
+                {
+                    throw new IOError(e);
+                }
+                if (atom == null)
+                    return endOfData();
+
+                return atom;
+            }
+        };
+    }
+
+    public boolean isLive()
+    {
+        return true;
+    }
+
+    public boolean isLive(long now)
+    {
+        return true;
+    }
+
+    public int cellDataSize()
+    {
+        return name().dataSize() + value().remaining() + TypeSizes.NATIVE.sizeof(timestamp());
+    }
+
+    public int serializedSize(CellNameType type, TypeSizes typeSizes)
+    {
+        /*
+         * Size of a column is =
+         *   size of a name (short + length of the string)
+         * + 1 byte to indicate if the column has been deleted
+         * + 8 bytes for timestamp
+         * + 4 bytes which basically indicates the size of the byte array
+         * + entire byte array.
+        */
+        int valueSize = value().remaining();
+        return ((int)type.cellSerializer().serializedSize(name(), typeSizes)) + 1 + typeSizes.sizeof(timestamp()) + typeSizes.sizeof(valueSize) + valueSize;
+    }
+
+    public int serializationFlags()
+    {
+        return 0;
+    }
+
+    public Cell diff(Cell cell)
+    {
+        if (timestamp() < cell.timestamp())
+            return cell;
+        return null;
+    }
+
+    public void updateDigest(MessageDigest digest)
+    {
+        digest.update(name().toByteBuffer().duplicate());
+        digest.update(value().duplicate());
+
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithByte(digest, serializationFlags());
+    }
+
+    public int getLocalDeletionTime()
+    {
+        return Integer.MAX_VALUE;
+    }
+
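+    // Reconciles two versions of the same cell: the higher timestamp wins; on a timestamp tie a deleted
+    // cell beats a live one, and as a final tie-breaker the lexically greater value is kept so replicas
+    // converge on the same result.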
+    public Cell reconcile(Cell cell)
+    {
+        long ts1 = timestamp(), ts2 = cell.timestamp();
+        if (ts1 != ts2)
+            return ts1 < ts2 ? cell : this;
+        if (isLive() != cell.isLive())
+            return isLive() ? cell : this;
+        return value().compareTo(cell.value()) < 0 ? cell : this;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Cell && equals((Cell) o));
+    }
+
+    public boolean equals(Cell cell)
+    {
+        return timestamp() == cell.timestamp() && name().equals(cell.name()) && value().equals(cell.value());
+    }
+
+    public int hashCode()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s:%b:%d@%d",
+                             comparator.getString(name()),
+                             !isLive(),
+                             value().remaining(),
+                             timestamp());
+    }
+
+    public void validateName(CFMetaData metadata) throws MarshalException
+    {
+        metadata.comparator.validate(name());
+    }
+
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        validateName(metadata);
+
+        AbstractType<?> valueValidator = metadata.getValueValidator(name());
+        if (valueValidator != null)
+            valueValidator.validateCellValue(value());
+    }
+
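+    // Factory for freshly written cells: a non-positive requested TTL falls back to the table's default
+    // time-to-live, and an expiring cell is only created when the resulting TTL is positive.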
+    public static Cell create(CellName name, ByteBuffer value, long timestamp, int ttl, CFMetaData metadata)
+    {
+        if (ttl <= 0)
+            ttl = metadata.getDefaultTimeToLive();
+
+        return ttl > 0
+                ? new BufferExpiringCell(name, value, timestamp, ttl)
+                : new BufferCell(name, value, timestamp);
+    }
+
+    public Cell diffCounter(Cell cell)
+    {
+        assert this instanceof CounterCell : "Wrong class type: " + getClass();
+
+        if (timestamp() < cell.timestamp())
+            return cell;
+
+        // Note that at that point, cell can't be a tombstone. Indeed,
+        // cell is the result of merging us with other nodes' results, and
+        // merging a CounterCell with a tombstone never returns a tombstone
+        // unless that tombstone's timestamp is greater than the CounterCell's
+        // one.
+        assert cell instanceof CounterCell : "Wrong class type: " + cell.getClass();
+
+        if (((CounterCell) this).timestampOfLastDelete() < ((CounterCell) cell).timestampOfLastDelete())
+            return cell;
+
+        CounterContext.Relationship rel = CounterCell.contextManager.diff(cell.value(), value());
+        return (rel == CounterContext.Relationship.GREATER_THAN || rel == CounterContext.Relationship.DISJOINT) ? cell : null;
+    }
+
+    /** This is temporary until we start creating Cells of different types (buffer vs. native) */
+    public Cell reconcileCounter(Cell cell)
+    {
+        assert this instanceof CounterCell : "Wrong class type: " + getClass();
+
+        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+        if (cell instanceof DeletedCell)
+            return cell;
+
+        assert (cell instanceof CounterCell) : "Wrong class type: " + cell.getClass();
+
+        // live < live last delete
+        if (timestamp() < ((CounterCell) cell).timestampOfLastDelete())
+            return cell;
+
+        long timestampOfLastDelete = ((CounterCell) this).timestampOfLastDelete();
+
+        // live last delete > live
+        if (timestampOfLastDelete > cell.timestamp())
+            return this;
+
+        // live + live. return one of the cells if its context is a superset of the other's, or merge them otherwise
+        ByteBuffer context = CounterCell.contextManager.merge(value(), cell.value());
+        if (context == value() && timestamp() >= cell.timestamp() && timestampOfLastDelete >= ((CounterCell) cell).timestampOfLastDelete())
+            return this;
+        else if (context == cell.value() && cell.timestamp() >= timestamp() && ((CounterCell) cell).timestampOfLastDelete() >= timestampOfLastDelete)
+            return cell;
+        else // merge clocks and timestamps.
+            return new BufferCounterCell(name(),
+                                         context,
+                                         Math.max(timestamp(), cell.timestamp()),
+                                         Math.max(timestampOfLastDelete, ((CounterCell) cell).timestampOfLastDelete()));
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/db/AbstractNativeCell.java b/src/java/org/apache/cassandra/db/AbstractNativeCell.java
new file mode 100644
index 0000000..e01d860
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AbstractNativeCell.java
@@ -0,0 +1,706 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.security.MessageDigest;
+
+import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.*;
+
+
+/**
+ * Packs a CellName AND a Cell into one off-heap representation.
+ * Layout is:
+ *
+ * Note we store the ColumnIdentifier in full as bytes. This seems an okay tradeoff for now, as we just
+ * look it back up again when we need to, and in the near future we hope to switch to ints, longs or
+ * UUIDs representing column identifiers on disk, at which point we can switch that here as well.
+ *
+ * [timestamp][value offset][name size][name extra][name offset deltas][cell names][value][Descendants]
+ * [   8b    ][     4b     ][    2b   ][     1b    ][     each 2b      ][ arb < 64k][ arb ][ arbitrary ]
+ *
+ * descendants: any overriding classes will put their state here
+ * name offsets are deltas from their base offset, and don't include the first offset, or the end position of the final entry,
+ * i.e. there will be size - 1 entries, and each is a delta that is added to the offset of the position of the first name
+ * (which is always CELL_NAME_OFFSETS_OFFSET + (2 * (size - 1))). The length of the final name fills up any remaining
+ * space up to the value offset
+ * name extra:  lowest 2 bits indicate the clustering size delta (i.e. how many name items are NOT part of the clustering key)
+ *              the next 2 bits indicate the CellNameType
+ *              the next bit indicates if the column is a static or clustered/dynamic column
+ */
+public abstract class AbstractNativeCell extends AbstractCell implements CellName
+{
+    static final int TIMESTAMP_OFFSET = 4;
+    private static final int VALUE_OFFSET_OFFSET = 12;
+    private static final int CELL_NAME_SIZE_OFFSET = 16;
+    private static final int CELL_NAME_EXTRA_OFFSET = 18;
+    private static final int CELL_NAME_OFFSETS_OFFSET = 19;
+    private static final int CELL_NAME_SIZE_DELTA_MASK = 3;
+    private static final int CELL_NAME_TYPE_SHIFT = 2;
+    private static final int CELL_NAME_TYPE_MASK = 7;
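+    // Illustrative walk-through of the layout above, for a hypothetical cell with a 2-component name of
+    // 3 and 5 bytes and a 4-byte value: bytes [0..3] hold the allocation size, [4..11] the timestamp,
+    // [12..15] the value offset, [16..17] the name size (2), [18] the name "extra" bits, [19..20] the
+    // single name offset delta (3), [21..23] and [24..28] the two name components, and [29..32] the value;
+    // sizeOf() is 19 + 2 + 3 + 5 + 4 = 33 and the stored value offset is 29 (see construct()).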
+
+    private static enum NameType
+    {
+        COMPOUND_DENSE(0 << 2), COMPOUND_SPARSE(1 << 2), COMPOUND_SPARSE_STATIC(2 << 2), SIMPLE_DENSE(3 << 2), SIMPLE_SPARSE(4 << 2);
+        static final NameType[] TYPES = NameType.values();
+        final int bits;
+
+        NameType(int bits)
+        {
+            this.bits = bits;
+        }
+
+        static NameType typeOf(CellName name)
+        {
+            if (name instanceof CompoundDenseCellName)
+            {
+                assert !name.isStatic();
+                return COMPOUND_DENSE;
+            }
+
+            if (name instanceof CompoundSparseCellName)
+                return name.isStatic() ? COMPOUND_SPARSE_STATIC : COMPOUND_SPARSE;
+
+            if (name instanceof SimpleDenseCellName)
+            {
+                assert !name.isStatic();
+                return SIMPLE_DENSE;
+            }
+
+            if (name instanceof SimpleSparseCellName)
+            {
+                assert !name.isStatic();
+                return SIMPLE_SPARSE;
+            }
+
+            if (name instanceof NativeCell)
+                return ((NativeCell) name).nametype();
+
+            throw new AssertionError();
+        }
+    }
+
+    private final long peer; // address of the off-heap allocation; -1 for the unallocated sizing-only instance
+
+    AbstractNativeCell()
+    {
+        peer = -1;
+    }
+
+    public AbstractNativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf)
+    {
+        int size = sizeOf(copyOf);
+        peer = allocator.allocate(size, writeOp);
+
+        MemoryUtil.setInt(peer, size);
+        construct(copyOf);
+    }
+
+    protected int sizeOf(Cell cell)
+    {
+        int size = CELL_NAME_OFFSETS_OFFSET + Math.max(0, cell.name().size() - 1) * 2 + cell.value().remaining();
+        CellName name = cell.name();
+        for (int i = 0; i < name.size(); i++)
+            size += name.get(i).remaining();
+        return size;
+    }
+
+    protected void construct(Cell from)
+    {
+        setLong(TIMESTAMP_OFFSET, from.timestamp());
+        CellName name = from.name();
+        int nameSize = name.size();
+        int offset = CELL_NAME_SIZE_OFFSET;
+        setShort(offset, (short) nameSize);
+        assert nameSize - name.clusteringSize() <= 2;
+        byte cellNameExtraBits = (byte) ((nameSize - name.clusteringSize()) | NameType.typeOf(name).bits);
+        setByte(offset += 2, cellNameExtraBits);
+        offset += 1;
+        short cellNameDelta = 0;
+        for (int i = 1; i < nameSize; i++)
+        {
+            cellNameDelta += name.get(i - 1).remaining();
+            setShort(offset, cellNameDelta);
+            offset += 2;
+        }
+        for (int i = 0; i < nameSize; i++)
+        {
+            ByteBuffer bb = name.get(i);
+            setBytes(offset, bb);
+            offset += bb.remaining();
+        }
+        setInt(VALUE_OFFSET_OFFSET, offset);
+        setBytes(offset, from.value());
+    }
+
+    // the offset at which to read the short holding the offset delta of the i'th name (deltas are stored for i >= 1 only)
+    private int nameDeltaOffset(int i)
+    {
+        return CELL_NAME_OFFSETS_OFFSET + ((i - 1) * 2);
+    }
+
+    int valueStartOffset()
+    {
+        return getInt(VALUE_OFFSET_OFFSET);
+    }
+
+    private int valueEndOffset()
+    {
+        return (int) (internalSize() - postfixSize());
+    }
+
+    protected int postfixSize()
+    {
+        return 0;
+    }
+
+    @Override
+    public ByteBuffer value()
+    {
+        long offset = valueStartOffset();
+        return getByteBuffer(offset, (int) (internalSize() - (postfixSize() + offset))).order(ByteOrder.BIG_ENDIAN);
+    }
+
+    private int clusteringSizeDelta()
+    {
+        return getByte(CELL_NAME_EXTRA_OFFSET) & CELL_NAME_SIZE_DELTA_MASK;
+    }
+
+    public boolean isStatic()
+    {
+        return nametype() == NameType.COMPOUND_SPARSE_STATIC;
+    }
+
+    NameType nametype()
+    {
+        return NameType.TYPES[(((int) this.getByte(CELL_NAME_EXTRA_OFFSET)) >> CELL_NAME_TYPE_SHIFT) & CELL_NAME_TYPE_MASK];
+    }
+
+    public long minTimestamp()
+    {
+        return timestamp();
+    }
+
+    public long maxTimestamp()
+    {
+        return timestamp();
+    }
+
+    public int clusteringSize()
+    {
+        return size() - clusteringSizeDelta();
+    }
+
+    @Override
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
+    {
+        switch (nametype())
+        {
+            case SIMPLE_SPARSE:
+                return getIdentifier(metadata, get(clusteringSize()));
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                ByteBuffer buffer = get(clusteringSize());
+                if (buffer.remaining() == 0)
+                    return CompoundSparseCellNameType.rowMarkerId;
+
+                return getIdentifier(metadata, buffer);
+            case SIMPLE_DENSE:
+            case COMPOUND_DENSE:
+                return null;
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    public ByteBuffer collectionElement()
+    {
+        return isCollectionCell() ? get(size() - 1) : null;
+    }
+
+    // we always have a collection element if our clustering size is 2 less than our total size,
+    // and we never have one otherwise
+    public boolean isCollectionCell()
+    {
+        return clusteringSizeDelta() == 2;
+    }
+
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
+    {
+        switch (nametype())
+        {
+            case SIMPLE_DENSE:
+            case COMPOUND_DENSE:
+                return type.compare(this, other) == 0;
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                int clusteringSize = clusteringSize();
+                if (clusteringSize != other.clusteringSize() || other.isStatic() != isStatic())
+                    return false;
+                for (int i = 0; i < clusteringSize; i++)
+                    if (type.subtype(i).compare(get(i), other.get(i)) != 0)
+                        return false;
+                return true;
+            case SIMPLE_SPARSE:
+                return true;
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    public int size()
+    {
+        return getShort(CELL_NAME_SIZE_OFFSET);
+    }
+
+    public boolean isEmpty()
+    {
+        return size() == 0;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        return get(i, null);
+    }
+
+    private ByteBuffer get(int i, AbstractAllocator copy)
+    {
+        // remember to take dense/sparse into account, and only return EOC when not dense
+        int size = size();
+        assert i >= 0 && i < size();
+        int cellNamesOffset = nameDeltaOffset(size);
+        int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i));
+        int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
+        int length = endDelta - startDelta;
+        if (copy == null)
+            return getByteBuffer(cellNamesOffset + startDelta, length).order(ByteOrder.BIG_ENDIAN);
+        ByteBuffer result = copy.allocate(length);
+        FastByteOperations.UnsafeOperations.copy(null, peer + cellNamesOffset + startDelta, result, 0, length);
+        return result;
+    }
+
+    private static final ThreadLocal<byte[]> BUFFER = new ThreadLocal<byte[]>()
+    {
+        protected byte[] initialValue()
+        {
+            return new byte[256];
+        }
+    };
+
+    protected void writeComponentTo(MessageDigest digest, int i, boolean includeSize)
+    {
+        // remember to take dense/sparse into account, and only return EOC when not dense
+        int size = size();
+        assert i >= 0 && i < size();
+        int cellNamesOffset = nameDeltaOffset(size);
+        int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i));
+        int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
+
+        int componentStart = cellNamesOffset + startDelta;
+        int count = endDelta - startDelta;
+
+        if (includeSize)
+            FBUtilities.updateWithShort(digest, count);
+
+        writeMemoryTo(digest, componentStart, count);
+    }
+
+    protected void writeMemoryTo(MessageDigest digest, int from, int count)
+    {
+        // only batch if we have more than 16 bytes remaining to transfer, otherwise fall-back to single-byte updates
+        int i = 0, batchEnd = count - 16;
+        if (i < batchEnd)
+        {
+            byte[] buffer = BUFFER.get();
+            while (i < batchEnd)
+            {
+                int transfer = Math.min(count - i, 256);
+                getBytes(from + i, buffer, 0, transfer);
+                digest.update(buffer, 0, transfer);
+                i += transfer;
+            }
+        }
+        while (i < count)
+            digest.update(getByte(from + i++));
+    }
+
+    public EOC eoc()
+    {
+        return EOC.NONE;
+    }
+
+    public Composite withEOC(EOC eoc)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public Composite start()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public Composite end()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public ColumnSlice slice()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public boolean isPrefixOf(CType type, Composite c)
+    {
+        if (size() > c.size() || isStatic() != c.isStatic())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+        {
+            if (type.subtype(i).compare(get(i), c.get(i)) != 0)
+                return false;
+        }
+        return true;
+    }
+
+    public ByteBuffer toByteBuffer()
+    {
+        // for simple sparse we just return our one name buffer
+        switch (nametype())
+        {
+            case SIMPLE_DENSE:
+            case SIMPLE_SPARSE:
+                return get(0);
+            case COMPOUND_DENSE:
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                // This is the legacy format of composites.
+                // See org.apache.cassandra.db.marshal.CompositeType for details.
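+                // (each component below is written as a 2-byte length, the component bytes, then one end-of-component byte)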
+                ByteBuffer result = ByteBuffer.allocate(cellDataSize());
+                if (isStatic())
+                    ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER);
+
+                for (int i = 0; i < size(); i++)
+                {
+                    ByteBuffer bb = get(i);
+                    ByteBufferUtil.writeShortLength(result, bb.remaining());
+                    result.put(bb);
+                    result.put((byte) 0);
+                }
+                result.flip();
+                return result;
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    protected void updateWithName(MessageDigest digest)
+    {
+        // for simple sparse we just return our one name buffer
+        switch (nametype())
+        {
+            case SIMPLE_DENSE:
+            case SIMPLE_SPARSE:
+                writeComponentTo(digest, 0, false);
+                break;
+
+            case COMPOUND_DENSE:
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                // This is the legacy format of composites.
+                // See org.apache.cassandra.db.marshal.CompositeType for details.
+                if (isStatic())
+                    FBUtilities.updateWithShort(digest, CompositeType.STATIC_MARKER);
+
+                for (int i = 0; i < size(); i++)
+                {
+                    writeComponentTo(digest, i, true);
+                    digest.update((byte) 0);
+                }
+                break;
+
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    protected void updateWithValue(MessageDigest digest)
+    {
+        int offset = valueStartOffset();
+        int length = valueEndOffset() - offset;
+        writeMemoryTo(digest, offset, length);
+    }
+
+    @Override // this is the NAME dataSize, only!
+    public int dataSize()
+    {
+        switch (nametype())
+        {
+            case SIMPLE_DENSE:
+            case SIMPLE_SPARSE:
+                return valueStartOffset() - nameDeltaOffset(size());
+            case COMPOUND_DENSE:
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                int size = size();
+                return valueStartOffset() - nameDeltaOffset(size) + 3 * size + (isStatic() ? 2 : 0);
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    public boolean equals(Object obj)
+    {
+        if (obj == this)
+            return true;
+        if (obj instanceof CellName)
+            return equals((CellName) obj);
+        if (obj instanceof Cell)
+            return equals((Cell) obj);
+        return false;
+    }
+
+    public boolean equals(CellName that)
+    {
+        int size = this.size();
+        if (size != that.size())
+            return false;
+
+        for (int i = 0 ; i < size ; i++)
+            if (!get(i).equals(that.get(i)))
+                return false;
+        return true;
+    }
+
+    private static final ByteBuffer[] EMPTY = new ByteBuffer[0];
+
+    @Override
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        ByteBuffer[] r;
+        switch (nametype())
+        {
+            case SIMPLE_DENSE:
+                return CellNames.simpleDense(get(0, allocator));
+
+            case COMPOUND_DENSE:
+                r = new ByteBuffer[size()];
+                for (int i = 0; i < r.length; i++)
+                    r[i] = get(i, allocator);
+                return CellNames.compositeDense(r);
+
+            case COMPOUND_SPARSE_STATIC:
+            case COMPOUND_SPARSE:
+                int clusteringSize = clusteringSize();
+                r = clusteringSize == 0 ? EMPTY : new ByteBuffer[clusteringSize];
+                for (int i = 0; i < clusteringSize; i++)
+                    r[i] = get(i, allocator);
+
+                ByteBuffer nameBuffer = get(r.length);
+                ColumnIdentifier name;
+
+                if (nameBuffer.remaining() == 0)
+                {
+                    name = CompoundSparseCellNameType.rowMarkerId;
+                }
+                else
+                {
+                    name = getIdentifier(cfm, nameBuffer);
+                }
+
+                if (clusteringSizeDelta() == 2)
+                {
+                    ByteBuffer element = allocator.clone(get(size() - 1));
+                    return CellNames.compositeSparseWithCollection(r, element, name, isStatic());
+                }
+                return CellNames.compositeSparse(r, name, isStatic());
+
+            case SIMPLE_SPARSE:
+                return CellNames.simpleSparse(getIdentifier(cfm, get(0)));
+        }
+        throw new IllegalStateException();
+    }
+
+    private static ColumnIdentifier getIdentifier(CFMetaData cfMetaData, ByteBuffer name)
+    {
+        ColumnDefinition def = cfMetaData.getColumnDefinition(name);
+        if (def != null)
+        {
+            return def.name;
+        }
+        else
+        {
+            // it's safe to simply grab based on clusteringPrefixSize() as we are only called if not a dense type
+            AbstractType<?> type = cfMetaData.comparator.subtype(cfMetaData.comparator.clusteringPrefixSize());
+            return new ColumnIdentifier(HeapAllocator.instance.clone(name), type);
+        }
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Cell withUpdatedTimestamp(long newTimestamp)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    protected long internalSize()
+    {
+        return MemoryUtil.getInt(peer);
+    }
+
+    private void checkPosition(long offset, long size)
+    {
+        assert size >= 0;
+        assert peer > 0 : "Memory was freed";
+        assert offset >= 0 && offset + size <= internalSize() : String.format("Illegal range: [%d..%d), size: %s", offset, offset + size, internalSize());
+    }
+
+    protected final void setByte(long offset, byte b)
+    {
+        checkPosition(offset, 1);
+        MemoryUtil.setByte(peer + offset, b);
+    }
+
+    protected final void setShort(long offset, short s)
+    {
+        checkPosition(offset, 2); // a short spans 2 bytes
+        MemoryUtil.setShort(peer + offset, s);
+    }
+
+    protected final void setInt(long offset, int l)
+    {
+        checkPosition(offset, 4);
+        MemoryUtil.setInt(peer + offset, l);
+    }
+
+    protected final void setLong(long offset, long l)
+    {
+        checkPosition(offset, 8);
+        MemoryUtil.setLong(peer + offset, l);
+    }
+
+    protected final void setBytes(long offset, ByteBuffer buffer)
+    {
+        int start = buffer.position();
+        int count = buffer.limit() - start;
+        if (count == 0)
+            return;
+
+        checkPosition(offset, count);
+        MemoryUtil.setBytes(peer + offset, buffer);
+    }
+
+    protected final byte getByte(long offset)
+    {
+        checkPosition(offset, 1);
+        return MemoryUtil.getByte(peer + offset);
+    }
+
+    protected final void getBytes(long offset, byte[] trg, int trgOffset, int count)
+    {
+        checkPosition(offset, count);
+        MemoryUtil.getBytes(peer + offset, trg, trgOffset, count);
+    }
+
+    protected final int getShort(long offset)
+    {
+        checkPosition(offset, 2);
+        return MemoryUtil.getShort(peer + offset);
+    }
+
+    protected final int getInt(long offset)
+    {
+        checkPosition(offset, 4);
+        return MemoryUtil.getInt(peer + offset);
+    }
+
+    protected final long getLong(long offset)
+    {
+        checkPosition(offset, 8);
+        return MemoryUtil.getLong(peer + offset);
+    }
+
+    protected final ByteBuffer getByteBuffer(long offset, int length)
+    {
+        checkPosition(offset, length);
+        return MemoryUtil.getByteBuffer(peer + offset, length);
+    }
+
+    // requires isByteOrderComparable to be true. Compares the name components only; callers may still need to compare EOC etc.
+    @Inline
+    public final int compareTo(final Composite that)
+    {
+        if (isStatic() != that.isStatic())
+        {
+            // Static sorts before non-static no matter what, except for empty which
+            // always sort first
+            if (isEmpty())
+                return that.isEmpty() ? 0 : -1;
+            if (that.isEmpty())
+                return 1;
+            return isStatic() ? -1 : 1;
+        }
+
+        int size = size();
+        int size2 = that.size();
+        int minSize = Math.min(size, size2);
+        int startDelta = 0;
+        int cellNamesOffset = nameDeltaOffset(size);
+        for (int i = 0 ; i < minSize ; i++)
+        {
+            int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
+            long offset = peer + cellNamesOffset + startDelta;
+            int length = endDelta - startDelta;
+            int cmp = FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(i));
+            if (cmp != 0)
+                return cmp;
+            startDelta = endDelta;
+        }
+
+        EOC eoc = that.eoc();
+        if (size == size2)
+            return this.eoc().compareTo(eoc);
+
+        return size < size2 ? this.eoc().prefixComparisonResult : -eoc.prefixComparisonResult;
+    }
+
+    public final int compareToSimple(final Composite that)
+    {
+        assert size() == 1 && that.size() == 1;
+        int length = valueStartOffset() - nameDeltaOffset(1);
+        long offset = peer + nameDeltaOffset(1);
+        return FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(0));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java b/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
index 45302e2..8a07681 100644
--- a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
+++ b/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
@@ -24,7 +24,6 @@
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.service.IReadCommand;
-import org.apache.cassandra.thrift.IndexExpression;
 
 public abstract class AbstractRangeCommand implements IReadCommand
 {
diff --git a/src/java/org/apache/cassandra/db/AbstractThreadUnsafeSortedColumns.java b/src/java/org/apache/cassandra/db/AbstractThreadUnsafeSortedColumns.java
deleted file mode 100644
index 36b051b..0000000
--- a/src/java/org/apache/cassandra/db/AbstractThreadUnsafeSortedColumns.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.config.CFMetaData;
-
-public abstract class AbstractThreadUnsafeSortedColumns extends ColumnFamily
-{
-    protected DeletionInfo deletionInfo;
-
-    public AbstractThreadUnsafeSortedColumns(CFMetaData metadata)
-    {
-        this(metadata, DeletionInfo.live());
-    }
-
-    protected AbstractThreadUnsafeSortedColumns(CFMetaData metadata, DeletionInfo deletionInfo)
-    {
-        super(metadata);
-        this.deletionInfo = deletionInfo;
-    }
-
-    public DeletionInfo deletionInfo()
-    {
-        return deletionInfo;
-    }
-
-    public void delete(DeletionTime delTime)
-    {
-        deletionInfo.add(delTime);
-    }
-
-    public void delete(DeletionInfo newInfo)
-    {
-        deletionInfo.add(newInfo);
-    }
-
-    protected void delete(RangeTombstone tombstone)
-    {
-        deletionInfo.add(tombstone, getComparator());
-    }
-
-    public void setDeletionInfo(DeletionInfo newInfo)
-    {
-        deletionInfo = newInfo;
-    }
-
-    /**
-     * Purges any tombstones with a local deletion time before gcBefore.
-     * @param gcBefore a timestamp (in seconds) before which tombstones should be purged
-     */
-    public void purgeTombstones(int gcBefore)
-    {
-        deletionInfo.purge(gcBefore);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java b/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
index 389e0f8..b5ed8d2 100644
--- a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
+++ b/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
@@ -17,51 +17,86 @@
  */
 package org.apache.cassandra.db;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.AbstractCollection;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
 
 import com.google.common.base.Function;
 import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.Allocator;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
 
 /**
- * A ColumnFamily backed by an ArrayList.
+ * A ColumnFamily backed by an array.
  * This implementation is not synchronized and should only be used when
  * thread-safety is not required. This implementation makes sense when the
- * main operations performed are iterating over the map and adding columns
+ * main operations performed are iterating over the cells and adding cells
  * (especially if insertion is in sorted order).
  */
-public class ArrayBackedSortedColumns extends AbstractThreadUnsafeSortedColumns
+public class ArrayBackedSortedColumns extends ColumnFamily
 {
+    private static final Cell[] EMPTY_ARRAY = new Cell[0];
+    private static final int MINIMAL_CAPACITY = 10;
+
     private final boolean reversed;
-    private final ArrayList<Column> columns;
+
+    private DeletionInfo deletionInfo;
+    private Cell[] cells;
+    private int size;
+    private int sortedSize;
+    private volatile boolean isSorted;
 
     public static final ColumnFamily.Factory<ArrayBackedSortedColumns> factory = new Factory<ArrayBackedSortedColumns>()
     {
-        public ArrayBackedSortedColumns create(CFMetaData metadata, boolean insertReversed)
+        public ArrayBackedSortedColumns create(CFMetaData metadata, boolean insertReversed, int initialCapacity)
         {
-            return new ArrayBackedSortedColumns(metadata, insertReversed);
+            return new ArrayBackedSortedColumns(metadata, insertReversed, initialCapacity == 0 ? EMPTY_ARRAY : new Cell[initialCapacity], 0, 0);
         }
     };
 
-    private ArrayBackedSortedColumns(CFMetaData metadata, boolean reversed)
+    private ArrayBackedSortedColumns(CFMetaData metadata, boolean reversed, Cell[] cells, int size, int sortedSize)
     {
         super(metadata);
         this.reversed = reversed;
-        this.columns = new ArrayList<Column>();
+        this.deletionInfo = DeletionInfo.live();
+        this.cells = cells;
+        this.size = size;
+        this.sortedSize = sortedSize;
+        this.isSorted = size == sortedSize;
     }
 
-    private ArrayBackedSortedColumns(Collection<Column> columns, CFMetaData metadata, boolean reversed)
+    protected ArrayBackedSortedColumns(CFMetaData metadata, boolean reversed)
     {
-        super(metadata);
-        this.reversed = reversed;
-        this.columns = new ArrayList<Column>(columns);
+        this(metadata, reversed, EMPTY_ARRAY, 0, 0);
+    }
+
+    private ArrayBackedSortedColumns(ArrayBackedSortedColumns original)
+    {
+        super(original.metadata);
+        this.reversed = original.reversed;
+        this.deletionInfo = DeletionInfo.live(); // this is INTENTIONALLY not set to original.deletionInfo.
+        this.cells = Arrays.copyOf(original.cells, original.size);
+        this.size = original.size;
+        this.sortedSize = original.sortedSize;
+        this.isSorted = original.isSorted;
+    }
+
+    public static ArrayBackedSortedColumns localCopy(ColumnFamily original, AbstractAllocator allocator)
+    {
+        ArrayBackedSortedColumns copy = new ArrayBackedSortedColumns(original.metadata, false, new Cell[original.getColumnCount()], 0, 0);
+        for (Cell cell : original)
+            copy.internalAdd(cell.localCopy(original.metadata, allocator));
+        copy.sortedSize = copy.size; // internalAdd doesn't update sortedSize.
+        copy.delete(original);
+        return copy;
     }
 
     public ColumnFamily.Factory getFactory()
@@ -71,7 +106,7 @@
 
     public ColumnFamily cloneMe()
     {
-        return new ArrayBackedSortedColumns(columns, metadata, reversed);
+        return new ArrayBackedSortedColumns(this);
     }
 
     public boolean isInsertReversed()
@@ -79,262 +114,371 @@
         return reversed;
     }
 
-    private Comparator<ByteBuffer> internalComparator()
+    private Comparator<Composite> internalComparator()
     {
-        return reversed ? getComparator().reverseComparator : getComparator();
+        return reversed ? getComparator().reverseComparator() : getComparator();
     }
 
-    public Column getColumn(ByteBuffer name)
+    private void maybeSortCells()
     {
-        int pos = binarySearch(name);
-        return pos >= 0 ? columns.get(pos) : null;
+        if (!isSorted)
+            sortCells();
     }
 
     /**
-     * AddColumn throws an exception if the column added does not sort after
-     * the last column in the map.
-     * The reasoning is that this implementation can get slower if too much
-     * insertions are done in unsorted order and right now we only use it when
-     * *all* insertion (with this method) are done in sorted order. The
-     * assertion throwing is thus a protection against performance regression
-     * without knowing about (we can revisit that decision later if we have
-     * use cases where most insert are in sorted order but a few are not).
+     * Synchronized so that concurrent (read-only) accessors that trigger sorting don't corrupt the internal state.
      */
-    public void addColumn(Column column, Allocator allocator)
+    private synchronized void sortCells()
     {
-        if (columns.isEmpty())
+        if (isSorted)
+            return; // Just sorted by a previous call
+
+        Comparator<Cell> comparator = reversed
+                                    ? getComparator().columnReverseComparator()
+                                    : getComparator().columnComparator(false);
+
+        // Sort the unsorted segment - will still potentially contain duplicate (non-reconciled) cells
+        Arrays.sort(cells, sortedSize, size, comparator);
+
+        // Determine the merge start position for that segment
+        int pos = binarySearch(0, sortedSize, cells[sortedSize].name(), internalComparator());
+        if (pos < 0)
+            pos = -pos - 1;
+
+        // Copy [pos, lastSortedCellIndex] cells into a separate array
+        Cell[] leftCopy = pos == sortedSize
+                        ? EMPTY_ARRAY
+                        : Arrays.copyOfRange(cells, pos, sortedSize);
+
+        // Store the beginning (inclusive) and the end (exclusive) indexes of the right segment
+        int rightStart = sortedSize;
+        int rightEnd = size;
+
+        // 'Trim' the sizes to what's left without the leftCopy
+        size = sortedSize = pos;
+
+        // Merge the cells from both segments. When adding from the left segment we can rely on it not having any
+        // duplicate cells, and thus omit the comparison with the previously entered cell - we'll never need to reconcile.
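+        // For illustration (hypothetical names): with sorted prefix [a, c, e] and appended tail [c, b], the tail sorts
+        // to [b, c], pos lands on c, leftCopy becomes [c, e], and the merge yields [a, b, reconcile(c, c), e].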
+        int l = 0, r = rightStart;
+        while (l < leftCopy.length && r < rightEnd)
         {
-            columns.add(column);
+            int cmp = comparator.compare(leftCopy[l], cells[r]);
+            if (cmp < 0)
+                append(leftCopy[l++]);
+            else if (cmp == 0)
+                append(leftCopy[l++].reconcile(cells[r++]));
+            else
+                appendOrReconcile(cells[r++]);
+        }
+        while (l < leftCopy.length)
+            append(leftCopy[l++]);
+        while (r < rightEnd)
+            appendOrReconcile(cells[r++]);
+
+        // Nullify the remainder of the array (in case we had duplicate cells that got reconciled)
+        for (int i = size; i < rightEnd; i++)
+            cells[i] = null;
+
+        // Fully sorted at this point
+        isSorted = true;
+    }
+
+    private void appendOrReconcile(Cell cell)
+    {
+        if (size > 0 && cells[size - 1].name().equals(cell.name()))
+            reconcileWith(size - 1, cell);
+        else
+            append(cell);
+    }
+
+    private void append(Cell cell)
+    {
+        cells[size] = cell;
+        size++;
+        sortedSize++;
+    }
+
+    public Cell getColumn(CellName name)
+    {
+        maybeSortCells();
+        int pos = binarySearch(name);
+        return pos >= 0 ? cells[pos] : null;
+    }
+
+    /**
+      * Adds a cell, assuming that:
+      * - it's non-gc-able (if a tombstone) or not a tombstone
+      * - it has a more recent timestamp than any partition/range tombstone shadowing it
+      * - it sorts *strictly after* the current-last cell in the array.
+      */
+    public void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore)
+    {
+        if (cell.getLocalDeletionTime() >= gcBefore && !tester.isDeleted(cell))
+        {
+            internalAdd(cell);
+            sortedSize++;
+        }
+    }
+
+    public void addColumn(Cell cell)
+    {
+        if (size == 0)
+        {
+            internalAdd(cell);
+            sortedSize++;
             return;
         }
 
-        // Fast path if inserting at the tail
-        int c = internalComparator().compare(columns.get(getColumnCount() - 1).name(), column.name());
-        // note that we want an assertion here (see addColumn javadoc), but we also want that if
-        // assertion are disabled, addColumn works correctly with unsorted input
-        assert c <= 0 : "Added column does not sort as the " + (reversed ? "first" : "last") + " column";
+        if (!isSorted)
+        {
+            internalAdd(cell);
+            return;
+        }
 
+        int c = internalComparator().compare(cells[size - 1].name(), cell.name());
         if (c < 0)
         {
-            // Insert as last
-            columns.add(column);
+            // Append to the end
+            internalAdd(cell);
+            sortedSize++;
         }
         else if (c == 0)
         {
-            // Resolve against last
-            resolveAgainst(getColumnCount() - 1, column, allocator);
+            // Resolve against the last cell
+            reconcileWith(size - 1, cell);
         }
         else
         {
-            int pos = binarySearch(column.name());
-            if (pos >= 0)
-                resolveAgainst(pos, column, allocator);
+            int pos = binarySearch(cell.name());
+            if (pos >= 0) // Reconcile with an existing cell
+            {
+                reconcileWith(pos, cell);
+            }
             else
-                columns.add(-pos-1, column);
+            {
+                internalAdd(cell); // Append to the end, making cells unsorted from now on
+                isSorted = false;
+            }
+        }
+    }
+
+    public void addAll(ColumnFamily other)
+    {
+        delete(other.deletionInfo());
+
+        if (!other.hasColumns())
+            return;
+
+        // In reality, with ABSC being the only remaining container (aside from ABTC), other will always be ABSC.
+        if (size == 0 && other instanceof ArrayBackedSortedColumns)
+        {
+            fastAddAll((ArrayBackedSortedColumns) other);
+        }
+        else
+        {
+            Iterator<Cell> iterator = reversed ? other.reverseIterator() : other.iterator();
+            while (iterator.hasNext())
+                addColumn(iterator.next());
+        }
+    }
+
+    // Fast path, when this ABSC is empty.
+    private void fastAddAll(ArrayBackedSortedColumns other)
+    {
+        if (other.isInsertReversed() == isInsertReversed())
+        {
+            cells = Arrays.copyOf(other.cells, other.cells.length);
+            size = other.size;
+            sortedSize = other.sortedSize;
+            isSorted = other.isSorted;
+        }
+        else
+        {
+            if (cells.length < other.getColumnCount())
+                cells = new Cell[Math.max(MINIMAL_CAPACITY, other.getColumnCount())];
+            Iterator<Cell> iterator = reversed ? other.reverseIterator() : other.iterator();
+            while (iterator.hasNext())
+                cells[size++] = iterator.next();
+            sortedSize = size;
+            isSorted = true;
         }
     }
 
     /**
-     * Resolve against element at position i.
-     * Assume that i is a valid position.
+     * Add a cell to the array, 'resizing' it first if necessary (if it doesn't fit).
      */
-    private void resolveAgainst(int i, Column column, Allocator allocator)
+    private void internalAdd(Cell cell)
     {
-        Column oldColumn = columns.get(i);
-
-        // calculate reconciled col from old (existing) col and new col
-        Column reconciledColumn = column.reconcile(oldColumn, allocator);
-        columns.set(i, reconciledColumn);
-    }
-
-    private int binarySearch(ByteBuffer name)
-    {
-        return binarySearch(columns, internalComparator(), name, 0);
+        if (cells.length == size)
+            cells = Arrays.copyOf(cells, Math.max(MINIMAL_CAPACITY, size * 3 / 2 + 1));
+        cells[size++] = cell;
     }
 
     /**
-     * Simple binary search for a given column name.
+     * Remove the cell at a given index, shifting the rest of the array to the left if needed.
+     * Please note that we mostly remove from the end, so the shifting should be rare.
+     */
+    private void internalRemove(int index)
+    {
+        int moving = size - index - 1;
+        if (moving > 0)
+            System.arraycopy(cells, index + 1, cells, index, moving);
+        cells[--size] = null;
+    }
+
+    /**
+     * Reconcile with a cell at position i.
+     * Assume that i is a valid position.
+     */
+    private void reconcileWith(int i, Cell cell)
+    {
+        cells[i] = cell.reconcile(cells[i]);
+    }
+
+    private int binarySearch(CellName name)
+    {
+        return binarySearch(0, size, name, internalComparator());
+    }
+
+    /**
+     * Simple binary search for a given cell name.
      * The return value has the exact same meaning that the one of Collections.binarySearch().
      * (We don't use Collections.binarySearch() directly because it would require us to create
-     * a fake Column (as well as an Column comparator) to do the search, which is ugly.
+     * a fake Cell (as well as a Cell comparator) to do the search, which is ugly.)
      */
-    private static int binarySearch(List<Column> columns, Comparator<ByteBuffer> comparator, ByteBuffer name, int start)
+    private int binarySearch(int fromIndex, int toIndex, Composite name, Comparator<Composite> comparator)
     {
-        int low = start;
-        int mid = columns.size();
+        int low = fromIndex;
+        int mid = toIndex;
         int high = mid - 1;
         int result = -1;
         while (low <= high)
         {
             mid = (low + high) >> 1;
-            if ((result = comparator.compare(name, columns.get(mid).name())) > 0)
-            {
+            if ((result = comparator.compare(name, cells[mid].name())) > 0)
                 low = mid + 1;
-            }
             else if (result == 0)
-            {
                 return mid;
-            }
             else
-            {
                 high = mid - 1;
-            }
         }
         return -mid - (result < 0 ? 1 : 2);
     }
 
-    public void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation)
+    public Collection<Cell> getSortedColumns()
     {
-        delete(cm.deletionInfo());
-        if (cm.getColumnCount() == 0)
-            return;
-
-        Column[] copy = columns.toArray(new Column[getColumnCount()]);
-        int idx = 0;
-        Iterator<Column> other = reversed ? cm.reverseIterator(ColumnSlice.ALL_COLUMNS_ARRAY) : cm.iterator();
-        Column otherColumn = other.next();
-
-        columns.clear();
-
-        while (idx < copy.length && otherColumn != null)
-        {
-            int c = internalComparator().compare(copy[idx].name(), otherColumn.name());
-            if (c < 0)
-            {
-                columns.add(copy[idx]);
-                idx++;
-            }
-            else if (c > 0)
-            {
-                columns.add(transformation.apply(otherColumn));
-                otherColumn = other.hasNext() ? other.next() : null;
-            }
-            else // c == 0
-            {
-                columns.add(copy[idx]);
-                resolveAgainst(getColumnCount() - 1, transformation.apply(otherColumn), allocator);
-                idx++;
-                otherColumn = other.hasNext() ? other.next() : null;
-            }
-        }
-        while (idx < copy.length)
-        {
-            columns.add(copy[idx++]);
-        }
-        while (otherColumn != null)
-        {
-            columns.add(transformation.apply(otherColumn));
-            otherColumn = other.hasNext() ? other.next() : null;
-        }
+        return new CellCollection(reversed);
     }
 
-    public boolean replace(Column oldColumn, Column newColumn)
+    public Collection<Cell> getReverseSortedColumns()
     {
-        if (!oldColumn.name().equals(newColumn.name()))
-            throw new IllegalArgumentException();
-
-        int pos = binarySearch(oldColumn.name());
-        if (pos >= 0)
-        {
-            columns.set(pos, newColumn);
-        }
-
-        return pos >= 0;
-    }
-
-    public Collection<Column> getSortedColumns()
-    {
-        return reversed ? new ReverseSortedCollection() : columns;
-    }
-
-    public Collection<Column> getReverseSortedColumns()
-    {
-        // If reversed, the element are sorted reversely, so we could expect
-        // to return *this*, but *this* redefine the iterator to be in sorted
-        // order, so we need a collection that uses the super constructor
-        return reversed ? new ForwardSortedCollection() : new ReverseSortedCollection();
+        return new CellCollection(!reversed);
     }
 
     public int getColumnCount()
     {
-        return columns.size();
+        maybeSortCells();
+        return size;
+    }
+
+    public boolean hasColumns()
+    {
+        return size > 0;
     }
 
     public void clear()
     {
         setDeletionInfo(DeletionInfo.live());
-        columns.clear();
+        for (int i = 0; i < size; i++)
+            cells[i] = null;
+        size = sortedSize = 0;
+        isSorted = true;
     }
 
-    public Iterable<ByteBuffer> getColumnNames()
+    public DeletionInfo deletionInfo()
     {
-        return Iterables.transform(columns, new Function<Column, ByteBuffer>()
+        return deletionInfo;
+    }
+
+    public void delete(DeletionTime delTime)
+    {
+        deletionInfo.add(delTime);
+    }
+
+    public void delete(DeletionInfo newInfo)
+    {
+        deletionInfo.add(newInfo);
+    }
+
+    protected void delete(RangeTombstone tombstone)
+    {
+        deletionInfo.add(tombstone, getComparator());
+    }
+
+    public void setDeletionInfo(DeletionInfo newInfo)
+    {
+        deletionInfo = newInfo;
+    }
+
+    /**
+     * Purges any tombstones with a local deletion time before gcBefore.
+     * @param gcBefore a timestamp (in seconds) before which tombstones should be purged
+     */
+    public void purgeTombstones(int gcBefore)
+    {
+        deletionInfo.purge(gcBefore);
+    }
+
+    public Iterable<CellName> getColumnNames()
+    {
+        return Iterables.transform(new CellCollection(false), new Function<Cell, CellName>()
         {
-            public ByteBuffer apply(Column column)
+            public CellName apply(Cell cell)
             {
-                return column.name;
+                return cell.name();
             }
         });
     }
 
-    public Iterator<Column> iterator()
+    public Iterator<Cell> iterator(ColumnSlice[] slices)
     {
-        return reversed ? Lists.reverse(columns).iterator() : columns.iterator();
+        maybeSortCells();
+        return slices.length == 1
+             ? slice(slices[0], reversed, null)
+             : new SlicesIterator(slices, reversed);
     }
 
-    public Iterator<Column> iterator(ColumnSlice[] slices)
+    public Iterator<Cell> reverseIterator(ColumnSlice[] slices)
     {
-        return new SlicesIterator(columns, getComparator(), slices, reversed);
+        maybeSortCells();
+        return slices.length == 1
+             ? slice(slices[0], !reversed, null)
+             : new SlicesIterator(slices, !reversed);
     }
 
-    public Iterator<Column> reverseIterator(ColumnSlice[] slices)
+    private class SlicesIterator extends AbstractIterator<Cell>
     {
-        return new SlicesIterator(columns, getComparator(), slices, !reversed);
-    }
-
-    private static class SlicesIterator extends AbstractIterator<Column>
-    {
-        private final List<Column> list;
         private final ColumnSlice[] slices;
-        private final Comparator<ByteBuffer> comparator;
+        private final boolean invert;
 
         private int idx = 0;
-        private int previousSliceEnd = 0;
-        private Iterator<Column> currentSlice;
+        private int previousSliceEnd;
+        private Iterator<Cell> currentSlice;
 
-        public SlicesIterator(List<Column> list, AbstractType<?> comparator, ColumnSlice[] slices, boolean reversed)
+        public SlicesIterator(ColumnSlice[] slices, boolean invert)
         {
-            this.list = reversed ? Lists.reverse(list) : list;
             this.slices = slices;
-            this.comparator = reversed ? comparator.reverseComparator : comparator;
+            this.invert = invert;
+            previousSliceEnd = invert ? size : 0;
         }
 
-        protected Column computeNext()
+        protected Cell computeNext()
         {
             if (currentSlice == null)
             {
                 if (idx >= slices.length)
                     return endOfData();
-
-                ColumnSlice slice = slices[idx++];
-                // The first idx to include
-                int startIdx = slice.start.remaining() == 0 ? 0 : binarySearch(list, comparator, slice.start, previousSliceEnd);
-                if (startIdx < 0)
-                    startIdx = -startIdx - 1;
-
-                // The first idx to exclude
-                int finishIdx = slice.finish.remaining() == 0 ? list.size() - 1 : binarySearch(list, comparator, slice.finish, previousSliceEnd);
-                if (finishIdx >= 0)
-                    finishIdx++;
-                else
-                    finishIdx = -finishIdx - 1;
-
-                if (startIdx == 0 && finishIdx == list.size())
-                    currentSlice = list.iterator();
-                else
-                    currentSlice = list.subList(startIdx, finishIdx).iterator();
-
-                previousSliceEnd = finishIdx > 0 ? finishIdx - 1 : 0;
+                currentSlice = slice(slices[idx++], invert, this);
             }
 
             if (currentSlice.hasNext())
@@ -345,52 +489,140 @@
         }
     }
 
-    private class ReverseSortedCollection extends AbstractCollection<Column>
+    /**
+     * @return a sub-range of our cells as an Iterator, between the provided composites (inclusive)
+     *
+     * @param slice  The slice with the inclusive start and finish bounds
+     * @param invert If the sort order of our collection is opposite to the desired sort order of the result;
+     *               this results in swapping the start/finish (since they are provided based on the desired
+     *               sort order, not our sort order), to normalise to our sort order, and a backwards iterator is returned
+     * @param iter   If this slice is part of a multi-slice, the iterator will be updated to ensure cells are visited only once
+     */
+    private Iterator<Cell> slice(ColumnSlice slice, boolean invert, SlicesIterator iter)
     {
-        public int size()
+        Composite start = invert ? slice.finish : slice.start;
+        Composite finish = invert ? slice.start : slice.finish;
+
+        int lowerBound = 0, upperBound = size;
+        if (iter != null)
         {
-            return columns.size();
+            if (invert)
+                upperBound = iter.previousSliceEnd;
+            else
+                lowerBound = iter.previousSliceEnd;
         }
 
-        public Iterator<Column> iterator()
+        if (!start.isEmpty())
         {
-            return new Iterator<Column>()
-            {
-                int idx = size() - 1;
-                boolean shouldCallNext = true;
+            lowerBound = binarySearch(lowerBound, upperBound, start, internalComparator());
+            if (lowerBound < 0)
+                lowerBound = -lowerBound - 1;
+        }
 
-                public boolean hasNext()
-                {
-                    return idx >= 0;
-                }
+        if (!finish.isEmpty())
+        {
+            upperBound = binarySearch(lowerBound, upperBound, finish, internalComparator());
+            upperBound = upperBound < 0
+                       ? -upperBound - 1
+                       : upperBound + 1; // upperBound is exclusive for the iterators
+        }
 
-                public Column next()
-                {
-                    shouldCallNext = false;
-                    return columns.get(idx--);
-                }
+        // If we're going backwards (wrt our sort order) we store the startIdx and use it as our upper bound next round
+        if (iter != null)
+            iter.previousSliceEnd = invert ? lowerBound : upperBound;
 
-                public void remove()
-                {
-                    if (shouldCallNext)
-                        throw new IllegalStateException();
-                    columns.remove(idx + 1);
-                    shouldCallNext = true;
-                }
-            };
+        return invert
+             ? new BackwardsCellIterator(lowerBound, upperBound)
+             : new ForwardsCellIterator(lowerBound, upperBound);
+    }
+
+    private final class BackwardsCellIterator implements Iterator<Cell>
+    {
+        private int idx, end;
+        private boolean shouldCallNext = true;
+
+        // lowerBound inclusive, upperBound exclusive
+        private BackwardsCellIterator(int lowerBound, int upperBound)
+        {
+            idx = upperBound - 1;
+            end = lowerBound - 1;
+        }
+
+        public boolean hasNext()
+        {
+            return idx > end;
+        }
+
+        public Cell next()
+        {
+            shouldCallNext = false;
+            return cells[idx--];
+        }
+
+        public void remove()
+        {
+            if (shouldCallNext)
+                throw new IllegalStateException();
+            shouldCallNext = true;
+            internalRemove(idx + 1);
+            sortedSize--;
         }
     }
 
-    private class ForwardSortedCollection extends AbstractCollection<Column>
+    private final class ForwardsCellIterator implements Iterator<Cell>
     {
-        public int size()
+        private int idx, end;
+        private boolean shouldCallNext = true;
+
+        // lowerBound inclusive, upperBound exclusive
+        private ForwardsCellIterator(int lowerBound, int upperBound)
         {
-            return columns.size();
+            idx = lowerBound;
+            end = upperBound;
         }
 
-        public Iterator<Column> iterator()
+        public boolean hasNext()
         {
-            return columns.iterator();
+            return idx < end;
+        }
+
+        public Cell next()
+        {
+            shouldCallNext = false;
+            return cells[idx++];
+        }
+
+        public void remove()
+        {
+            if (shouldCallNext)
+                throw new IllegalStateException();
+            shouldCallNext = true;
+            internalRemove(--idx);
+            sortedSize--;
+            end--;
+        }
+    }
+
+    private final class CellCollection extends AbstractCollection<Cell>
+    {
+        private final boolean invert;
+
+        private CellCollection(boolean invert)
+        {
+            this.invert = invert;
+        }
+
+        public int size()
+        {
+            return getColumnCount();
+        }
+
+        public Iterator<Cell> iterator()
+        {
+            maybeSortCells();
+            return invert
+                 ? new BackwardsCellIterator(0, size)
+                 : new ForwardsCellIterator(0, size);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/AtomDeserializer.java b/src/java/org/apache/cassandra/db/AtomDeserializer.java
new file mode 100644
index 0000000..799ed0e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AtomDeserializer.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.io.sstable.Descriptor;
+
+/**
+ * Helper class to deserialize OnDiskAtom efficiently.
+ *
+ * More precisely, this class is used by the low-level readers
+ * (IndexedSliceReader and SSTableNamesIterator) to ensure we don't
+ * do more work than necessary (i.e. we don't allocate/deserialize
+ * objects for things we don't care about).
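+ *
+ * A typical (hypothetical) usage pattern for a reader: while hasNext() is true, call compareNextTo(bound) to
+ * decide whether the upcoming atom is of interest, then either readNext() to materialise it or skipNext() to
+ * discard it without allocating anything.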
+ */
+public class AtomDeserializer
+{
+    private final CellNameType type;
+    private final CellNameType.Deserializer nameDeserializer;
+    private final DataInput in;
+    private final ColumnSerializer.Flag flag;
+    private final int expireBefore;
+    private final Descriptor.Version version;
+
+    public AtomDeserializer(CellNameType type, DataInput in, ColumnSerializer.Flag flag, int expireBefore, Descriptor.Version version)
+    {
+        this.type = type;
+        this.nameDeserializer = type.newDeserializer(in);
+        this.in = in;
+        this.flag = flag;
+        this.expireBefore = expireBefore;
+        this.version = version;
+    }
+
+    /**
+     * Whether or not there are more atoms to read.
+     */
+    public boolean hasNext() throws IOException
+    {
+        return nameDeserializer.hasNext();
+    }
+
+    /**
+     * Whether or not some atom has been read but not yet processed (i.e. neither readNext() nor
+     * skipNext() has been called for that atom).
+     */
+    public boolean hasUnprocessed() throws IOException
+    {
+        return nameDeserializer.hasUnprocessed();
+    }
+
+    /**
+     * Compare the provided composite to the next atom to read on disk.
+     *
+     * This will not read/deserialize the whole atom but only what is necessary for the
+     * comparison. Whenever we know what to do with this atom (read it or skip it),
+     * readNext or skipNext should be called.
+     */
+    public int compareNextTo(Composite composite) throws IOException
+    {
+        return nameDeserializer.compareNextTo(composite);
+    }
+
+    /**
+     * Returns the next atom.
+     */
+    public OnDiskAtom readNext() throws IOException
+    {
+        Composite name = nameDeserializer.readNext();
+        assert !name.isEmpty(); // This would imply hasNext() hasn't been called
+        int b = in.readUnsignedByte();
+        if ((b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0)
+            return type.rangeTombstoneSerializer().deserializeBody(in, name, version);
+        else
+            return type.columnSerializer().deserializeColumnBody(in, (CellName)name, b, flag, expireBefore);
+    }
+
+    /**
+     * Skips the next atom.
+     */
+    public void skipNext() throws IOException
+    {
+        nameDeserializer.skipNext();
+        int b = in.readUnsignedByte();
+        if ((b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0)
+            type.rangeTombstoneSerializer().skipBody(in, version);
+        else
+            type.columnSerializer().skipColumnBody(in, b);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java b/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
new file mode 100644
index 0000000..7b5e8a8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
@@ -0,0 +1,555 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.AbstractCollection;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+
+import com.google.common.base.Function;
+import com.google.common.base.Functions;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+import org.apache.cassandra.utils.concurrent.Locks;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.NativePool;
+
+import static org.apache.cassandra.db.index.SecondaryIndexManager.Updater;
+
+/**
+ * A thread-safe and atomic ColumnFamily implementation.
+ * Operations (in particular addAll) on this implementation are atomic and
+ * isolated (in the sense of ACID): an addAll guarantees that no other thread
+ * can observe a state where only some, but not all, of the columns have been
+ * added.
+ * <p/>
+ * WARNING: removing elements through getSortedColumns().iterator() is *not* supported
+ */
+public class AtomicBTreeColumns extends ColumnFamily
+{
+    static final long EMPTY_SIZE = ObjectSizes.measure(new AtomicBTreeColumns(CFMetaData.IndexCf, null))
+            + ObjectSizes.measure(new Holder(null, null));
+
+    // Reserved values for wasteTracker field. These values must not be consecutive (see avoidReservedValues)
+    private static final int TRACKER_NEVER_WASTED = 0;
+    private static final int TRACKER_PESSIMISTIC_LOCKING = Integer.MAX_VALUE;
+
+    // The granularity with which we track wasted allocation/work; we round up
+    private static final int ALLOCATION_GRANULARITY_BYTES = 1024;
+    // The number of bytes we have to waste in excess of our acceptable realtime rate of waste (defined below)
+    private static final long EXCESS_WASTE_BYTES = 10 * 1024 * 1024L;
+    private static final int EXCESS_WASTE_OFFSET = (int) (EXCESS_WASTE_BYTES / ALLOCATION_GRANULARITY_BYTES);
+    // Note this is a shift, because dividing a long time and then picking the low 32 bits doesn't give correct rollover behavior
+    private static final int CLOCK_SHIFT = 17;
+    // CLOCK_GRANULARITY = 1ns << CLOCK_SHIFT == ~131us == (1/7.63)ms
+
+    /**
+     * The clock and allocation granularities combine to give an acceptable (waste) allocation rate, defined by
+     * the passage of real time, of ALLOCATION_GRANULARITY_BYTES/CLOCK_GRANULARITY, in this case 7.63KB/ms, or 7.45MB/s.
+     *
+     * wasteTracker maintains a value within EXCESS_WASTE_OFFSET below the current time; whenever we waste bytes
+     * we increment the current value if it is within this window, and otherwise set it to the minimum of the
+     * window plus our waste.
+     */
+    private volatile int wasteTracker = TRACKER_NEVER_WASTED;
+
+    private static final AtomicIntegerFieldUpdater<AtomicBTreeColumns> wasteTrackerUpdater = AtomicIntegerFieldUpdater.newUpdater(AtomicBTreeColumns.class, "wasteTracker");
+
+    private static final Function<Cell, CellName> NAME = new Function<Cell, CellName>()
+    {
+        public CellName apply(Cell column)
+        {
+            return column.name();
+        }
+    };
+
+    public static final Factory<AtomicBTreeColumns> factory = new Factory<AtomicBTreeColumns>()
+    {
+        public AtomicBTreeColumns create(CFMetaData metadata, boolean insertReversed, int initialCapacity)
+        {
+            if (insertReversed)
+                throw new IllegalArgumentException();
+            return new AtomicBTreeColumns(metadata);
+        }
+    };
+
+    private static final DeletionInfo LIVE = DeletionInfo.live();
+    // This is a small optimization: DeletionInfo is mutable, but we know that we will always copy it in this class,
+    // so we can safely alias one DeletionInfo.live() reference and avoid some allocations.
+    private static final Holder EMPTY = new Holder(BTree.empty(), LIVE);
+
+    private volatile Holder ref;
+
+    private static final AtomicReferenceFieldUpdater<AtomicBTreeColumns, Holder> refUpdater = AtomicReferenceFieldUpdater.newUpdater(AtomicBTreeColumns.class, Holder.class, "ref");
+
+    private AtomicBTreeColumns(CFMetaData metadata)
+    {
+        this(metadata, EMPTY);
+    }
+
+    private AtomicBTreeColumns(CFMetaData metadata, Holder holder)
+    {
+        super(metadata);
+        this.ref = holder;
+    }
+
+    public Factory getFactory()
+    {
+        return factory;
+    }
+
+    public ColumnFamily cloneMe()
+    {
+        return new AtomicBTreeColumns(metadata, ref);
+    }
+
+    public DeletionInfo deletionInfo()
+    {
+        return ref.deletionInfo;
+    }
+
+    public void delete(DeletionTime delTime)
+    {
+        delete(new DeletionInfo(delTime));
+    }
+
+    protected void delete(RangeTombstone tombstone)
+    {
+        delete(new DeletionInfo(tombstone, getComparator()));
+    }
+
+    public void delete(DeletionInfo info)
+    {
+        if (info.isLive())
+            return;
+
+        // Keeping deletion info for max markedForDeleteAt value
+        while (true)
+        {
+            Holder current = ref;
+            DeletionInfo curDelInfo = current.deletionInfo;
+            DeletionInfo newDelInfo = info.mayModify(curDelInfo) ? curDelInfo.copy().add(info) : curDelInfo;
+            if (refUpdater.compareAndSet(this, current, current.with(newDelInfo)))
+                break;
+        }
+    }
+
+    public void setDeletionInfo(DeletionInfo newInfo)
+    {
+        ref = ref.with(newInfo);
+    }
+
+    public void purgeTombstones(int gcBefore)
+    {
+        while (true)
+        {
+            Holder current = ref;
+            if (!current.deletionInfo.hasPurgeableTombstones(gcBefore))
+                break;
+
+            DeletionInfo purgedInfo = current.deletionInfo.copy();
+            purgedInfo.purge(gcBefore);
+            if (refUpdater.compareAndSet(this, current, current.with(purgedInfo)))
+                break;
+        }
+    }
+
+    /**
+     * This is only called by Memtable.resolve, so only AtomicBTreeColumns needs to implement it.
+     *
+     * @return the difference in size seen after merging the given columns
+     */
+    public long addAllWithSizeDelta(final ColumnFamily cm, MemtableAllocator allocator, OpOrder.Group writeOp, Updater indexer)
+    {
+        ColumnUpdater updater = new ColumnUpdater(this, cm.metadata, allocator, writeOp, indexer);
+        DeletionInfo inputDeletionInfoCopy = null;
+
+        boolean monitorOwned = false;
+        try
+        {
+            if (usePessimisticLocking())
+            {
+                Locks.monitorEnterUnsafe(this);
+                monitorOwned = true;
+            }
+            while (true)
+            {
+                Holder current = ref;
+                updater.ref = current;
+                updater.reset();
+
+                DeletionInfo deletionInfo;
+                if (cm.deletionInfo().mayModify(current.deletionInfo))
+                {
+                    if (inputDeletionInfoCopy == null)
+                        inputDeletionInfoCopy = cm.deletionInfo().copy(HeapAllocator.instance);
+
+                    deletionInfo = current.deletionInfo.copy().add(inputDeletionInfoCopy);
+                    updater.allocated(deletionInfo.unsharedHeapSize() - current.deletionInfo.unsharedHeapSize());
+                }
+                else
+                {
+                    deletionInfo = current.deletionInfo;
+                }
+
+                Object[] tree = BTree.update(current.tree, metadata.comparator.columnComparator(Memtable.MEMORY_POOL instanceof NativePool), cm, cm.getColumnCount(), true, updater);
+
+                if (tree != null && refUpdater.compareAndSet(this, current, new Holder(tree, deletionInfo)))
+                {
+                    indexer.updateRowLevelIndexes();
+                    updater.finish();
+                    return updater.dataSize;
+                }
+                else if (!monitorOwned)
+                {
+                    boolean shouldLock = usePessimisticLocking();
+                    if (!shouldLock)
+                    {
+                        shouldLock = updateWastedAllocationTracker(updater.heapSize);
+                    }
+                    if (shouldLock)
+                    {
+                        Locks.monitorEnterUnsafe(this);
+                        monitorOwned = true;
+                    }
+                }
+            }
+        }
+        finally
+        {
+            if (monitorOwned)
+                Locks.monitorExitUnsafe(this);
+        }
+    }
+
+    boolean usePessimisticLocking()
+    {
+        return wasteTracker == TRACKER_PESSIMISTIC_LOCKING;
+    }
+
+    /**
+     * Update the wasted allocation tracker state based on newly wasted allocation information
+     *
+     * @param wastedBytes the number of bytes wasted by this thread
+     * @return true if the caller should now proceed with pessimistic locking because the waste limit has been reached
+     */
+    private boolean updateWastedAllocationTracker(long wastedBytes)
+    {
+        // Early check for huge allocation that exceeds the limit
+        if (wastedBytes < EXCESS_WASTE_BYTES)
+        {
+            // We round up to ensure work < granularity is still accounted for
+            int wastedAllocation = ((int) (wastedBytes + ALLOCATION_GRANULARITY_BYTES - 1)) / ALLOCATION_GRANULARITY_BYTES;
+
+            int oldTrackerValue;
+            while (TRACKER_PESSIMISTIC_LOCKING != (oldTrackerValue = wasteTracker))
+            {
+                // Note this time value has an arbitrary offset, but is a constant rate 32 bit counter (that may wrap)
+                int time = (int) (System.nanoTime() >>> CLOCK_SHIFT);
+                int delta = oldTrackerValue - time;
+                if (oldTrackerValue == TRACKER_NEVER_WASTED || delta >= 0 || delta < -EXCESS_WASTE_OFFSET)
+                    delta = -EXCESS_WASTE_OFFSET;
+                delta += wastedAllocation;
+                if (delta >= 0)
+                    break;
+                if (wasteTrackerUpdater.compareAndSet(this, oldTrackerValue, avoidReservedValues(time + delta)))
+                    return false;
+            }
+        }
+        // We have definitely reached our waste limit so set the state if it isn't already
+        wasteTrackerUpdater.set(this, TRACKER_PESSIMISTIC_LOCKING);
+        // And tell the caller to proceed with pessimistic locking
+        return true;
+    }
+
+    private static int avoidReservedValues(int wasteTracker)
+    {
+        if (wasteTracker == TRACKER_NEVER_WASTED || wasteTracker == TRACKER_PESSIMISTIC_LOCKING)
+            return wasteTracker + 1;
+        return wasteTracker;
+    }
+
+    // no particular reason not to implement these next methods, we just haven't needed them yet
+
+    public void addColumn(Cell column)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void addAll(ColumnFamily cf)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void clear()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public Cell getColumn(CellName name)
+    {
+        return (Cell) BTree.find(ref.tree, asymmetricComparator(), name);
+    }
+
+    private Comparator<Object> asymmetricComparator()
+    {
+        return metadata.comparator.asymmetricColumnComparator(Memtable.MEMORY_POOL instanceof NativePool);
+    }
+
+    public Iterable<CellName> getColumnNames()
+    {
+        return collection(false, NAME);
+    }
+
+    public Collection<Cell> getSortedColumns()
+    {
+        return collection(true, Functions.<Cell>identity());
+    }
+
+    public Collection<Cell> getReverseSortedColumns()
+    {
+        return collection(false, Functions.<Cell>identity());
+    }
+
+    private <V> Collection<V> collection(final boolean forwards, final Function<Cell, V> f)
+    {
+        final Holder ref = this.ref;
+        return new AbstractCollection<V>()
+        {
+            public Iterator<V> iterator()
+            {
+                return Iterators.transform(BTree.<Cell>slice(ref.tree, forwards), f);
+            }
+
+            public int size()
+            {
+                return BTree.slice(ref.tree, true).count();
+            }
+        };
+    }
+
+    public int getColumnCount()
+    {
+        return BTree.slice(ref.tree, true).count();
+    }
+
+    public boolean hasColumns()
+    {
+        return !BTree.isEmpty(ref.tree);
+    }
+
+    public Iterator<Cell> iterator(ColumnSlice[] slices)
+    {
+        return slices.length == 1
+             ? slice(ref.tree, asymmetricComparator(), slices[0].start, slices[0].finish, true)
+             : new SliceIterator(ref.tree, asymmetricComparator(), true, slices);
+    }
+
+    public Iterator<Cell> reverseIterator(ColumnSlice[] slices)
+    {
+        return slices.length == 1
+             ? slice(ref.tree, asymmetricComparator(), slices[0].finish, slices[0].start, false)
+             : new SliceIterator(ref.tree, asymmetricComparator(), false, slices);
+    }
+
+    public boolean isInsertReversed()
+    {
+        return false;
+    }
+
+    private static final class Holder
+    {
+        final DeletionInfo deletionInfo;
+        // the btree of columns
+        final Object[] tree;
+
+        Holder(Object[] tree, DeletionInfo deletionInfo)
+        {
+            this.tree = tree;
+            this.deletionInfo = deletionInfo;
+        }
+
+        Holder with(DeletionInfo info)
+        {
+            return new Holder(this.tree, info);
+        }
+    }
+
+    // the function we provide to the btree utilities to perform any column replacements
+    private static final class ColumnUpdater implements UpdateFunction<Cell>
+    {
+        final AtomicBTreeColumns updating;
+        final CFMetaData metadata;
+        final MemtableAllocator allocator;
+        final OpOrder.Group writeOp;
+        final Updater indexer;
+        Holder ref;
+        long dataSize;
+        long heapSize;
+        final MemtableAllocator.DataReclaimer reclaimer;
+        List<Cell> inserted; // TODO: replace with walk of aborted BTree
+
+        private ColumnUpdater(AtomicBTreeColumns updating, CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group writeOp, Updater indexer)
+        {
+            this.updating = updating;
+            this.allocator = allocator;
+            this.writeOp = writeOp;
+            this.indexer = indexer;
+            this.metadata = metadata;
+            this.reclaimer = allocator.reclaimer();
+        }
+
+        public Cell apply(Cell insert)
+        {
+            indexer.insert(insert);
+            insert = insert.localCopy(metadata, allocator, writeOp);
+            this.dataSize += insert.cellDataSize();
+            this.heapSize += insert.unsharedHeapSizeExcludingData();
+            if (inserted == null)
+                inserted = new ArrayList<>();
+            inserted.add(insert);
+            return insert;
+        }
+
+        public Cell apply(Cell existing, Cell update)
+        {
+            Cell reconciled = existing.reconcile(update);
+            indexer.update(existing, reconciled);
+            if (existing != reconciled)
+            {
+                reconciled = reconciled.localCopy(metadata, allocator, writeOp);
+                dataSize += reconciled.cellDataSize() - existing.cellDataSize();
+                heapSize += reconciled.unsharedHeapSizeExcludingData() - existing.unsharedHeapSizeExcludingData();
+                if (inserted == null)
+                    inserted = new ArrayList<>();
+                inserted.add(reconciled);
+                discard(existing);
+            }
+            return reconciled;
+        }
+
+        protected void reset()
+        {
+            this.dataSize = 0;
+            this.heapSize = 0;
+            if (inserted != null)
+            {
+                for (Cell cell : inserted)
+                    abort(cell);
+                inserted.clear();
+            }
+            reclaimer.cancel();
+        }
+
+        protected void abort(Cell abort)
+        {
+            reclaimer.reclaimImmediately(abort);
+        }
+
+        protected void discard(Cell discard)
+        {
+            reclaimer.reclaim(discard);
+        }
+
+        public boolean abortEarly()
+        {
+            return updating.ref != ref;
+        }
+
+        public void allocated(long heapSize)
+        {
+            this.heapSize += heapSize;
+        }
+
+        protected void finish()
+        {
+            allocator.onHeap().allocate(heapSize, writeOp);
+            reclaimer.commit();
+        }
+    }
+
+    private static class SliceIterator extends AbstractIterator<Cell>
+    {
+        private final Object[] btree;
+        private final boolean forwards;
+        private final Comparator<Object> comparator;
+        private final ColumnSlice[] slices;
+
+        private int idx = 0;
+        private Iterator<Cell> currentSlice;
+
+        SliceIterator(Object[] btree, Comparator<Object> comparator, boolean forwards, ColumnSlice[] slices)
+        {
+            this.btree = btree;
+            this.comparator = comparator;
+            this.slices = slices;
+            this.forwards = forwards;
+        }
+
+        protected Cell computeNext()
+        {
+            if (currentSlice == null)
+            {
+                if (idx >= slices.length)
+                    return endOfData();
+
+                ColumnSlice slice = slices[idx++];
+                if (forwards)
+                    currentSlice = slice(btree, comparator, slice.start, slice.finish, true);
+                else
+                    currentSlice = slice(btree, comparator, slice.finish, slice.start, false);
+            }
+
+            if (currentSlice.hasNext())
+                return currentSlice.next();
+
+            currentSlice = null;
+            return computeNext();
+        }
+    }
+
+    private static Iterator<Cell> slice(Object[] btree, Comparator<Object> comparator, Composite start, Composite finish, boolean forwards)
+    {
+        return BTree.slice(btree,
+                           comparator,
+                           start.isEmpty() ? null : start,
+                           true,
+                           finish.isEmpty() ? null : finish,
+                           true,
+                           forwards);
+    }
+}
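
The waste-tracking constants above encode an acceptable rate of wasted allocation; a small self-contained sketch (not part of the patch, using the constant values from this file) shows where the 7.63KB/ms and 7.45MB/s figures come from:

    public class WasteRateSketch
    {
        public static void main(String[] args)
        {
            int allocationGranularityBytes = 1024;       // ALLOCATION_GRANULARITY_BYTES
            long clockGranularityNanos = 1L << 17;       // 1ns << CLOCK_SHIFT, ~131us per tracker tick
            double bytesPerMilli = allocationGranularityBytes / (clockGranularityNanos / 1_000_000.0);
            System.out.printf("acceptable waste ~= %.2f KB/ms, %.2f MB/s%n",
                              bytesPerMilli / 1024,
                              bytesPerMilli * 1000 / (1024 * 1024));
            // prints roughly 7.63 KB/ms and 7.45 MB/s
        }
    }
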
diff --git a/src/java/org/apache/cassandra/db/AtomicSortedColumns.java b/src/java/org/apache/cassandra/db/AtomicSortedColumns.java
deleted file mode 100644
index cacd3bb..0000000
--- a/src/java/org/apache/cassandra/db/AtomicSortedColumns.java
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.atomic.AtomicReference;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-
-import edu.stanford.ppl.concurrent.SnapTreeMap;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.Allocator;
-
-/**
- * A thread-safe and atomic ISortedColumns implementation.
- * Operations (in particular addAll) on this implemenation are atomic and
- * isolated (in the sense of ACID). Typically a addAll is guaranteed that no
- * other thread can see the state where only parts but not all columns have
- * been added.
- *
- * The implementation uses snaptree (https://github.com/nbronson/snaptree),
- * and in particular it's copy-on-write clone operation to achieve its
- * atomicity guarantee.
- *
- * WARNING: removing element through getSortedColumns().iterator() is *not*
- * isolated of other operations and could actually be fully ignored in the
- * face of a concurrent. Don't use it unless in a non-concurrent context.
- */
-public class AtomicSortedColumns extends ColumnFamily
-{
-    private final AtomicReference<Holder> ref;
-
-    public static final ColumnFamily.Factory<AtomicSortedColumns> factory = new Factory<AtomicSortedColumns>()
-    {
-        public AtomicSortedColumns create(CFMetaData metadata, boolean insertReversed)
-        {
-            return new AtomicSortedColumns(metadata);
-        }
-    };
-
-    private AtomicSortedColumns(CFMetaData metadata)
-    {
-        this(metadata, new Holder(metadata.comparator));
-    }
-
-    private AtomicSortedColumns(CFMetaData metadata, Holder holder)
-    {
-        super(metadata);
-        this.ref = new AtomicReference<>(holder);
-    }
-
-    public AbstractType<?> getComparator()
-    {
-        return (AbstractType<?>)ref.get().map.comparator();
-    }
-
-    public ColumnFamily.Factory getFactory()
-    {
-        return factory;
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new AtomicSortedColumns(metadata, ref.get().cloneMe());
-    }
-
-    public DeletionInfo deletionInfo()
-    {
-        return ref.get().deletionInfo;
-    }
-
-    public void delete(DeletionTime delTime)
-    {
-        delete(new DeletionInfo(delTime));
-    }
-
-    protected void delete(RangeTombstone tombstone)
-    {
-        delete(new DeletionInfo(tombstone, getComparator()));
-    }
-
-    public void delete(DeletionInfo info)
-    {
-        if (info.isLive())
-            return;
-
-        // Keeping deletion info for max markedForDeleteAt value
-        while (true)
-        {
-            Holder current = ref.get();
-            DeletionInfo newDelInfo = current.deletionInfo.copy().add(info);
-            if (ref.compareAndSet(current, current.with(newDelInfo)))
-                break;
-        }
-    }
-
-    public void setDeletionInfo(DeletionInfo newInfo)
-    {
-        ref.set(ref.get().with(newInfo));
-    }
-
-    public void purgeTombstones(int gcBefore)
-    {
-        while (true)
-        {
-            Holder current = ref.get();
-            if (!current.deletionInfo.hasPurgeableTombstones(gcBefore))
-                break;
-
-            DeletionInfo purgedInfo = current.deletionInfo.copy();
-            purgedInfo.purge(gcBefore);
-            if (ref.compareAndSet(current, current.with(purgedInfo)))
-                break;
-        }
-    }
-
-    public void addColumn(Column column, Allocator allocator)
-    {
-        Holder current, modified;
-        do
-        {
-            current = ref.get();
-            modified = current.cloneMe();
-            modified.addColumn(column, allocator, SecondaryIndexManager.nullUpdater);
-        }
-        while (!ref.compareAndSet(current, modified));
-    }
-
-    public void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation)
-    {
-        addAllWithSizeDelta(cm, allocator, transformation, SecondaryIndexManager.nullUpdater);
-    }
-
-    /**
-     *  This is only called by Memtable.resolve, so only AtomicSortedColumns needs to implement it.
-     *
-     *  @return the difference in size seen after merging the given columns
-     */
-    public long addAllWithSizeDelta(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation, SecondaryIndexManager.Updater indexer)
-    {
-        /*
-         * This operation needs to atomicity and isolation. To that end, we
-         * add the new column to a copy of the map (a cheap O(1) snapTree
-         * clone) and atomically compare and swap when everything has been
-         * added. Of course, we must not forget to update the deletion times
-         * too.
-         * In case we are adding a lot of columns, failing the final compare
-         * and swap could be expensive. To mitigate, we check we haven't been
-         * beaten by another thread after every column addition. If we have,
-         * we bail early, avoiding unnecessary work if possible.
-         */
-        Holder current, modified;
-        long sizeDelta;
-
-        main_loop:
-        do
-        {
-            sizeDelta = 0;
-            current = ref.get();
-            DeletionInfo newDelInfo = current.deletionInfo;
-            if (cm.deletionInfo().mayModify(newDelInfo))
-            {
-                newDelInfo = current.deletionInfo.copy().add(cm.deletionInfo());
-                sizeDelta += newDelInfo.dataSize() - current.deletionInfo.dataSize();
-            }
-            modified = new Holder(current.map.clone(), newDelInfo);
-
-            for (Column column : cm)
-            {
-                sizeDelta += modified.addColumn(transformation.apply(column), allocator, indexer);
-                // bail early if we know we've been beaten
-                if (ref.get() != current)
-                    continue main_loop;
-            }
-        }
-        while (!ref.compareAndSet(current, modified));
-
-        indexer.updateRowLevelIndexes();
-
-        return sizeDelta;
-    }
-
-    public boolean replace(Column oldColumn, Column newColumn)
-    {
-        if (!oldColumn.name().equals(newColumn.name()))
-            throw new IllegalArgumentException();
-
-        Holder current, modified;
-        boolean replaced;
-        do
-        {
-            current = ref.get();
-            modified = current.cloneMe();
-            replaced = modified.map.replace(oldColumn.name(), oldColumn, newColumn);
-        }
-        while (!ref.compareAndSet(current, modified));
-        return replaced;
-    }
-
-    public void clear()
-    {
-        Holder current, modified;
-        do
-        {
-            current = ref.get();
-            modified = current.clear();
-        }
-        while (!ref.compareAndSet(current, modified));
-    }
-
-    public Column getColumn(ByteBuffer name)
-    {
-        return ref.get().map.get(name);
-    }
-
-    public SortedSet<ByteBuffer> getColumnNames()
-    {
-        return ref.get().map.keySet();
-    }
-
-    public Collection<Column> getSortedColumns()
-    {
-        return ref.get().map.values();
-    }
-
-    public Collection<Column> getReverseSortedColumns()
-    {
-        return ref.get().map.descendingMap().values();
-    }
-
-    public int getColumnCount()
-    {
-        return ref.get().map.size();
-    }
-
-    public Iterator<Column> iterator(ColumnSlice[] slices)
-    {
-        return new ColumnSlice.NavigableMapIterator(ref.get().map, slices);
-    }
-
-    public Iterator<Column> reverseIterator(ColumnSlice[] slices)
-    {
-        return new ColumnSlice.NavigableMapIterator(ref.get().map.descendingMap(), slices);
-    }
-
-    public boolean isInsertReversed()
-    {
-        return false;
-    }
-
-    private static class Holder
-    {
-        // This is a small optimization: DeletionInfo is mutable, but we know that we will always copy it in that class,
-        // so we can safely alias one DeletionInfo.live() reference and avoid some allocations.
-        private static final DeletionInfo LIVE = DeletionInfo.live();
-
-        final SnapTreeMap<ByteBuffer, Column> map;
-        final DeletionInfo deletionInfo;
-
-        Holder(AbstractType<?> comparator)
-        {
-            this(new SnapTreeMap<ByteBuffer, Column>(comparator), LIVE);
-        }
-
-        Holder(SnapTreeMap<ByteBuffer, Column> map, DeletionInfo deletionInfo)
-        {
-            this.map = map;
-            this.deletionInfo = deletionInfo;
-        }
-
-        Holder cloneMe()
-        {
-            return with(map.clone());
-        }
-
-        Holder with(DeletionInfo info)
-        {
-            return new Holder(map, info);
-        }
-
-        Holder with(SnapTreeMap<ByteBuffer, Column> newMap)
-        {
-            return new Holder(newMap, deletionInfo);
-        }
-
-        // There is no point in cloning the underlying map to clear it
-        // afterwards.
-        Holder clear()
-        {
-            return new Holder(new SnapTreeMap<ByteBuffer, Column>(map.comparator()), LIVE);
-        }
-
-        long addColumn(Column column, Allocator allocator, SecondaryIndexManager.Updater indexer)
-        {
-            ByteBuffer name = column.name();
-            while (true)
-            {
-                Column oldColumn = map.putIfAbsent(name, column);
-                if (oldColumn == null)
-                {
-                    indexer.insert(column);
-                    return column.dataSize();
-                }
-
-                Column reconciledColumn = column.reconcile(oldColumn, allocator);
-                if (map.replace(name, oldColumn, reconciledColumn))
-                {
-                    indexer.update(oldColumn, reconciledColumn);
-                    return reconciledColumn.dataSize() - oldColumn.dataSize();
-                }
-                // We failed to replace column due to a concurrent update or a concurrent removal. Keep trying.
-                // (Currently, concurrent removal should not happen (only updates), but let us support that anyway.)
-            }
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/BatchlogManager.java b/src/java/org/apache/cassandra/db/BatchlogManager.java
index 48f4c3c..18d9a17 100644
--- a/src/java/org/apache/cassandra/db/BatchlogManager.java
+++ b/src/java/org/apache/cassandra/db/BatchlogManager.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
 import java.net.InetAddress;
@@ -30,7 +29,7 @@
 import javax.management.ObjectName;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.Iterables;
+import com.google.common.collect.*;
 import com.google.common.util.concurrent.RateLimiter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -38,18 +37,17 @@
 import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.io.util.FastByteArrayOutputStream;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.service.StorageService;
@@ -58,6 +56,8 @@
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.WrappedRunnable;
 
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+
 public class BatchlogManager implements BatchlogManagerMBean
 {
     private static final String MBEAN_NAME = "org.apache.cassandra.db:type=BatchlogManager";
@@ -97,7 +97,8 @@
 
     public int countAllBatches()
     {
-        return (int) process("SELECT count(*) FROM %s.%s", Keyspace.SYSTEM_KS, SystemKeyspace.BATCHLOG_CF).one().getLong("count");
+        String query = String.format("SELECT count(*) FROM %s.%s", Keyspace.SYSTEM_KS, SystemKeyspace.BATCHLOG_CF);
+        return (int) executeInternal(query).one().getLong("count");
     }
 
     public long getTotalBatchesReplayed()
@@ -123,42 +124,38 @@
         return batchlogTasks.submit(runnable);
     }
 
-    public static RowMutation getBatchlogMutationFor(Collection<RowMutation> mutations, UUID uuid)
+    public static Mutation getBatchlogMutationFor(Collection<Mutation> mutations, UUID uuid, int version)
     {
-        return getBatchlogMutationFor(mutations, uuid, FBUtilities.timestampMicros());
+        return getBatchlogMutationFor(mutations, uuid, version, FBUtilities.timestampMicros());
     }
 
     @VisibleForTesting
-    static RowMutation getBatchlogMutationFor(Collection<RowMutation> mutations, UUID uuid, long now)
+    static Mutation getBatchlogMutationFor(Collection<Mutation> mutations, UUID uuid, int version, long now)
     {
-        ByteBuffer writtenAt = LongType.instance.decompose(now / 1000);
-        ByteBuffer data = serializeRowMutations(mutations);
-
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(CFMetaData.BatchlogCf);
-        cf.addColumn(new Column(columnName(""), ByteBufferUtil.EMPTY_BYTE_BUFFER, now));
-        cf.addColumn(new Column(columnName("data"), data, now));
-        cf.addColumn(new Column(columnName("written_at"), writtenAt, now));
-
-        return new RowMutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(uuid), cf);
+        CFRowAdder adder = new CFRowAdder(cf, CFMetaData.BatchlogCf.comparator.builder().build(), now);
+        adder.add("data", serializeMutations(mutations, version))
+             .add("written_at", new Date(now / 1000))
+             .add("version", version);
+        return new Mutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(uuid), cf);
     }
 
-    private static ByteBuffer serializeRowMutations(Collection<RowMutation> mutations)
+    private static ByteBuffer serializeMutations(Collection<Mutation> mutations, int version)
     {
-        FastByteArrayOutputStream bos = new FastByteArrayOutputStream();
-        DataOutputStream out = new DataOutputStream(bos);
+        DataOutputBuffer buf = new DataOutputBuffer();
 
         try
         {
-            out.writeInt(mutations.size());
-            for (RowMutation rm : mutations)
-                RowMutation.serializer.serialize(rm, out, MessagingService.VERSION_12);
+            buf.writeInt(mutations.size());
+            for (Mutation mutation : mutations)
+                Mutation.serializer.serialize(mutation, buf, version);
         }
         catch (IOException e)
         {
             throw new AssertionError(); // cannot happen.
         }
 
-        return ByteBuffer.wrap(bos.toByteArray());
+        return buf.asByteBuffer();
     }
 
     private void replayAllFailedBatches() throws ExecutionException, InterruptedException
@@ -170,10 +167,10 @@
         int throttleInKB = DatabaseDescriptor.getBatchlogReplayThrottleInKB() / StorageService.instance.getTokenMetadata().getAllEndpoints().size();
         RateLimiter rateLimiter = RateLimiter.create(throttleInKB == 0 ? Double.MAX_VALUE : throttleInKB * 1024);
 
-        UntypedResultSet page = process("SELECT id, data, written_at, version FROM %s.%s LIMIT %d",
-                                        Keyspace.SYSTEM_KS,
-                                        SystemKeyspace.BATCHLOG_CF,
-                                        PAGE_SIZE);
+        UntypedResultSet page = executeInternal(String.format("SELECT id, data, written_at, version FROM %s.%s LIMIT %d",
+                                                              Keyspace.SYSTEM_KS,
+                                                              SystemKeyspace.BATCHLOG_CF,
+                                                              PAGE_SIZE));
 
         while (!page.isEmpty())
         {
@@ -182,11 +179,11 @@
             if (page.size() < PAGE_SIZE)
                 break; // we've exhausted the batchlog, next query would be empty.
 
-            page = process("SELECT id, data, written_at, version FROM %s.%s WHERE token(id) > token(%s) LIMIT %d",
-                           Keyspace.SYSTEM_KS,
-                           SystemKeyspace.BATCHLOG_CF,
-                           id,
-                           PAGE_SIZE);
+            page = executeInternal(String.format("SELECT id, data, written_at, version FROM %s.%s WHERE token(id) > token(?) LIMIT %d",
+                                                 Keyspace.SYSTEM_KS,
+                                                 SystemKeyspace.BATCHLOG_CF,
+                                                 PAGE_SIZE),
+                                   id);
         }
 
         cleanup();
@@ -194,21 +191,59 @@
         logger.debug("Finished replayAllFailedBatches");
     }
 
-    // returns the UUID of the last seen batch
+    private void deleteBatch(UUID id)
+    {
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(id));
+        mutation.delete(SystemKeyspace.BATCHLOG_CF, FBUtilities.timestampMicros());
+        mutation.apply();
+    }
+
     private UUID processBatchlogPage(UntypedResultSet page, RateLimiter rateLimiter)
     {
         UUID id = null;
+        ArrayList<Batch> batches = new ArrayList<>(page.size());
+
+        // Sending out batches for replay without waiting for them, so that one stuck batch doesn't affect others
         for (UntypedResultSet.Row row : page)
         {
             id = row.getUUID("id");
             long writtenAt = row.getLong("written_at");
-            int version = row.has("version") ? row.getInt("version") : MessagingService.VERSION_12;
             // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
             long timeout = getBatchlogTimeout();
             if (System.currentTimeMillis() < writtenAt + timeout)
                 continue; // not ready to replay yet, might still get a deletion.
-            replayBatch(id, row.getBytes("data"), writtenAt, version, rateLimiter);
+
+            int version = row.has("version") ? row.getInt("version") : MessagingService.VERSION_12;
+            Batch batch = new Batch(id, writtenAt, row.getBytes("data"), version);
+            try
+            {
+                if (batch.replay(rateLimiter) > 0)
+                {
+                    batches.add(batch);
+                }
+                else
+                {
+                    deleteBatch(id); // no write mutations were sent (either expired or all CFs involved truncated).
+                    totalBatchesReplayed.incrementAndGet();
+                }
+            }
+            catch (IOException e)
+            {
+                logger.warn("Skipped batch replay of {} due to {}", id, e);
+                deleteBatch(id);
+            }
         }
+
+        // now wait for all batches to complete their processing
+        // and schedule hints for any timed-out deliveries
+        for (Batch batch : batches)
+        {
+            batch.finish();
+            deleteBatch(batch.id);
+        }
+
+        totalBatchesReplayed.addAndGet(batches.size());
+
         return id;
     }
 
@@ -217,149 +252,191 @@
         return DatabaseDescriptor.getWriteRpcTimeout() * 2; // enough time for the actual write + BM removal mutation
     }
 
-    private void replayBatch(UUID id, ByteBuffer data, long writtenAt, int version, RateLimiter rateLimiter)
+    private static class Batch
     {
-        logger.debug("Replaying batch {}", id);
+        private final UUID id;
+        private final long writtenAt;
+        private final ByteBuffer data;
+        private final int version;
 
-        try
+        private List<ReplayWriteResponseHandler> replayHandlers;
+
+        public Batch(UUID id, long writtenAt, ByteBuffer data, int version)
         {
-            replaySerializedMutations(data, writtenAt, version, rateLimiter);
-        }
-        catch (IOException e)
-        {
-            logger.warn("Skipped batch replay of {} due to {}", id, e);
+            this.id = id;
+            this.writtenAt = writtenAt;
+            this.data = data;
+            this.version = version;
         }
 
-        deleteBatch(id);
-
-        totalBatchesReplayed.incrementAndGet();
-    }
-
-    private void deleteBatch(UUID id)
-    {
-        RowMutation mutation = new RowMutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(id));
-        mutation.delete(SystemKeyspace.BATCHLOG_CF, FBUtilities.timestampMicros());
-        mutation.apply();
-    }
-
-    private void replaySerializedMutations(ByteBuffer data, long writtenAt, int version, RateLimiter rateLimiter) throws IOException
-    {
-        DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(data));
-        int size = in.readInt();
-        List<RowMutation> mutations = new ArrayList<>(size);
-
-        for (int i = 0; i < size; i++)
+        public int replay(RateLimiter rateLimiter) throws IOException
         {
-            RowMutation mutation = RowMutation.serializer.deserialize(in, version);
+            logger.debug("Replaying batch {}", id);
 
-            // Remove CFs that have been truncated since. writtenAt and SystemTable#getTruncatedAt() both return millis.
-            // We don't abort the replay entirely b/c this can be considered a succes (truncated is same as delivered then
-            // truncated.
-            for (UUID cfId : mutation.getColumnFamilyIds())
-                if (writtenAt <= SystemKeyspace.getTruncatedAt(cfId))
-                    mutation = mutation.without(cfId);
+            List<Mutation> mutations = replayingMutations();
 
-            if (!mutation.isEmpty())
-                mutations.add(mutation);
+            if (mutations.isEmpty())
+                return 0;
+
+            int ttl = calculateHintTTL(mutations);
+            if (ttl <= 0)
+                return 0;
+
+            replayHandlers = sendReplays(mutations, writtenAt, ttl);
+
+            rateLimiter.acquire(data.remaining()); // acquire afterwards, to not mess up ttl calculation.
+
+            return replayHandlers.size();
         }
 
-        if (!mutations.isEmpty())
-            replayMutations(mutations, writtenAt, version, rateLimiter);
-    }
-
-    /*
-     * We try to deliver the mutations to the replicas ourselves if they are alive and only resort to writing hints
-     * when a replica is down or a write request times out.
-     */
-    private void replayMutations(List<RowMutation> mutations, long writtenAt, int version, RateLimiter rateLimiter) throws IOException
-    {
-        int ttl = calculateHintTTL(mutations, writtenAt);
-        if (ttl <= 0)
-            return; // this batchlog entry has 'expired'
-
-        List<InetAddress> liveEndpoints = new ArrayList<>();
-        List<InetAddress> hintEndpoints = new ArrayList<>();
-        
-        for (RowMutation mutation : mutations)
+        public void finish()
         {
+            for (int i = 0; i < replayHandlers.size(); i++)
+            {
+                ReplayWriteResponseHandler handler = replayHandlers.get(i);
+                try
+                {
+                    handler.get();
+                }
+                catch (WriteTimeoutException e)
+                {
+                    logger.debug("Timed out replaying a batched mutation to a node, will write a hint");
+                    // write hints for the remaining, undelivered mutations, starting from index i
+                    writeHintsForUndeliveredEndpoints(i);
+                    return;
+                }
+            }
+        }
+
+        private List<Mutation> replayingMutations() throws IOException
+        {
+            DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(data));
+            int size = in.readInt();
+            List<Mutation> mutations = new ArrayList<>(size);
+            for (int i = 0; i < size; i++)
+            {
+                Mutation mutation = Mutation.serializer.deserialize(in, version);
+
+                // Remove CFs that have been truncated since. writtenAt and SystemKeyspace#getTruncatedAt() both return millis.
+                // We don't abort the replay entirely b/c this can be considered a success (truncated is the same as delivered,
+                // then truncated).
+                for (UUID cfId : mutation.getColumnFamilyIds())
+                    if (writtenAt <= SystemKeyspace.getTruncatedAt(cfId))
+                        mutation = mutation.without(cfId);
+
+                if (!mutation.isEmpty())
+                    mutations.add(mutation);
+            }
+            return mutations;
+        }
+
+        private void writeHintsForUndeliveredEndpoints(int startFrom)
+        {
+            try
+            {
+                // Here we deserialize the mutations a second time from the byte buffer.
+                // This is OK because a timeout on direct batch delivery is rare
+                // (it can only happen during the few seconds before a node is marked dead),
+                // so we trade some CPU to keep fewer objects alive.
+                List<Mutation> replayingMutations = replayingMutations();
+                for (int i = startFrom; i < replayHandlers.size(); i++)
+                {
+                    Mutation undeliveredMutation = replayingMutations.get(i);
+                    int ttl = calculateHintTTL(replayingMutations);
+                    ReplayWriteResponseHandler handler = replayHandlers.get(i);
+
+                    if (ttl > 0 && handler != null)
+                        for (InetAddress endpoint : handler.undelivered)
+                            StorageProxy.writeHintForMutation(undeliveredMutation, writtenAt, ttl, endpoint);
+                }
+            }
+            catch (IOException e)
+            {
+                logger.error("Cannot schedule hints for undelivered batch", e);
+            }
+        }
+
+        private List<ReplayWriteResponseHandler> sendReplays(List<Mutation> mutations, long writtenAt, int ttl)
+        {
+            List<ReplayWriteResponseHandler> handlers = new ArrayList<>(mutations.size());
+            for (Mutation mutation : mutations)
+            {
+                ReplayWriteResponseHandler handler = sendSingleReplayMutation(mutation, writtenAt, ttl);
+                if (handler != null)
+                    handlers.add(handler);
+            }
+            return handlers;
+        }
+
+        /**
+         * We try to deliver the mutations to the replicas ourselves if they are alive and only resort to writing hints
+         * when a replica is down or a write request times out.
+         *
+         * @return direct delivery handler to wait on or null, if no live nodes found
+         */
+        private ReplayWriteResponseHandler sendSingleReplayMutation(final Mutation mutation, long writtenAt, int ttl)
+        {
+            Set<InetAddress> liveEndpoints = new HashSet<>();
             String ks = mutation.getKeyspaceName();
-            Token tk = StorageService.getPartitioner().getToken(mutation.key());
-            int mutationSize = (int) RowMutation.serializer.serializedSize(mutation, version);
+            Token<?> tk = StorageService.getPartitioner().getToken(mutation.key());
 
             for (InetAddress endpoint : Iterables.concat(StorageService.instance.getNaturalEndpoints(ks, tk),
                                                          StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, ks)))
             {
-                rateLimiter.acquire(mutationSize);
                 if (endpoint.equals(FBUtilities.getBroadcastAddress()))
                     mutation.apply();
                 else if (FailureDetector.instance.isAlive(endpoint))
                     liveEndpoints.add(endpoint); // will try delivering directly instead of writing a hint.
                 else
-                    hintEndpoints.add(endpoint);
+                    StorageProxy.writeHintForMutation(mutation, writtenAt, ttl, endpoint);
             }
 
-            if (!liveEndpoints.isEmpty())
-                hintEndpoints.addAll(attemptDirectDelivery(mutation, liveEndpoints));
+            if (liveEndpoints.isEmpty())
+                return null;
 
-            for (InetAddress endpoint : hintEndpoints)
-                StorageProxy.writeHintForMutation(mutation, writtenAt, ttl, endpoint);
-            
-            liveEndpoints.clear();
-            hintEndpoints.clear();
+            ReplayWriteResponseHandler handler = new ReplayWriteResponseHandler(liveEndpoints);
+            MessageOut<Mutation> message = mutation.createMessage();
+            for (InetAddress endpoint : liveEndpoints)
+                MessagingService.instance().sendRR(message, endpoint, handler, false);
+            return handler;
         }
-    }
 
-    // Returns the endpoints we failed to deliver to.
-    private Set<InetAddress> attemptDirectDelivery(RowMutation mutation, List<InetAddress> endpoints) throws IOException
-    {
-        final List<WriteResponseHandler> handlers = new ArrayList<>();
-        final Set<InetAddress> undelivered = Collections.synchronizedSet(new HashSet<InetAddress>());
-
-        for (final InetAddress ep : endpoints)
+        /*
+         * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+         * This ensures that deletes aren't "undone" by an old batch replay.
+         */
+        private int calculateHintTTL(Collection<Mutation> mutations)
         {
-            Runnable callback = new Runnable()
-            {
-                public void run()
-                {
-                    undelivered.remove(ep);
-                }
-            };
-            WriteResponseHandler handler = new WriteResponseHandler(ep, WriteType.UNLOGGED_BATCH, callback);
-            MessagingService.instance().sendRR(mutation.createMessage(), ep, handler, false);
-            handlers.add(handler);
+            int unadjustedTTL = Integer.MAX_VALUE;
+            for (Mutation mutation : mutations)
+                unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+            return unadjustedTTL - (int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - writtenAt);
         }
 
-        // Wait for all the requests to complete.
-        for (WriteResponseHandler handler : handlers)
+        private static class ReplayWriteResponseHandler extends WriteResponseHandler
         {
-            try
+            private final Set<InetAddress> undelivered = Collections.newSetFromMap(new ConcurrentHashMap<InetAddress, Boolean>());
+
+            public ReplayWriteResponseHandler(Collection<InetAddress> writeEndpoints)
             {
-                handler.get();
+                super(writeEndpoints, Collections.<InetAddress>emptySet(), null, null, null, WriteType.UNLOGGED_BATCH);
+                undelivered.addAll(writeEndpoints);
             }
-            catch (WriteTimeoutException e)
+
+            @Override
+            protected int totalBlockFor()
             {
-                logger.debug("Timed out replaying a batched mutation to a node, will write a hint");
+                return this.naturalEndpoints.size();
+            }
+
+            @Override
+            public void response(MessageIn m)
+            {
+                boolean removed = undelivered.remove(m.from);
+                assert removed;
+                super.response(m);
             }
         }
-
-        return undelivered;
-    }
-
-    /*
-     * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-     * This ensures that deletes aren't "undone" by an old batch replay.
-     */
-    private int calculateHintTTL(List<RowMutation> mutations, long writtenAt)
-    {
-        int unadjustedTTL = Integer.MAX_VALUE;
-        for (RowMutation mutation : mutations)
-            unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-        return unadjustedTTL - (int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - writtenAt);
-    }
-
-    private static ByteBuffer columnName(String name)
-    {
-        return CFMetaData.BatchlogCf.getCfDef().getColumnNameBuilder().add(UTF8Type.instance.decompose(name)).build();
     }
 
     // force flush + compaction to reclaim space from the replayed batches
@@ -374,8 +451,81 @@
             CompactionManager.instance.submitUserDefined(cfs, descriptors, Integer.MAX_VALUE).get();
     }
 
-    private static UntypedResultSet process(String format, Object... args)
+    public static class EndpointFilter
     {
-        return QueryProcessor.processInternal(String.format(format, args));
+        private final String localRack;
+        private final Multimap<String, InetAddress> endpoints;
+
+        public EndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
+        {
+            this.localRack = localRack;
+            this.endpoints = endpoints;
+        }
+
+        /**
+         * @return list of candidates for batchlog hosting. If possible these will be two nodes from different racks.
+         */
+        public Collection<InetAddress> filter()
+        {
+            // special case for single-node data centers
+            if (endpoints.values().size() == 1)
+                return endpoints.values();
+
+            // strip out dead endpoints and localhost
+            ListMultimap<String, InetAddress> validated = ArrayListMultimap.create();
+            for (Map.Entry<String, InetAddress> entry : endpoints.entries())
+                if (isValid(entry.getValue()))
+                    validated.put(entry.getKey(), entry.getValue());
+
+            if (validated.size() <= 2)
+                return validated.values();
+
+            if (validated.size() - validated.get(localRack).size() >= 2)
+            {
+                // we have enough endpoints in other racks
+                validated.removeAll(localRack);
+            }
+
+            if (validated.keySet().size() == 1)
+            {
+                // we have only 1 `other` rack
+                Collection<InetAddress> otherRack = Iterables.getOnlyElement(validated.asMap().values());
+                return Lists.newArrayList(Iterables.limit(otherRack, 2));
+            }
+
+            // randomize which racks we pick from if more than 2 remaining
+            Collection<String> racks;
+            if (validated.keySet().size() == 2)
+            {
+                racks = validated.keySet();
+            }
+            else
+            {
+                racks = Lists.newArrayList(validated.keySet());
+                Collections.shuffle((List) racks);
+            }
+
+            // grab a random member of up to two racks
+            List<InetAddress> result = new ArrayList<>(2);
+            for (String rack : Iterables.limit(racks, 2))
+            {
+                List<InetAddress> rackMembers = validated.get(rack);
+                result.add(rackMembers.get(getRandomInt(rackMembers.size())));
+            }
+
+            return result;
+        }
+
+        @VisibleForTesting
+        protected boolean isValid(InetAddress input)
+        {
+            return !input.equals(FBUtilities.getBroadcastAddress()) && FailureDetector.instance.isAlive(input);
+        }
+
+        @VisibleForTesting
+        protected int getRandomInt(int bound)
+        {
+            return ThreadLocalRandom.current().nextInt(bound);
+        }
     }
 }
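
For readers skimming the diff: the new EndpointFilter above prefers two batchlog replicas in two different racks, skipping dead nodes, the local node, and (when possible) the local rack. A minimal standalone sketch of that selection, using plain JDK collections and String hosts instead of Guava multimaps and InetAddress, with the liveness/self filtering omitted; the class and method names below are hypothetical, not part of the patch:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ThreadLocalRandom;

    // Simplified analogue of EndpointFilter.filter(): given live endpoints grouped by rack
    // (every listed rack is assumed to have at least one member), prefer one endpoint from
    // each of two distinct racks, dropping the local rack when enough other-rack candidates exist.
    public class BatchlogCandidateSketch
    {
        public static List<String> pickCandidates(String localRack, Map<String, List<String>> liveByRack)
        {
            int total = 0;
            for (List<String> members : liveByRack.values())
                total += members.size();

            if (total <= 2)
            {
                List<String> all = new ArrayList<>();
                for (List<String> members : liveByRack.values())
                    all.addAll(members);
                return all;
            }

            Map<String, List<String>> candidates = new HashMap<>(liveByRack);
            List<String> local = candidates.get(localRack);
            if (local != null && total - local.size() >= 2)
                candidates.remove(localRack); // enough endpoints outside the local rack

            if (candidates.size() == 1)
            {
                // only one rack left: take up to two of its members
                List<String> only = candidates.values().iterator().next();
                return new ArrayList<>(only.subList(0, Math.min(2, only.size())));
            }

            // otherwise pick one random member from each of two (randomly ordered) racks
            List<String> racks = new ArrayList<>(candidates.keySet());
            Collections.shuffle(racks);
            List<String> result = new ArrayList<>(2);
            for (String rack : racks.subList(0, 2))
            {
                List<String> members = candidates.get(rack);
                result.add(members.get(ThreadLocalRandom.current().nextInt(members.size())));
            }
            return result;
        }

        public static void main(String[] args)
        {
            Map<String, List<String>> live = new HashMap<>();
            live.put("rack1", Arrays.asList("10.0.0.1", "10.0.0.2"));
            live.put("rack2", Arrays.asList("10.0.1.1"));
            live.put("rack3", Arrays.asList("10.0.2.1"));
            System.out.println(pickCandidates("rack1", live));
        }
    }
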
diff --git a/src/java/org/apache/cassandra/db/BlacklistedDirectories.java b/src/java/org/apache/cassandra/db/BlacklistedDirectories.java
index 999483f..de6c968 100644
--- a/src/java/org/apache/cassandra/db/BlacklistedDirectories.java
+++ b/src/java/org/apache/cassandra/db/BlacklistedDirectories.java
@@ -48,7 +48,7 @@
         }
         catch (Exception e)
         {
-            logger.error("error registering MBean " + MBEAN_NAME, e);
+            logger.error("error registering MBean {}", MBEAN_NAME, e);
             //Allow the server to start even if the bean can't be registered
         }
     }
diff --git a/src/java/org/apache/cassandra/db/BufferCell.java b/src/java/org/apache/cassandra/db/BufferCell.java
new file mode 100644
index 0000000..a7d632d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/BufferCell.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class BufferCell extends AbstractCell
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(CellNames.simpleDense(ByteBuffer.allocate(1))));
+
+    protected final CellName name;
+    protected final ByteBuffer value;
+    protected final long timestamp;
+
+    BufferCell(CellName name)
+    {
+        this(name, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+    }
+
+    public BufferCell(CellName name, ByteBuffer value)
+    {
+        this(name, value, 0);
+    }
+
+    public BufferCell(CellName name, ByteBuffer value, long timestamp)
+    {
+        assert name != null;
+        assert value != null;
+
+        this.name = name;
+        this.value = value;
+        this.timestamp = timestamp;
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        return new BufferCell(newName, value, timestamp);
+    }
+
+    @Override
+    public Cell withUpdatedTimestamp(long newTimestamp)
+    {
+        return new BufferCell(name, value, newTimestamp);
+    }
+
+    @Override
+    public CellName name()
+    {
+        return name;
+    }
+
+    @Override
+    public ByteBuffer value()
+    {
+        return value;
+    }
+
+    @Override
+    public long timestamp()
+    {
+        return timestamp;
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE + name.unsharedHeapSizeExcludingData() + ObjectSizes.sizeOnHeapExcludingData(value);
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferCell(name.copy(metadata, allocator), allocator.clone(value), timestamp);
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/BufferCounterCell.java b/src/java/org/apache/cassandra/db/BufferCounterCell.java
new file mode 100644
index 0000000..bdd97a7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/BufferCounterCell.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class BufferCounterCell extends BufferCell implements CounterCell
+{
+    private final long timestampOfLastDelete;
+
+    public BufferCounterCell(CellName name, ByteBuffer value, long timestamp)
+    {
+        this(name, value, timestamp, Long.MIN_VALUE);
+    }
+
+    public BufferCounterCell(CellName name, ByteBuffer value, long timestamp, long timestampOfLastDelete)
+    {
+        super(name, value, timestamp);
+        this.timestampOfLastDelete = timestampOfLastDelete;
+    }
+
+    public static CounterCell create(CellName name, ByteBuffer value, long timestamp, long timestampOfLastDelete, ColumnSerializer.Flag flag)
+    {
+        if (flag == ColumnSerializer.Flag.FROM_REMOTE || (flag == ColumnSerializer.Flag.LOCAL && contextManager.shouldClearLocal(value)))
+            value = contextManager.clearAllLocal(value);
+        return new BufferCounterCell(name, value, timestamp, timestampOfLastDelete);
+    }
+
+    // For use by tests of compatibility with pre-2.1 counters only.
+    public static CounterCell createLocal(CellName name, long value, long timestamp, long timestampOfLastDelete)
+    {
+        return new BufferCounterCell(name, contextManager.createLocal(value), timestamp, timestampOfLastDelete);
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        return new BufferCounterCell(newName, value, timestamp, timestampOfLastDelete);
+    }
+
+    @Override
+    public long timestampOfLastDelete()
+    {
+        return timestampOfLastDelete;
+    }
+
+    @Override
+    public long total()
+    {
+        return contextManager.total(value);
+    }
+
+    @Override
+    public int cellDataSize()
+    {
+        // A counter column adds 8 bytes for timestampOfLastDelete to Cell.
+        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(timestampOfLastDelete);
+    }
+
+    @Override
+    public int serializedSize(CellNameType type, TypeSizes typeSizes)
+    {
+        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(timestampOfLastDelete);
+    }
+
+    @Override
+    public Cell diff(Cell cell)
+    {
+        return diffCounter(cell);
+    }
+
+    /*
+     * We have to special case digest creation for counter column because
+     * we don't want to include the information about which shard of the
+     * context is a delta or not, since this information differs from node to
+     * node.
+     */
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        digest.update(name().toByteBuffer().duplicate());
+        // We don't take the deltas into account in a digest
+        contextManager.updateDigest(digest, value());
+
+        FBUtilities.updateWithLong(digest, timestamp);
+        FBUtilities.updateWithByte(digest, serializationFlags());
+        FBUtilities.updateWithLong(digest, timestampOfLastDelete);
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        return reconcileCounter(cell);
+    }
+
+    @Override
+    public boolean hasLegacyShards()
+    {
+        return contextManager.hasLegacyShards(value);
+    }
+
+    @Override
+    public CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferCounterCell(name.copy(metadata, allocator), allocator.clone(value), timestamp, timestampOfLastDelete);
+    }
+
+    @Override
+    public CounterCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s:false:%s@%d!%d",
+                             comparator.getString(name()),
+                             contextManager.toString(value()),
+                             timestamp(),
+                             timestampOfLastDelete);
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.COUNTER_MASK;
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        validateName(metadata);
+        // We cannot use the value validator as for other columns, because CounterColumnType validates a long,
+        // which is not the internal representation of counters
+        contextManager.validateContext(value());
+    }
+
+    @Override
+    public Cell markLocalToBeCleared()
+    {
+        ByteBuffer marked = contextManager.markLocalToBeCleared(value());
+        return marked == value() ? this : new BufferCounterCell(name(), marked, timestamp(), timestampOfLastDelete);
+    }
+
+    @Override
+    public boolean equals(Cell cell)
+    {
+        return cell instanceof CounterCell && equals((CounterCell) cell);
+    }
+
+    public boolean equals(CounterCell cell)
+    {
+        return super.equals(cell) && timestampOfLastDelete == cell.timestampOfLastDelete();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java b/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java
new file mode 100644
index 0000000..f7df3ea
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class BufferCounterUpdateCell extends BufferCell implements CounterUpdateCell
+{
+    public BufferCounterUpdateCell(CellName name, long value, long timestamp)
+    {
+        this(name, ByteBufferUtil.bytes(value), timestamp);
+    }
+
+    public BufferCounterUpdateCell(CellName name, ByteBuffer value, long timestamp)
+    {
+        super(name, value, timestamp);
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        return new BufferCounterUpdateCell(newName, value, timestamp);
+    }
+
+    public long delta()
+    {
+        return value().getLong(value.position());
+    }
+
+    @Override
+    public Cell diff(Cell cell)
+    {
+        // Diff is used during reads, but we should never read those columns
+        throw new UnsupportedOperationException("This operation is unsupported on CounterUpdateCell.");
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+        if (cell instanceof DeletedCell)
+            return cell;
+
+        assert cell instanceof CounterUpdateCell : "Wrong class type.";
+
+        // The only time this could happen is if a batch ships two increments for the same cell. Hence we simply sum the deltas.
+        return new BufferCounterUpdateCell(name, delta() + ((CounterUpdateCell) cell).delta(), Math.max(timestamp, cell.timestamp()));
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.COUNTER_UPDATE_MASK;
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s:%s@%d", comparator.getString(name()), ByteBufferUtil.toLong(value), timestamp());
+    }
+}
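
A small hedged sketch of the increment-merging rule in reconcile() above: when two counter increments target the same cell (for example, shipped in a single batch), they collapse into the sum of their deltas with the newer timestamp; the tombstone-takes-precedence check happens before this point. The names and types below are illustrative only:

    // Illustrative only: mirrors the delta-summing branch of BufferCounterUpdateCell.reconcile()
    final class CounterUpdateReconcileSketch
    {
        static long[] reconcile(long deltaA, long tsA, long deltaB, long tsB)
        {
            // sum the deltas, keep the newer timestamp
            return new long[]{ deltaA + deltaB, Math.max(tsA, tsB) };
        }

        public static void main(String[] args)
        {
            long[] merged = reconcile(3, 10, 4, 12);
            System.out.println(merged[0] + " @ " + merged[1]); // prints: 7 @ 12
        }
    }
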
diff --git a/src/java/org/apache/cassandra/utils/Allocator.java b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
similarity index 64%
copy from src/java/org/apache/cassandra/utils/Allocator.java
copy to src/java/org/apache/cassandra/db/BufferDecoratedKey.java
index 7134353..8a1ad59 100644
--- a/src/java/org/apache/cassandra/utils/Allocator.java
+++ b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
@@ -15,27 +15,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.db;
 
 import java.nio.ByteBuffer;
 
-public abstract class Allocator
-{
-    /**
-     * Allocate a slice of the given length.
-     */
-    public ByteBuffer clone(ByteBuffer buffer)
-    {
-        assert buffer != null;
-        ByteBuffer cloned = allocate(buffer.remaining());
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.memory.MemoryUtil;
 
-        cloned.mark();
-        cloned.put(buffer.duplicate());
-        cloned.reset();
-        return cloned;
+public class BufferDecoratedKey extends DecoratedKey
+{
+    private final ByteBuffer key;
+
+    public BufferDecoratedKey(Token token, ByteBuffer key)
+    {
+        super(token);
+        assert key != null;
+        this.key = key;
     }
 
-    public abstract ByteBuffer allocate(int size);
-
-    public abstract long getMinimumSize();
+    public ByteBuffer getKey()
+    {
+        return key;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/BufferDeletedCell.java b/src/java/org/apache/cassandra/db/BufferDeletedCell.java
new file mode 100644
index 0000000..bcc170f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/BufferDeletedCell.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class BufferDeletedCell extends BufferCell implements DeletedCell
+{
+    public BufferDeletedCell(CellName name, int localDeletionTime, long timestamp)
+    {
+        this(name, ByteBufferUtil.bytes(localDeletionTime), timestamp);
+    }
+
+    public BufferDeletedCell(CellName name, ByteBuffer value, long timestamp)
+    {
+        super(name, value, timestamp);
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        return new BufferDeletedCell(newName, value, timestamp);
+    }
+
+    @Override
+    public Cell withUpdatedTimestamp(long newTimestamp)
+    {
+        return new BufferDeletedCell(name, value, newTimestamp);
+    }
+
+    @Override
+    public boolean isLive()
+    {
+        return false;
+    }
+
+    @Override
+    public boolean isLive(long now)
+    {
+        return false;
+    }
+
+    @Override
+    public int getLocalDeletionTime()
+    {
+        return value().getInt(value.position());
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        if (cell instanceof DeletedCell)
+            return super.reconcile(cell);
+        return cell.reconcile(this);
+    }
+
+    @Override
+    public DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferDeletedCell(name.copy(metadata, allocator), allocator.clone(value), timestamp);
+    }
+
+    @Override
+    public DeletedCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.DELETION_MASK;
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        validateName(metadata);
+        if (value().remaining() != 4)
+            throw new MarshalException("A tombstone value should be 4 bytes long");
+        if (getLocalDeletionTime() < 0)
+            throw new MarshalException("The local deletion time should not be negative");
+    }
+
+    public boolean equals(Cell cell)
+    {
+        return timestamp() == cell.timestamp() && getLocalDeletionTime() == cell.getLocalDeletionTime() && name().equals(cell.name());
+    }
+
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        digest.update(name().toByteBuffer().duplicate());
+
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithByte(digest, serializationFlags());
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/BufferExpiringCell.java b/src/java/org/apache/cassandra/db/BufferExpiringCell.java
new file mode 100644
index 0000000..347604a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/BufferExpiringCell.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class BufferExpiringCell extends BufferCell implements ExpiringCell
+{
+    private final int localExpirationTime;
+    private final int timeToLive;
+
+    public BufferExpiringCell(CellName name, ByteBuffer value, long timestamp, int timeToLive)
+    {
+        this(name, value, timestamp, timeToLive, (int) (System.currentTimeMillis() / 1000) + timeToLive);
+    }
+
+    public BufferExpiringCell(CellName name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime)
+    {
+        super(name, value, timestamp);
+        assert timeToLive > 0 : timeToLive;
+        assert localExpirationTime > 0 : localExpirationTime;
+        this.timeToLive = timeToLive;
+        this.localExpirationTime = localExpirationTime;
+    }
+
+    public int getTimeToLive()
+    {
+        return timeToLive;
+    }
+
+    @Override
+    public Cell withUpdatedName(CellName newName)
+    {
+        return new BufferExpiringCell(newName, value(), timestamp(), timeToLive, localExpirationTime);
+    }
+
+    @Override
+    public Cell withUpdatedTimestamp(long newTimestamp)
+    {
+        return new BufferExpiringCell(name(), value(), newTimestamp, timeToLive, localExpirationTime);
+    }
+
+    @Override
+    public int cellDataSize()
+    {
+        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(localExpirationTime) + TypeSizes.NATIVE.sizeof(timeToLive);
+    }
+
+    @Override
+    public int serializedSize(CellNameType type, TypeSizes typeSizes)
+    {
+        /*
+         * An expiring cell adds to a Cell:
+         *    4 bytes for the localExpirationTime
+         *  + 4 bytes for the timeToLive
+         */
+        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(localExpirationTime) + typeSizes.sizeof(timeToLive);
+    }
+
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        super.updateDigest(digest);
+        FBUtilities.updateWithInt(digest, timeToLive);
+    }
+
+    @Override
+    public int getLocalDeletionTime()
+    {
+        return localExpirationTime;
+    }
+
+    @Override
+    public ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferExpiringCell(name.copy(metadata, allocator), allocator.clone(value), timestamp, timeToLive, localExpirationTime);
+    }
+
+    @Override
+    public ExpiringCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s!%d", super.getString(comparator), timeToLive);
+    }
+
+    @Override
+    public boolean isLive()
+    {
+        return isLive(System.currentTimeMillis());
+    }
+
+    @Override
+    public boolean isLive(long now)
+    {
+        return (int) (now / 1000) < getLocalDeletionTime();
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.EXPIRATION_MASK;
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        super.validateFields(metadata);
+
+        if (timeToLive <= 0)
+            throw new MarshalException("A column TTL should be > 0");
+        if (localExpirationTime < 0)
+            throw new MarshalException("The local expiration time should not be negative");
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        long ts1 = timestamp(), ts2 = cell.timestamp();
+        if (ts1 != ts2)
+            return ts1 < ts2 ? cell : this;
+        // we should prefer tombstones
+        if (cell instanceof DeletedCell)
+            return cell;
+        int c = value().compareTo(cell.value());
+        if (c != 0)
+            return c < 0 ? cell : this;
+        // If we have the same timestamp and value, prefer the longest TTL
+        if (cell instanceof ExpiringCell)
+        {
+            int let1 = localExpirationTime, let2 = cell.getLocalDeletionTime();
+            if (let1 < let2)
+                return cell;
+        }
+        return this;
+    }
+
+    @Override
+    public boolean equals(Cell cell)
+    {
+        return cell instanceof ExpiringCell && equals((ExpiringCell) cell);
+    }
+
+    public boolean equals(ExpiringCell cell)
+    {
+        // super.equals() returns false if cell is not an ExpiringCell
+        return super.equals(cell)
+               && getLocalDeletionTime() == cell.getLocalDeletionTime()
+               && getTimeToLive() == cell.getTimeToLive();
+    }
+
+    /** @return Either a DeletedCell, or an ExpiringCell. */
+    public static Cell create(CellName name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime, int expireBefore, ColumnSerializer.Flag flag)
+    {
+        if (localExpirationTime >= expireBefore || flag == ColumnSerializer.Flag.PRESERVE_SIZE)
+            return new BufferExpiringCell(name, value, timestamp, timeToLive, localExpirationTime);
+        // The column is now expired, we can safely return a simple tombstone. Note that
+        // as long as the expiring column and the tombstone put together live longer than GC grace seconds,
+        // we'll fulfil our responsibility to repair.  See discussion at
+        // http://cassandra-user-incubator-apache-org.3065146.n2.nabble.com/repair-compaction-and-tombstone-rows-td7583481.html
+        return new BufferDeletedCell(name, localExpirationTime - timeToLive, timestamp);
+    }
+}
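
The create() factory above is where an already-expired TTL cell is turned into a tombstone at deserialization time. A standalone sketch of that decision, assuming the times are seconds since the epoch; the names below are hypothetical:

    // Hypothetical sketch of BufferExpiringCell.create()'s purge decision: an expiring
    // cell whose expiration is older than expireBefore is read back as a tombstone whose
    // localDeletionTime is the cell's original write time (localExpirationTime - timeToLive),
    // so the expiring cell plus its tombstone together still cover gc_grace_seconds.
    final class ExpiringToTombstoneSketch
    {
        static String onDeserialize(int localExpirationTime, int timeToLive, int expireBefore)
        {
            if (localExpirationTime >= expireBefore)
                return "keep ExpiringCell (ttl=" + timeToLive + ", expires at " + localExpirationTime + ")";
            return "DeletedCell with localDeletionTime=" + (localExpirationTime - timeToLive);
        }

        public static void main(String[] args)
        {
            int now = (int) (System.currentTimeMillis() / 1000);
            System.out.println(onDeserialize(now - 60, 300, now));  // expired a minute ago -> tombstone
            System.out.println(onDeserialize(now + 240, 300, now)); // still live -> expiring cell
        }
    }
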
diff --git a/src/java/org/apache/cassandra/db/CFRowAdder.java b/src/java/org/apache/cassandra/db/CFRowAdder.java
new file mode 100644
index 0000000..dfe49ee
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/CFRowAdder.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
+/**
+ * Convenience object to populate a given CQL3 row in a ColumnFamily object.
+ *
+ * This is meant for when performance is not of the utmost importance. When
+ * performance matters, it may not be worth allocating such a builder.
+ */
+public class CFRowAdder
+{
+    public final ColumnFamily cf;
+    public final Composite prefix;
+    public final long timestamp;
+    private final int ldt;
+
+    public CFRowAdder(ColumnFamily cf, Composite prefix, long timestamp)
+    {
+        this.cf = cf;
+        this.prefix = prefix;
+        this.timestamp = timestamp;
+        this.ldt = (int) (System.currentTimeMillis() / 1000);
+
+        // If a CQL3 table, add the row marker
+        if (cf.metadata().isCQL3Table() && !prefix.isStatic())
+            cf.addColumn(new BufferCell(cf.getComparator().rowMarker(prefix), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp));
+    }
+
+    public CFRowAdder add(String cql3ColumnName, Object value)
+    {
+        ColumnDefinition def = getDefinition(cql3ColumnName);
+        return add(cf.getComparator().create(prefix, def), def, value);
+    }
+
+    public CFRowAdder resetCollection(String cql3ColumnName)
+    {
+        ColumnDefinition def = getDefinition(cql3ColumnName);
+        assert def.type.isCollection();
+        Composite name = cf.getComparator().create(prefix, def);
+        cf.addAtom(new RangeTombstone(name.start(), name.end(), timestamp - 1, ldt));
+        return this;
+    }
+
+    public CFRowAdder addMapEntry(String cql3ColumnName, Object key, Object value)
+    {
+        ColumnDefinition def = getDefinition(cql3ColumnName);
+        assert def.type instanceof MapType;
+        MapType mt = (MapType)def.type;
+        CellName name = cf.getComparator().create(prefix, def, mt.keys.decompose(key));
+        return add(name, def, value);
+    }
+
+    public CFRowAdder addListEntry(String cql3ColumnName, Object value)
+    {
+        ColumnDefinition def = getDefinition(cql3ColumnName);
+        assert def.type instanceof ListType;
+        CellName name = cf.getComparator().create(prefix, def, ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes()));
+        return add(name, def, value);
+    }
+
+    private ColumnDefinition getDefinition(String name)
+    {
+        return cf.metadata().getColumnDefinition(new ColumnIdentifier(name, false));
+    }
+
+    private CFRowAdder add(CellName name, ColumnDefinition def, Object value)
+    {
+        if (value == null)
+        {
+            cf.addColumn(new BufferDeletedCell(name, ldt, timestamp));
+        }
+        else
+        {
+            AbstractType valueType = def.type.isCollection()
+                                   ? ((CollectionType) def.type).valueComparator()
+                                   : def.type;
+            cf.addColumn(new BufferCell(name, value instanceof ByteBuffer ? (ByteBuffer)value : valueType.decompose(value), timestamp));
+        }
+        return this;
+    }
+}
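
As a usage note (not part of the patch): the adder methods return the adder itself, so one row can be populated in a single chain. A hedged example, assuming a ColumnFamily and clustering prefix already built elsewhere in the codebase, and hypothetical column names and types:

    import org.apache.cassandra.db.CFRowAdder;
    import org.apache.cassandra.db.ColumnFamily;
    import org.apache.cassandra.db.composites.Composite;

    // Illustrative only: "name", "prefs", "events" and "old_tags" are hypothetical columns.
    final class CFRowAdderUsageSketch
    {
        static void fillRow(ColumnFamily cf, Composite prefix, long timestampMicros)
        {
            new CFRowAdder(cf, prefix, timestampMicros)
                .add("name", "alice")                  // regular column
                .addMapEntry("prefs", "theme", "dark") // one map entry
                .addListEntry("events", "login")       // one appended list element
                .resetCollection("old_tags");          // range-tombstone the whole collection
        }
    }
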
diff --git a/src/java/org/apache/cassandra/db/Cell.java b/src/java/org/apache/cassandra/db/Cell.java
new file mode 100644
index 0000000..7c3926a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Cell.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+/**
+ * Cell is immutable, which prevents all kinds of confusion in a multithreaded environment.
+ */
+public interface Cell extends OnDiskAtom
+{
+    public static final int MAX_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;
+
+    public Cell withUpdatedName(CellName newName);
+
+    public Cell withUpdatedTimestamp(long newTimestamp);
+
+    @Override
+    public CellName name();
+
+    public ByteBuffer value();
+
+    public boolean isLive();
+
+    public boolean isLive(long now);
+
+    public int cellDataSize();
+
+    // returns the size of the Cell and all references on the heap, excluding any costs associated with byte arrays
+    // that would be allocated by a localCopy, as these will be accounted for by the allocator
+    public long unsharedHeapSizeExcludingData();
+
+    public int serializedSize(CellNameType type, TypeSizes typeSizes);
+
+    public int serializationFlags();
+
+    public Cell diff(Cell cell);
+
+    public Cell reconcile(Cell cell);
+
+    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator);
+
+    public Cell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
+
+    public String getString(CellNameType comparator);
+}
diff --git a/src/java/org/apache/cassandra/db/ClockAndCount.java b/src/java/org/apache/cassandra/db/ClockAndCount.java
new file mode 100644
index 0000000..db6c705
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ClockAndCount.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public class ClockAndCount implements IMeasurableMemory
+{
+
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new ClockAndCount(0, 0));
+
+    public static ClockAndCount BLANK = ClockAndCount.create(0L, 0L);
+
+    public final long clock;
+    public final long count;
+
+    private ClockAndCount(long clock, long count)
+    {
+        this.clock = clock;
+        this.count = count;
+    }
+
+    public static ClockAndCount create(long clock, long count)
+    {
+        return new ClockAndCount(clock, count);
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof ClockAndCount))
+            return false;
+
+        ClockAndCount other = (ClockAndCount) o;
+        return clock == other.clock && count == other.count;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(clock, count);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("ClockAndCount(%s,%s)", clock, count);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java
index 4a08a26..1bc421d 100644
--- a/src/java/org/apache/cassandra/db/CollationController.java
+++ b/src/java/org/apache/cassandra/db/CollationController.java
@@ -17,21 +17,28 @@
  */
 package org.apache.cassandra.db;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.io.Closeable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeSet;
 
+import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.utils.memory.HeapAllocator;
 
 public class CollationController
 {
@@ -48,12 +55,12 @@
         this.gcBefore = gcBefore;
     }
 
-    public ColumnFamily getTopLevelColumns()
+    public ColumnFamily getTopLevelColumns(boolean copyOnHeap)
     {
         return filter.filter instanceof NamesQueryFilter
                && cfs.metadata.getDefaultValidator() != CounterColumnType.instance
-               ? collectTimeOrderedData()
-               : collectAllData();
+               ? collectTimeOrderedData(copyOnHeap)
+               : collectAllData(copyOnHeap);
     }
 
     /**
@@ -61,18 +68,13 @@
      * Once we have data for all requested columns that is newer than the newest remaining maxtimestamp,
      * we stop.
      */
-    private ColumnFamily collectTimeOrderedData()
+    private ColumnFamily collectTimeOrderedData(boolean copyOnHeap)
     {
         final ColumnFamily container = ArrayBackedSortedColumns.factory.create(cfs.metadata, filter.filter.isReversed());
-        List<OnDiskAtomIterator> iterators = new ArrayList<OnDiskAtomIterator>();
+        List<OnDiskAtomIterator> iterators = new ArrayList<>();
+        boolean isEmpty = true;
         Tracing.trace("Acquiring sstable references");
-        ColumnFamilyStore.ViewFragment view = cfs.markReferenced(filter.key);
-
-        // We use a temporary CF object per memtable or sstable source so we can accomodate this.factory being ABSC,
-        // which requires addAtom to happen in sorted order.  Then we use addAll to merge into the final collection,
-        // which allows a (sorted) set of columns to be merged even if they are not uniformly sorted after the existing
-        // ones.
-        ColumnFamily temp = ArrayBackedSortedColumns.factory.create(cfs.metadata, filter.filter.isReversed());
+        ColumnFamilyStore.ViewFragment view = cfs.select(cfs.viewFilter(filter.key));
 
         try
         {
@@ -80,28 +82,31 @@
             long mostRecentRowTombstone = Long.MIN_VALUE;
             for (Memtable memtable : view.memtables)
             {
-                OnDiskAtomIterator iter = filter.getMemtableColumnIterator(memtable);
-                if (iter != null)
+                ColumnFamily cf = memtable.getColumnFamily(filter.key);
+                if (cf != null)
                 {
-                    iterators.add(iter);
-                    temp.delete(iter.getColumnFamily());
+                    filter.delete(container.deletionInfo(), cf);
+                    isEmpty = false;
+                    Iterator<Cell> iter = filter.getIterator(cf);
                     while (iter.hasNext())
-                        temp.addAtom(iter.next());
+                    {
+                        Cell cell = iter.next();
+                        if (copyOnHeap)
+                            cell = cell.localCopy(cfs.metadata, HeapAllocator.instance);
+                        container.addColumn(cell);
+                    }
                 }
-
-                container.addAll(temp, HeapAllocator.instance);
                 mostRecentRowTombstone = container.deletionInfo().getTopLevelDeletion().markedForDeleteAt;
-                temp.clear();
             }
 
             // avoid changing the filter columns of the original filter
             // (reduceNameFilter removes columns that are known to be irrelevant)
             NamesQueryFilter namesFilter = (NamesQueryFilter) filter.filter;
-            TreeSet<ByteBuffer> filterColumns = new TreeSet<ByteBuffer>(namesFilter.columns);
+            TreeSet<CellName> filterColumns = new TreeSet<>(namesFilter.columns);
             QueryFilter reducedFilter = new QueryFilter(filter.key, filter.cfName, namesFilter.withUpdatedColumns(filterColumns), filter.timestamp);
 
             /* add the SSTables on disk */
-            Collections.sort(view.sstables, SSTable.maxTimestampComparator);
+            Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
 
             // read sorted sstables
             for (SSTableReader sstable : view.sstables)
@@ -120,23 +125,20 @@
                 Tracing.trace("Merging data from sstable {}", sstable.descriptor.generation);
                 OnDiskAtomIterator iter = reducedFilter.getSSTableColumnIterator(sstable);
                 iterators.add(iter);
+                isEmpty = false;
                 if (iter.getColumnFamily() != null)
                 {
-                    ColumnFamily cf = iter.getColumnFamily();
-                    temp.delete(cf);
+                    container.delete(iter.getColumnFamily());
                     sstablesIterated++;
                     while (iter.hasNext())
-                        temp.addAtom(iter.next());
+                        container.addAtom(iter.next());
                 }
-
-                container.addAll(temp, HeapAllocator.instance);
                 mostRecentRowTombstone = container.deletionInfo().getTopLevelDeletion().markedForDeleteAt;
-                temp.clear();
             }
 
             // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently)
             // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower)
-            if (iterators.isEmpty())
+            if (isEmpty)
                 return null;
 
             // do a final collate.  toCollate is boilerplate required to provide a CloseableIterator
@@ -150,9 +152,9 @@
                 && cfs.getCompactionStrategy() instanceof SizeTieredCompactionStrategy)
             {
                 Tracing.trace("Defragmenting requested data");
-                RowMutation rm = new RowMutation(cfs.keyspace.getName(), filter.key.key, returnCF.cloneMe());
+                Mutation mutation = new Mutation(cfs.keyspace.getName(), filter.key.getKey(), returnCF.cloneMe());
                 // skipping commitlog and index updates is fine since we're just de-fragmenting existing data
-                Keyspace.open(rm.getKeyspaceName()).apply(rm, false, false);
+                Keyspace.open(mutation.getKeyspaceName()).apply(mutation, false, false);
             }
 
             // Caller is responsible for final removeDeletedCF.  This is important for cacheRow to work correctly:
@@ -162,7 +164,6 @@
         {
             for (OnDiskAtomIterator iter : iterators)
                 FileUtils.closeQuietly(iter);
-            SSTableReader.releaseReferences(view.sstables);
         }
     }
 
@@ -174,11 +175,11 @@
         if (container == null)
             return;
 
-        for (Iterator<ByteBuffer> iterator = ((NamesQueryFilter) filter.filter).columns.iterator(); iterator.hasNext(); )
+        for (Iterator<CellName> iterator = ((NamesQueryFilter) filter.filter).columns.iterator(); iterator.hasNext(); )
         {
-            ByteBuffer filterColumn = iterator.next();
-            Column column = container.getColumn(filterColumn);
-            if (column != null && column.timestamp() > sstableTimestamp)
+            CellName filterColumn = iterator.next();
+            Cell cell = container.getColumn(filterColumn);
+            if (cell != null && cell.timestamp() > sstableTimestamp)
                 iterator.remove();
         }
     }
@@ -187,22 +188,33 @@
      * Collects data the brute-force way: gets an iterator for the filter in question
      * from every memtable and sstable, then merges them together.
      */
-    private ColumnFamily collectAllData()
+    private ColumnFamily collectAllData(boolean copyOnHeap)
     {
         Tracing.trace("Acquiring sstable references");
-        ColumnFamilyStore.ViewFragment view = cfs.markReferenced(filter.key);
-        List<OnDiskAtomIterator> iterators = new ArrayList<OnDiskAtomIterator>(Iterables.size(view.memtables) + view.sstables.size());
+        ColumnFamilyStore.ViewFragment view = cfs.select(cfs.viewFilter(filter.key));
+        List<Iterator<? extends OnDiskAtom>> iterators = new ArrayList<>(Iterables.size(view.memtables) + view.sstables.size());
         ColumnFamily returnCF = ArrayBackedSortedColumns.factory.create(cfs.metadata, filter.filter.isReversed());
-
+        DeletionInfo returnDeletionInfo = returnCF.deletionInfo();
         try
         {
             Tracing.trace("Merging memtable tombstones");
             for (Memtable memtable : view.memtables)
             {
-                OnDiskAtomIterator iter = filter.getMemtableColumnIterator(memtable);
-                if (iter != null)
+                final ColumnFamily cf = memtable.getColumnFamily(filter.key);
+                if (cf != null)
                 {
-                    returnCF.delete(iter.getColumnFamily());
+                    filter.delete(returnDeletionInfo, cf);
+                    Iterator<Cell> iter = filter.getIterator(cf);
+                    if (copyOnHeap)
+                    {
+                        iter = Iterators.transform(iter, new Function<Cell, Cell>()
+                        {
+                            public Cell apply(Cell cell)
+                            {
+                                return cell.localCopy(cf.metadata, HeapAllocator.instance);
+                            }
+                        });
+                    }
                     iterators.add(iter);
                 }
             }
@@ -216,10 +228,10 @@
              *   timestamp(tombstone) > maxTimestamp_s0
              * since we necessarily have
              *   timestamp(tombstone) <= maxTimestamp_s1
-             * In othere words, iterating in maxTimestamp order allow to do our mostRecentTombstone elimination
+             * In other words, iterating in maxTimestamp order allows us to do our mostRecentTombstone elimination
              * in one pass, and minimize the number of sstables for which we read a rowTombstone.
              */
-            Collections.sort(view.sstables, SSTable.maxTimestampComparator);
+            Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
             List<SSTableReader> skippedSSTables = null;
             long mostRecentRowTombstone = Long.MIN_VALUE;
             long minTimestamp = Long.MAX_VALUE;
@@ -240,7 +252,7 @@
                     if (sstable.getSSTableMetadata().maxLocalDeletionTime != Integer.MAX_VALUE)
                     {
                         if (skippedSSTables == null)
-                            skippedSSTables = new ArrayList<SSTableReader>();
+                            skippedSSTables = new ArrayList<>();
                         skippedSSTables.add(sstable);
                     }
                     continue;
@@ -301,9 +313,9 @@
         }
         finally
         {
-            for (OnDiskAtomIterator iter : iterators)
-                FileUtils.closeQuietly(iter);
-            SSTableReader.releaseReferences(view.sstables);
+            for (Object iter : iterators)
+                if (iter instanceof Closeable)
+                    FileUtils.closeQuietly((Closeable) iter);
         }
     }
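
One way to see the collectTimeOrderedData() / reduceNameFilter() interplay above: each time the container gains a value for a requested column that is newer than the max timestamp of the next sstable, that column can be dropped from the filter, and an sstable that cannot contribute anything newer is skipped entirely. A standalone sketch under those assumptions, with String column names and long timestamps; the names below are hypothetical:

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;

    // Standalone sketch of the reduceNameFilter() idea: drop a requested column from the
    // filter once the collected result for it is already newer than the next sstable's
    // max timestamp, so that sstable never needs to be consulted for it.
    final class NameFilterReductionSketch
    {
        static void reduce(Set<String> remainingColumns, Map<String, Long> collected, long sstableMaxTimestamp)
        {
            for (Iterator<String> it = remainingColumns.iterator(); it.hasNext(); )
            {
                Long ts = collected.get(it.next());
                if (ts != null && ts > sstableMaxTimestamp)
                    it.remove();
            }
        }

        public static void main(String[] args)
        {
            Set<String> remaining = new HashSet<>(Arrays.asList("a", "b"));
            Map<String, Long> collected = new HashMap<>();
            collected.put("a", 100L);           // already collected "a" at timestamp 100
            reduce(remaining, collected, 50L);  // next sstable only contains data up to timestamp 50
            System.out.println(remaining);      // [b] -> "a" no longer needs to be read from it
        }
    }
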
 
diff --git a/src/java/org/apache/cassandra/db/Column.java b/src/java/org/apache/cassandra/db/Column.java
deleted file mode 100644
index 72cbae1..0000000
--- a/src/java/org/apache/cassandra/db/Column.java
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOError;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.CFDefinition;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.Allocator;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.HeapAllocator;
-
-/**
- * Column is immutable, which prevents all kinds of confusion in a multithreaded environment.
- */
-public class Column implements OnDiskAtom
-{
-    public static final int MAX_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;
-
-    public static final ColumnSerializer serializer = new ColumnSerializer();
-
-    public static OnDiskAtom.Serializer onDiskSerializer()
-    {
-        return OnDiskAtom.Serializer.instance;
-    }
-
-    /**
-     * For 2.0-formatted sstables (where column count is not stored), @param count should be Integer.MAX_VALUE,
-     * and we will look for the end-of-row column name marker instead of relying on that.
-     */
-    public static Iterator<OnDiskAtom> onDiskIterator(final DataInput in, final int count, final ColumnSerializer.Flag flag, final int expireBefore, final Descriptor.Version version)
-    {
-        return new AbstractIterator<OnDiskAtom>()
-        {
-            int i = 0;
-
-            protected OnDiskAtom computeNext()
-            {
-                if (i++ >= count)
-                    return endOfData();
-
-                OnDiskAtom atom;
-                try
-                {
-                    atom = onDiskSerializer().deserializeFromSSTable(in, flag, expireBefore, version);
-                }
-                catch (IOException e)
-                {
-                    throw new IOError(e);
-                }
-                if (atom == null)
-                    return endOfData();
-
-                return atom;
-            }
-        };
-    }
-
-    protected final ByteBuffer name;
-    protected final ByteBuffer value;
-    protected final long timestamp;
-
-    Column(ByteBuffer name)
-    {
-        this(name, ByteBufferUtil.EMPTY_BYTE_BUFFER);
-    }
-
-    public Column(ByteBuffer name, ByteBuffer value)
-    {
-        this(name, value, 0);
-    }
-
-    public Column(ByteBuffer name, ByteBuffer value, long timestamp)
-    {
-        assert name != null;
-        assert value != null;
-        assert name.remaining() <= Column.MAX_NAME_LENGTH;
-        this.name = name;
-        this.value = value;
-        this.timestamp = timestamp;
-    }
-
-    public Column withUpdatedName(ByteBuffer newName)
-    {
-        return new Column(newName, value, timestamp);
-    }
-
-    public Column withUpdatedTimestamp(long newTimestamp)
-    {
-        return new Column(name, value, newTimestamp);
-    }
-
-    public ByteBuffer name()
-    {
-        return name;
-    }
-
-    public ByteBuffer value()
-    {
-        return value;
-    }
-
-    public long timestamp()
-    {
-        return timestamp;
-    }
-
-    public long minTimestamp()
-    {
-        return timestamp;
-    }
-
-    public long maxTimestamp()
-    {
-        return timestamp;
-    }
-
-    public boolean isMarkedForDelete(long now)
-    {
-        return false;
-    }
-
-    public boolean isLive(long now)
-    {
-        return !isMarkedForDelete(now);
-    }
-
-    // Don't call unless the column is actually marked for delete.
-    public long getMarkedForDeleteAt()
-    {
-        return Long.MAX_VALUE;
-    }
-
-    public int dataSize()
-    {
-        return name().remaining() + value.remaining() + TypeSizes.NATIVE.sizeof(timestamp);
-    }
-
-    public int serializedSize(TypeSizes typeSizes)
-    {
-        /*
-         * Size of a column is =
-         *   size of a name (short + length of the string)
-         * + 1 byte to indicate if the column has been deleted
-         * + 8 bytes for timestamp
-         * + 4 bytes which basically indicates the size of the byte array
-         * + entire byte array.
-        */
-        int nameSize = name.remaining();
-        int valueSize = value.remaining();
-        return typeSizes.sizeof((short) nameSize) + nameSize + 1 + typeSizes.sizeof(timestamp) + typeSizes.sizeof(valueSize) + valueSize;
-    }
-
-    public long serializedSizeForSSTable()
-    {
-        return serializedSize(TypeSizes.NATIVE);
-    }
-
-    public int serializationFlags()
-    {
-        return 0;
-    }
-
-    public Column diff(Column column)
-    {
-        if (timestamp() < column.timestamp())
-            return column;
-        return null;
-    }
-
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name.duplicate());
-        digest.update(value.duplicate());
-
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        try
-        {
-            buffer.writeLong(timestamp);
-            buffer.writeByte(serializationFlags());
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        digest.update(buffer.getData(), 0, buffer.getLength());
-    }
-
-    public int getLocalDeletionTime()
-    {
-        return Integer.MAX_VALUE;
-    }
-
-    public Column reconcile(Column column)
-    {
-        return reconcile(column, HeapAllocator.instance);
-    }
-
-    public Column reconcile(Column column, Allocator allocator)
-    {
-        // tombstones take precedence.  (if both are tombstones, then it doesn't matter which one we use.)
-        if (isMarkedForDelete(System.currentTimeMillis()))
-            return timestamp() < column.timestamp() ? column : this;
-        if (column.isMarkedForDelete(System.currentTimeMillis()))
-            return timestamp() > column.timestamp() ? this : column;
-        // break ties by comparing values.
-        if (timestamp() == column.timestamp())
-            return value().compareTo(column.value()) < 0 ? column : this;
-        // neither is tombstoned and timestamps are different
-        return timestamp() < column.timestamp() ? column : this;
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-        if (o == null || getClass() != o.getClass())
-            return false;
-
-        Column column = (Column)o;
-
-        if (timestamp != column.timestamp)
-            return false;
-        if (!name.equals(column.name))
-            return false;
-
-        return value.equals(column.value);
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int result = name != null ? name.hashCode() : 0;
-        result = 31 * result + (value != null ? value.hashCode() : 0);
-        result = 31 * result + (int)(timestamp ^ (timestamp >>> 32));
-        return result;
-    }
-
-    public Column localCopy(ColumnFamilyStore cfs)
-    {
-        return localCopy(cfs, HeapAllocator.instance);
-    }
-
-    public Column localCopy(ColumnFamilyStore cfs, Allocator allocator)
-    {
-        return new Column(cfs.internOrCopy(name, allocator), allocator.clone(value), timestamp);
-    }
-
-    public String getString(AbstractType<?> comparator)
-    {
-        StringBuilder sb = new StringBuilder();
-        sb.append(comparator.getString(name));
-        sb.append(":");
-        sb.append(isMarkedForDelete(System.currentTimeMillis()));
-        sb.append(":");
-        sb.append(value.remaining());
-        sb.append("@");
-        sb.append(timestamp());
-        return sb.toString();
-    }
-
-    protected void validateName(CFMetaData metadata) throws MarshalException
-    {
-        metadata.comparator.validate(name());
-    }
-
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        AbstractType<?> valueValidator = metadata.getValueValidatorFromColumnName(name);
-        if (valueValidator != null)
-            valueValidator.validate(value());
-    }
-
-    public boolean hasIrrelevantData(int gcBefore)
-    {
-        return getLocalDeletionTime() < gcBefore;
-    }
-
-    public static Column create(ByteBuffer name, ByteBuffer value, long timestamp, int ttl, CFMetaData metadata)
-    {
-        if (ttl <= 0)
-            ttl = metadata.getDefaultTimeToLive();
-
-        return ttl > 0
-               ? new ExpiringColumn(name, value, timestamp, ttl)
-               : new Column(name, value, timestamp);
-    }
-
-    public static Column create(String value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), UTF8Type.instance.decompose(value), timestamp);
-    }
-
-    public static Column create(int value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), Int32Type.instance.decompose(value), timestamp);
-    }
-
-    public static Column create(boolean value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), BooleanType.instance.decompose(value), timestamp);
-    }
-
-    public static Column create(double value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), DoubleType.instance.decompose(value), timestamp);
-    }
-
-    public static Column create(ByteBuffer value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), value, timestamp);
-    }
-
-    public static Column create(InetAddress value, long timestamp, String... names)
-    {
-        return new Column(decomposeName(names), InetAddressType.instance.decompose(value), timestamp);
-    }
-
-    static ByteBuffer decomposeName(String... names)
-    {
-        assert names.length > 0;
-
-        if (names.length == 1)
-            return UTF8Type.instance.decompose(names[0]);
-
-        // not super performant.  at this time, only infrequently called schema code uses this.
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(names.length);
-        for (int i = 0; i < names.length; i++)
-            types.add(UTF8Type.instance);
-
-        CompositeType.Builder builder = new CompositeType.Builder(CompositeType.getInstance(types));
-        for (String name : names)
-            builder.add(UTF8Type.instance.decompose(name));
-        return builder.build();
-    }
-}
-
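
The reconcile() rules removed above — tombstones take precedence, then the higher timestamp, and a timestamp tie between live cells is broken on the value bytes — are easiest to see in isolation. A minimal sketch of those rules follows; SketchCell is a hypothetical stand-in, not the Cassandra Cell type this merge introduces.

import java.nio.ByteBuffer;

// Minimal stand-in used only to illustrate the reconcile() rules of the
// removed Column class; this is not the Cassandra Cell type.
final class SketchCell
{
    final ByteBuffer value;
    final long timestamp;
    final boolean tombstone;

    SketchCell(ByteBuffer value, long timestamp, boolean tombstone)
    {
        this.value = value;
        this.timestamp = timestamp;
        this.tombstone = tombstone;
    }

    SketchCell reconcile(SketchCell that)
    {
        // tombstones take precedence; on a timestamp tie the tombstone wins
        if (this.tombstone)
            return this.timestamp < that.timestamp ? that : this;
        if (that.tombstone)
            return this.timestamp > that.timestamp ? this : that;
        // live cells with equal timestamps: break the tie on the value bytes
        if (this.timestamp == that.timestamp)
            return this.value.compareTo(that.value) < 0 ? that : this;
        // otherwise the newer write wins
        return this.timestamp < that.timestamp ? that : this;
    }

    public static void main(String[] args)
    {
        SketchCell live = new SketchCell(ByteBuffer.wrap(new byte[]{ 1 }), 10, false);
        SketchCell delete = new SketchCell(ByteBuffer.allocate(0), 10, true);
        // with equal timestamps the tombstone is kept, mirroring the comment above
        System.out.println(live.reconcile(delete) == delete); // prints true
    }
}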
diff --git a/src/java/org/apache/cassandra/db/ColumnFamily.java b/src/java/org/apache/cassandra/db/ColumnFamily.java
index 7edf825..38b39fe 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamily.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamily.java
@@ -28,16 +28,17 @@
 import java.util.Map;
 import java.util.UUID;
 
-import com.google.common.base.Function;
-import com.google.common.base.Functions;
 import com.google.common.collect.ImmutableMap;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 
 import org.apache.cassandra.cache.IRowCacheEntry;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.filter.ColumnCounter;
 import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.sstable.ColumnNameHelper;
 import org.apache.cassandra.io.sstable.ColumnStats;
 import org.apache.cassandra.io.sstable.SSTable;
@@ -52,7 +53,7 @@
  * Whether the implementation is thread safe or not is left to the
  * implementing classes.
  */
-public abstract class ColumnFamily implements Iterable<Column>, IRowCacheEntry
+public abstract class ColumnFamily implements Iterable<Cell>, IRowCacheEntry
 {
     /* The column serializer for this Column Family. Create based on config. */
     public static final ColumnFamilySerializer serializer = new ColumnFamilySerializer();
@@ -82,6 +83,14 @@
         return metadata.cfType;
     }
 
+    public int liveCQL3RowCount(long now)
+    {
+        ColumnCounter counter = getComparator().isDense()
+                              ? new ColumnCounter(now)
+                              : new ColumnCounter.GroupByPrefix(now, getComparator(), metadata.clusteringColumns().size());
+        return counter.countAll(this).live();
+    }
+
     /**
      * Clones the column map.
      */
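
The new liveCQL3RowCount() above delegates the counting to ColumnCounter; in the non-dense case cells are grouped by their clustering prefix, so every distinct prefix with at least one live cell counts as one CQL row. A rough sketch of that grouping idea, using plain List<String> names in place of Cassandra's CellName:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Illustrative only: cells whose names share the same clustering prefix belong
// to the same CQL row, so the live row count is the number of distinct
// prefixes that have at least one live cell.
final class GroupByPrefixSketch
{
    static int liveRows(List<List<String>> cellNames, List<Boolean> live, int clusteringSize)
    {
        Set<List<String>> livePrefixes = new HashSet<>();
        for (int i = 0; i < cellNames.size(); i++)
        {
            if (!live.get(i))
                continue; // dead cells never contribute a row
            livePrefixes.add(cellNames.get(i).subList(0, clusteringSize));
        }
        return livePrefixes.size();
    }

    public static void main(String[] args)
    {
        // two cells of row "a" and one cell of row "b", all live -> 2 rows
        List<List<String>> names = Arrays.asList(
            Arrays.asList("a", "col1"),
            Arrays.asList("a", "col2"),
            Arrays.asList("b", "col1"));
        System.out.println(liveRows(names, Arrays.asList(true, true, true), 1)); // prints 2
    }
}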
@@ -100,54 +109,38 @@
         return metadata;
     }
 
-    public void addIfRelevant(Column column, DeletionInfo.InOrderTester tester, int gcBefore)
-    {
-        // the column itself must be not gc-able (it is live, or a still relevant tombstone), (1)
-        // and if its container is deleted, the column must be changed more recently than the container tombstone (2)
-        if ((column.getLocalDeletionTime() >= gcBefore) // (1)
-            && (!tester.isDeleted(column.name(), column.timestamp())))                                // (2)
-        {
-            addColumn(column);
-        }
-    }
-
-    public void addColumn(Column column)
-    {
-        addColumn(column, HeapAllocator.instance);
-    }
-
-    public void addColumn(ByteBuffer name, ByteBuffer value, long timestamp)
+    public void addColumn(CellName name, ByteBuffer value, long timestamp)
     {
         addColumn(name, value, timestamp, 0);
     }
 
-    public void addColumn(ByteBuffer name, ByteBuffer value, long timestamp, int timeToLive)
+    public void addColumn(CellName name, ByteBuffer value, long timestamp, int timeToLive)
     {
-        assert !metadata().getDefaultValidator().isCommutative();
-        Column column = Column.create(name, value, timestamp, timeToLive, metadata());
-        addColumn(column);
+        assert !metadata().isCounter();
+        Cell cell = AbstractCell.create(name, value, timestamp, timeToLive, metadata());
+        addColumn(cell);
     }
 
-    public void addCounter(ByteBuffer name, long value)
+    public void addCounter(CellName name, long value)
     {
-        addColumn(new CounterUpdateColumn(name, value, System.currentTimeMillis()));
+        addColumn(new BufferCounterUpdateCell(name, value, FBUtilities.timestampMicros()));
     }
 
-    public void addTombstone(ByteBuffer name, ByteBuffer localDeletionTime, long timestamp)
+    public void addTombstone(CellName name, ByteBuffer localDeletionTime, long timestamp)
     {
-        addColumn(new DeletedColumn(name, localDeletionTime, timestamp));
+        addColumn(new BufferDeletedCell(name, localDeletionTime, timestamp));
     }
 
-    public void addTombstone(ByteBuffer name, int localDeletionTime, long timestamp)
+    public void addTombstone(CellName name, int localDeletionTime, long timestamp)
     {
-        addColumn(new DeletedColumn(name, localDeletionTime, timestamp));
+        addColumn(new BufferDeletedCell(name, localDeletionTime, timestamp));
     }
 
     public void addAtom(OnDiskAtom atom)
     {
-        if (atom instanceof Column)
+        if (atom instanceof Cell)
         {
-            addColumn((Column)atom);
+            addColumn((Cell)atom);
         }
         else
         {
@@ -192,55 +185,54 @@
     public abstract void purgeTombstones(int gcBefore);
 
     /**
-     * Adds a column to this column map.
-     * If a column with the same name is already present in the map, it will
-     * be replaced by the newly added column.
+     * Adds a cell to this cell map.
+     * If a cell with the same name is already present in the map, it will
+     * be replaced by the newly added cell.
      */
-    public abstract void addColumn(Column column, Allocator allocator);
+    public abstract void addColumn(Cell cell);
+
+    /**
+     * Adds a cell if it's non-gc-able and isn't shadowed by a partition/range tombstone with a higher timestamp.
+     * Requires that the cell to add is sorted strictly after the last cell in the container.
+     */
+    public abstract void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore);
 
     /**
      * Adds all the columns of a given column map to this column map.
      * This is equivalent to:
      *   <code>
-     *   for (Column c : cm)
+     *   for (Cell c : cm)
      *      addColumn(c, ...);
      *   </code>
      *  but is potentially faster.
      */
-    public abstract void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation);
-
-    /**
-     * Replace oldColumn if present by newColumn.
-     * Returns true if oldColumn was present and thus replaced.
-     * oldColumn and newColumn should have the same name.
-     */
-    public abstract boolean replace(Column oldColumn, Column newColumn);
+    public abstract void addAll(ColumnFamily cm);
 
     /**
      * Get a column given its name, returning null if the column is not
      * present.
      */
-    public abstract Column getColumn(ByteBuffer name);
+    public abstract Cell getColumn(CellName name);
 
     /**
      * Returns an iterable with the names of columns in this column map in the same order
      * as the underlying columns themselves.
      */
-    public abstract Iterable<ByteBuffer> getColumnNames();
+    public abstract Iterable<CellName> getColumnNames();
 
     /**
      * Returns the columns of this column map as a collection.
      * The columns in the returned collection should be sorted as the columns
      * in this map.
      */
-    public abstract Collection<Column> getSortedColumns();
+    public abstract Collection<Cell> getSortedColumns();
 
     /**
      * Returns the columns of this column map as a collection.
      * The columns in the returned collection should be sorted in reverse
      * order of the columns in this map.
      */
-    public abstract Collection<Column> getReverseSortedColumns();
+    public abstract Collection<Cell> getReverseSortedColumns();
 
     /**
      * Returns the number of columns in this map.
@@ -248,24 +240,29 @@
     public abstract int getColumnCount();
 
     /**
+     * Returns whether or not there are any columns present.
+     */
+    public abstract boolean hasColumns();
+
+    /**
      * Returns true if this contains no columns or deletion info
      */
     public boolean isEmpty()
     {
-        return deletionInfo().isLive() && getColumnCount() == 0;
+        return deletionInfo().isLive() && !hasColumns();
     }
 
     /**
      * Returns an iterator over the columns of this map that returns only the matching @param slices.
      * The provided slices must be in order and must be non-overlapping.
      */
-    public abstract Iterator<Column> iterator(ColumnSlice[] slices);
+    public abstract Iterator<Cell> iterator(ColumnSlice[] slices);
 
     /**
      * Returns a reversed iterator over the columns of this map that returns only the matching @param slices.
      * The provided slices must be in reversed order and must be non-overlapping.
      */
-    public abstract Iterator<Column> reverseIterator(ColumnSlice[] slices);
+    public abstract Iterator<Cell> reverseIterator(ColumnSlice[] slices);
 
     /**
      * Returns if this map only support inserts in reverse order.
@@ -280,11 +277,6 @@
         delete(columns.deletionInfo());
     }
 
-    public void addAll(ColumnFamily cf, Allocator allocator)
-    {
-        addAll(cf, allocator, Functions.<Column>identity());
-    }
-
     /*
      * This function will calculate the difference between 2 column families.
      * The external input is assumed to be a superset of internal.
@@ -292,48 +284,51 @@
     public ColumnFamily diff(ColumnFamily cfComposite)
     {
         assert cfComposite.id().equals(id());
-        ColumnFamily cfDiff = TreeMapBackedSortedColumns.factory.create(metadata);
+        ColumnFamily cfDiff = ArrayBackedSortedColumns.factory.create(metadata);
         cfDiff.delete(cfComposite.deletionInfo());
 
         // (don't need to worry about cfNew containing Columns that are shadowed by
         // the delete tombstone, since cfNew was generated by CF.resolve, which
         // takes care of those for us.)
-        for (Column columnExternal : cfComposite)
+        for (Cell cellExternal : cfComposite)
         {
-            ByteBuffer cName = columnExternal.name();
-            Column columnInternal = getColumn(cName);
-            if (columnInternal == null)
+            CellName cName = cellExternal.name();
+            Cell cellInternal = getColumn(cName);
+            if (cellInternal == null)
             {
-                cfDiff.addColumn(columnExternal);
+                cfDiff.addColumn(cellExternal);
             }
             else
             {
-                Column columnDiff = columnInternal.diff(columnExternal);
-                if (columnDiff != null)
+                Cell cellDiff = cellInternal.diff(cellExternal);
+                if (cellDiff != null)
                 {
-                    cfDiff.addColumn(columnDiff);
+                    cfDiff.addColumn(cellDiff);
                 }
             }
         }
 
+        cfDiff.setDeletionInfo(deletionInfo().diff(cfComposite.deletionInfo()));
+
         if (!cfDiff.isEmpty())
             return cfDiff;
+        
         return null;
     }
 
     public long dataSize()
     {
         long size = 0;
-        for (Column column : this)
-            size += column.dataSize();
+        for (Cell cell : this)
+            size += cell.cellDataSize();
         return size;
     }
 
     public long maxTimestamp()
     {
         long maxTimestamp = deletionInfo().maxTimestamp();
-        for (Column column : this)
-            maxTimestamp = Math.max(maxTimestamp, column.maxTimestamp());
+        for (Cell cell : this)
+            maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
         return maxTimestamp;
     }
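
The diff() logic in the hunk above keeps, for every cell in the external superset, either a cell we are missing locally or one whose local counterpart is older; the per-cell decision reduces to a timestamp comparison. A standalone sketch of that selection with plain maps (the Versioned holder and String keys are illustrative):

import java.util.Map;
import java.util.TreeMap;

// Sketch of the per-cell selection behind diff(): for every entry in the
// external superset, keep it if it is missing locally or strictly newer
// than our copy.
final class DiffSketch
{
    static final class Versioned
    {
        final String value;
        final long timestamp;

        Versioned(String value, long timestamp)
        {
            this.value = value;
            this.timestamp = timestamp;
        }
    }

    static Map<String, Versioned> diff(Map<String, Versioned> internal, Map<String, Versioned> external)
    {
        Map<String, Versioned> result = new TreeMap<>();
        for (Map.Entry<String, Versioned> e : external.entrySet())
        {
            Versioned mine = internal.get(e.getKey());
            if (mine == null || mine.timestamp < e.getValue().timestamp)
                result.put(e.getKey(), e.getValue());
        }
        return result; // an empty result plays the role of diff() returning null
    }
}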
 
@@ -343,8 +338,8 @@
         HashCodeBuilder builder = new HashCodeBuilder(373, 75437)
                 .append(metadata)
                 .append(deletionInfo());
-        for (Column column : this)
-            builder.append(column);
+        for (Cell cell : this)
+            builder.append(cell);
         return builder.toHashCode();
     }
 
@@ -367,12 +362,12 @@
     public String toString()
     {
         StringBuilder sb = new StringBuilder("ColumnFamily(");
-        sb.append(metadata == null ? "<anonymous>" : metadata.cfName);
+        sb.append(metadata.cfName);
 
         if (isMarkedForDelete())
             sb.append(" -").append(deletionInfo()).append("-");
 
-        sb.append(" [").append(getComparator().getColumnsString(this)).append("])");
+        sb.append(" [").append(CellNames.getColumnsString(getComparator(), this)).append("])");
         return sb.toString();
     }
 
@@ -386,8 +381,10 @@
 
     public void updateDigest(MessageDigest digest)
     {
-        for (Column column : this)
-            column.updateDigest(digest);
+        for (Cell cell : this)
+            cell.updateDigest(digest);
+        if (MessagingService.instance().areAllNodesAtLeast21())
+            deletionInfo().updateDigest(digest);
     }
 
     public static ColumnFamily diff(ColumnFamily cf1, ColumnFamily cf2)
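
updateDigest() above folds every cell (and, once all nodes are on 2.1, the deletion info) into a single running MessageDigest so replicas can compare contents cheaply. A minimal sketch of the per-cell folding, assuming an MD5 digest and a plain name-to-value map purely for illustration:

import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.Map;

// Illustrative per-cell digest folding: every cell contributes its name and
// value bytes to one running digest.
final class DigestSketch
{
    static byte[] digest(Map<ByteBuffer, ByteBuffer> cells) throws Exception
    {
        MessageDigest md = MessageDigest.getInstance("MD5");
        for (Map.Entry<ByteBuffer, ByteBuffer> e : cells.entrySet())
        {
            md.update(e.getKey().duplicate());   // duplicate() so the buffer position is untouched
            md.update(e.getValue().duplicate());
        }
        return md.digest();
    }
}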
@@ -397,19 +394,6 @@
         return cf1.diff(cf2);
     }
 
-    public void resolve(ColumnFamily cf)
-    {
-        resolve(cf, HeapAllocator.instance);
-    }
-
-    public void resolve(ColumnFamily cf, Allocator allocator)
-    {
-        // Row _does_ allow null CF objects :(  seems a necessary evil for efficiency
-        if (cf == null)
-            return;
-        addAll(cf, allocator);
-    }
-
     public ColumnStats getColumnStats()
     {
         // note that we default to MIN_VALUE/MAX_VALUE here to be able to override them later in this method
@@ -421,6 +405,7 @@
         ColumnStats.MaxTracker<Integer> maxDeletionTimeTracker = new ColumnStats.MaxTracker<>(Integer.MAX_VALUE);
         List<ByteBuffer> minColumnNamesSeen = Collections.emptyList();
         List<ByteBuffer> maxColumnNamesSeen = Collections.emptyList();
+        boolean hasLegacyCounterShards = false;
 
         if (deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
         {
@@ -434,25 +419,35 @@
         {
             RangeTombstone rangeTombstone = it.next();
             tombstones.update(rangeTombstone.getLocalDeletionTime());
-            minTimestampTracker.update(rangeTombstone.minTimestamp());
-            maxTimestampTracker.update(rangeTombstone.maxTimestamp());
+            minTimestampTracker.update(rangeTombstone.timestamp());
+            maxTimestampTracker.update(rangeTombstone.timestamp());
             maxDeletionTimeTracker.update(rangeTombstone.getLocalDeletionTime());
             minColumnNamesSeen = ColumnNameHelper.minComponents(minColumnNamesSeen, rangeTombstone.min, metadata.comparator);
             maxColumnNamesSeen = ColumnNameHelper.maxComponents(maxColumnNamesSeen, rangeTombstone.max, metadata.comparator);
         }
 
-        for (Column column : this)
+        for (Cell cell : this)
         {
-            minTimestampTracker.update(column.minTimestamp());
-            maxTimestampTracker.update(column.maxTimestamp());
-            maxDeletionTimeTracker.update(column.getLocalDeletionTime());
-            int deletionTime = column.getLocalDeletionTime();
+            minTimestampTracker.update(cell.timestamp());
+            maxTimestampTracker.update(cell.timestamp());
+            maxDeletionTimeTracker.update(cell.getLocalDeletionTime());
+
+            int deletionTime = cell.getLocalDeletionTime();
             if (deletionTime < Integer.MAX_VALUE)
                 tombstones.update(deletionTime);
-            minColumnNamesSeen = ColumnNameHelper.minComponents(minColumnNamesSeen, column.name, metadata.comparator);
-            maxColumnNamesSeen = ColumnNameHelper.maxComponents(maxColumnNamesSeen, column.name, metadata.comparator);
+            minColumnNamesSeen = ColumnNameHelper.minComponents(minColumnNamesSeen, cell.name(), metadata.comparator);
+            maxColumnNamesSeen = ColumnNameHelper.maxComponents(maxColumnNamesSeen, cell.name(), metadata.comparator);
+            if (cell instanceof CounterCell)
+                hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) cell).hasLegacyShards();
         }
-        return new ColumnStats(getColumnCount(), minTimestampTracker.get(), maxTimestampTracker.get(), maxDeletionTimeTracker.get(), tombstones, minColumnNamesSeen, maxColumnNamesSeen);
+        return new ColumnStats(getColumnCount(),
+                               minTimestampTracker.get(),
+                               maxTimestampTracker.get(),
+                               maxDeletionTimeTracker.get(),
+                               tombstones,
+                               minColumnNamesSeen,
+                               maxColumnNamesSeen,
+                               hasLegacyCounterShards);
     }
 
     public boolean isMarkedForDelete()
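
getColumnStats() above relies on small min/max trackers that start from sentinel defaults and only report observed values once update() has been called. A self-contained sketch of that tracking pattern (MinMaxTracker is illustrative, not the ColumnStats helper):

// Sketch of the sentinel-default tracking pattern used for timestamps above:
// the tracker reports its configured defaults until update() has observed at
// least one real value.
final class MinMaxTracker
{
    private final long defaultMin;
    private final long defaultMax;
    private long min = Long.MAX_VALUE;
    private long max = Long.MIN_VALUE;
    private boolean seen;

    MinMaxTracker(long defaultMin, long defaultMax)
    {
        this.defaultMin = defaultMin;
        this.defaultMax = defaultMax;
    }

    void update(long timestamp)
    {
        min = Math.min(min, timestamp);
        max = Math.max(max, timestamp);
        seen = true;
    }

    long min() { return seen ? min : defaultMin; }
    long max() { return seen ? max : defaultMax; }
}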
@@ -463,53 +458,37 @@
     /**
      * @return the comparator whose sorting order the contained columns conform to
      */
-    public AbstractType<?> getComparator()
+    public CellNameType getComparator()
     {
         return metadata.comparator;
     }
 
     public boolean hasOnlyTombstones(long now)
     {
-        for (Column column : this)
-            if (column.isLive(now))
+        for (Cell cell : this)
+            if (cell.isLive(now))
                 return false;
         return true;
     }
 
-    public Iterator<Column> iterator()
+    public Iterator<Cell> iterator()
     {
         return getSortedColumns().iterator();
     }
 
-    public Iterator<Column> reverseIterator()
+    public Iterator<Cell> reverseIterator()
     {
         return getReverseSortedColumns().iterator();
     }
 
-    public boolean hasIrrelevantData(int gcBefore)
+    public Map<CellName, ByteBuffer> asMap()
     {
-        // Do we have gcable deletion infos?
-        if (deletionInfo().hasPurgeableTombstones(gcBefore))
-            return true;
-
-        // Do we have colums that are either deleted by the container or gcable tombstone?
-        DeletionInfo.InOrderTester tester = inOrderDeletionTester();
-        for (Column column : this)
-            if (tester.isDeleted(column) || column.hasIrrelevantData(gcBefore))
-                return true;
-
-        return false;
-    }
-
-    public Map<ByteBuffer, ByteBuffer> asMap()
-    {
-        ImmutableMap.Builder<ByteBuffer, ByteBuffer> builder = ImmutableMap.builder();
-        for (Column column : this)
-            builder.put(column.name, column.value);
+        ImmutableMap.Builder<CellName, ByteBuffer> builder = ImmutableMap.builder();
+        for (Cell cell : this)
+            builder.put(cell.name(), cell.value());
         return builder.build();
     }
 
-    // Note: the returned ColumnFamily will be an UnsortedColumns.
     public static ColumnFamily fromBytes(ByteBuffer bytes)
     {
         if (bytes == null)
@@ -517,7 +496,10 @@
 
         try
         {
-            return serializer.deserialize(new DataInputStream(ByteBufferUtil.inputStream(bytes)), UnsortedColumns.factory, ColumnSerializer.Flag.LOCAL, MessagingService.current_version);
+            return serializer.deserialize(new DataInputStream(ByteBufferUtil.inputStream(bytes)),
+                                                              ArrayBackedSortedColumns.factory,
+                                                              ColumnSerializer.Flag.LOCAL,
+                                                              MessagingService.current_version);
         }
         catch (IOException e)
         {
@@ -542,7 +524,12 @@
          * allow optimizing for both forward and reversed slices. This does not matter for ThreadSafeSortedColumns.
         * Note that this is only a hint on how we expect to do insertion; it does not change the map sorting.
          */
-        public abstract T create(CFMetaData metadata, boolean insertReversed);
+        public abstract T create(CFMetaData metadata, boolean insertReversed, int initialCapacity);
+
+        public T create(CFMetaData metadata, boolean insertReversed)
+        {
+            return create(metadata, insertReversed, 0);
+        }
 
         public T create(CFMetaData metadata)
         {
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java b/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
index 3d832b2..f139369 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.io.ISSTableSerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
 
@@ -48,7 +49,7 @@
      * <column count>
      * <columns, serialized individually>
     */
-    public void serialize(ColumnFamily cf, DataOutput out, int version)
+    public void serialize(ColumnFamily cf, DataOutputPlus out, int version)
     {
         try
         {
@@ -60,21 +61,14 @@
 
             out.writeBoolean(true);
             serializeCfId(cf.id(), out, version);
-
-            if (cf.metadata().isSuper() && version < MessagingService.VERSION_20)
-            {
-                SuperColumns.serializeSuperColumnFamily(cf, out, version);
-                return;
-            }
-
-            DeletionInfo.serializer().serialize(cf.deletionInfo(), out, version);
-            ColumnSerializer columnSerializer = Column.serializer;
+            cf.getComparator().deletionInfoSerializer().serialize(cf.deletionInfo(), out, version);
+            ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
             int count = cf.getColumnCount();
             out.writeInt(count);
             int written = 0;
-            for (Column column : cf)
+            for (Cell cell : cf)
             {
-                columnSerializer.serialize(column, out);
+                columnSerializer.serialize(cell, out);
                 written++;
             }
             assert count == written: "Column family had " + count + " columns, but " + written + " written";
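
serialize() above writes the cell count before the cells themselves and then asserts that the advertised count matches what was actually written. The same count-then-entries framing is easy to show in miniature (writeEntries() and the String payloads are illustrative, not the Cassandra serializer API):

import java.io.DataOutput;
import java.io.IOException;
import java.util.List;

// Illustrative count-then-entries framing: the count is written first, each
// entry follows, and a final check verifies the advertised count was honored.
final class CountedFramingSketch
{
    static void writeEntries(DataOutput out, List<String> entries) throws IOException
    {
        int count = entries.size();
        out.writeInt(count);
        int written = 0;
        for (String entry : entries)
        {
            out.writeUTF(entry);
            written++;
        }
        assert count == written : "advertised " + count + " entries, but wrote " + written;
    }
}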
@@ -108,9 +102,9 @@
         }
         else
         {
-            cf.delete(DeletionInfo.serializer().deserialize(in, version, cf.getComparator()));
+            cf.delete(cf.getComparator().deletionInfoSerializer().deserialize(in, version));
 
-            ColumnSerializer columnSerializer = Column.serializer;
+            ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
             int size = in.readInt();
             for (int i = 0; i < size; ++i)
                 cf.addColumn(columnSerializer.deserialize(in, flag));
@@ -120,19 +114,11 @@
 
     public long contentSerializedSize(ColumnFamily cf, TypeSizes typeSizes, int version)
     {
-        long size = 0L;
-
-        if (cf.metadata().isSuper() && version < MessagingService.VERSION_20)
-        {
-            size += SuperColumns.serializedSize(cf, typeSizes, version);
-        }
-        else
-        {
-            size += DeletionInfo.serializer().serializedSize(cf.deletionInfo(), typeSizes, version);
-            size += typeSizes.sizeof(cf.getColumnCount());
-            for (Column column : cf)
-                size += column.serializedSize(typeSizes);
-        }
+        long size = cf.getComparator().deletionInfoSerializer().serializedSize(cf.deletionInfo(), typeSizes, version);
+        size += typeSizes.sizeof(cf.getColumnCount());
+        ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
+        for (Cell cell : cf)
+            size += columnSerializer.serializedSize(cell, typeSizes);
         return size;
     }
 
@@ -155,7 +141,7 @@
         return serializedSize(cf, TypeSizes.NATIVE, version);
     }
 
-    public void serializeForSSTable(ColumnFamily cf, DataOutput out)
+    public void serializeForSSTable(ColumnFamily cf, DataOutputPlus out)
     {
         // Column families shouldn't be written directly to disk, use ColumnIndex.Builder instead
         throw new UnsupportedOperationException();
@@ -166,7 +152,7 @@
         throw new UnsupportedOperationException();
     }
 
-    public void serializeCfId(UUID cfId, DataOutput out, int version) throws IOException
+    public void serializeCfId(UUID cfId, DataOutputPlus out, int version) throws IOException
     {
         UUIDSerializer.serializer.serialize(cfId, out, version);
     }
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index d0ff951..a45d1b2 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.File;
-import java.io.FileFilter;
-import java.io.IOException;
+import java.io.*;
 import java.lang.management.ManagementFactory;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -30,32 +28,35 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
+import com.google.common.base.Predicate;
 import com.google.common.collect.*;
+import com.google.common.util.concurrent.*;
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.Uninterruptibles;
-import org.cliffc.high_scale_lib.NonBlockingHashMap;
+import org.apache.cassandra.io.FSWriteError;
+import org.json.simple.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cache.KeyCacheKey;
-import org.apache.cassandra.cache.IRowCacheEntry;
-import org.apache.cassandra.cache.RowCacheKey;
-import org.apache.cassandra.cache.RowCacheSentinel;
+import org.apache.cassandra.cache.*;
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
-import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.*;
 import org.apache.cassandra.config.CFMetaData.SpeculativeRetry;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.*;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.ExtendedFilter;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -63,22 +64,40 @@
 import org.apache.cassandra.io.compress.CompressionParameters;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.metadata.CompactionMetadata;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.metrics.ColumnFamilyMetrics;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.StreamLockfile;
-import org.apache.cassandra.thrift.IndexExpression;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.*;
-
-import static org.apache.cassandra.config.CFMetaData.Caching;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 public class ColumnFamilyStore implements ColumnFamilyStoreMBean
 {
     private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyStore.class);
 
-    public static final ExecutorService postFlushExecutor = new JMXEnabledThreadPoolExecutor("MemtablePostFlusher");
+    private static final ExecutorService flushExecutor = new JMXEnabledThreadPoolExecutor(DatabaseDescriptor.getFlushWriters(),
+                                                                                          StageManager.KEEPALIVE,
+                                                                                          TimeUnit.SECONDS,
+                                                                                          new LinkedBlockingQueue<Runnable>(),
+                                                                                          new NamedThreadFactory("MemtableFlushWriter"),
+                                                                                          "internal");
+    // the post-flush executor is single-threaded to guarantee that any flush Future on a CF will never return until prior flushes have completed
+    public static final ExecutorService postFlushExecutor = new JMXEnabledThreadPoolExecutor(1,
+                                                                                             StageManager.KEEPALIVE,
+                                                                                             TimeUnit.SECONDS,
+                                                                                             new LinkedBlockingQueue<Runnable>(),
+                                                                                             new NamedThreadFactory("MemtablePostFlush"),
+                                                                                             "internal");
+    public static final ExecutorService reclaimExecutor = new JMXEnabledThreadPoolExecutor(1, StageManager.KEEPALIVE,
+                                                                                           TimeUnit.SECONDS,
+                                                                                           new LinkedBlockingQueue<Runnable>(),
+                                                                                           new NamedThreadFactory("MemtableReclaimMemory"),
+                                                                                           "internal");
 
     public final Keyspace keyspace;
     public final String name;
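
The comment on postFlushExecutor above leans on a basic property of single-threaded executors: tasks run strictly in submission order, so a Future obtained for a later flush cannot complete before the post-flush work of earlier flushes has run. A small sketch of that ordering guarantee with a plain java.util.concurrent executor:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

// A single-threaded executor runs tasks strictly in submission order, so
// waiting on the Future of a later task implies all earlier tasks have run.
final class PostFlushOrderingSketch
{
    public static void main(String[] args) throws Exception
    {
        ExecutorService postFlush = Executors.newSingleThreadExecutor();
        postFlush.submit(() -> System.out.println("post-flush #1"));
        Future<?> second = postFlush.submit(() -> System.out.println("post-flush #2"));
        second.get(); // by the time this returns, task #1 has already completed
        postFlush.shutdown();
    }
}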
@@ -87,17 +106,24 @@
     private final String mbeanName;
     private volatile boolean valid = true;
 
-    /* Memtables and SSTables on disk for this column family */
+    /**
+     * Memtables and SSTables on disk for this column family.
+     *
+     * We synchronize on the DataTracker to ensure isolation when we want to make sure
+     * that the memtable we're acting on doesn't change out from under us.  I.e., flush
+     * syncronizes on it to make sure it can submit on both executors atomically,
+     * so anyone else who wants to make sure flush doesn't interfere should as well.
+     */
     private final DataTracker data;
 
+    /* The read order, used to track accesses to off-heap memtable storage */
+    public final OpOrder readOrdering = new OpOrder();
+
     /* This is used to generate the next index for a SSTable */
     private final AtomicInteger fileIndexGenerator = new AtomicInteger(0);
 
     public final SecondaryIndexManager indexManager;
 
-    private static final int INTERN_CUTOFF = 256;
-    public final ConcurrentMap<ByteBuffer, ByteBuffer> internedNames = new NonBlockingHashMap<ByteBuffer, ByteBuffer>();
-
     /* These are locally held copies to be changed from the config during runtime */
     private volatile DefaultInteger minCompactionThreshold;
     private volatile DefaultInteger maxCompactionThreshold;
@@ -128,8 +154,8 @@
 
         // If the CF comparator has changed, we need to change the memtable,
         // because the old one still aliases the previous comparator.
-        if (getMemtableThreadSafe().initialComparator != metadata.comparator)
-            switchMemtable(true, true);
+        if (data.getView().getCurrentMemtable().initialComparator != metadata.comparator)
+            switchMemtable();
     }
 
     private void maybeReloadCompactionStrategy()
@@ -158,14 +184,23 @@
             {
                 protected void runMayThrow() throws Exception
                 {
-                    if (getMemtableThreadSafe().isExpired())
+                    synchronized (data)
                     {
-                        // if memtable is already expired but didn't flush because it's empty,
-                        // then schedule another flush.
-                        if (isClean())
-                            scheduleFlush();
-                        else
-                            forceFlush(); // scheduleFlush() will be called by the constructor of the new memtable.
+                        Memtable current = data.getView().getCurrentMemtable();
+                        // if we're not expired, we've been hit by a scheduled flush for an already flushed memtable, so ignore
+                        if (current.isExpired())
+                        {
+                            if (current.isClean())
+                            {
+                                // if we're still clean, instead of swapping just reschedule a flush for later
+                                scheduleFlush();
+                            }
+                            else
+                            {
+                                // we'll be rescheduled by the constructor of the Memtable.
+                                forceFlush();
+                            }
+                        }
                     }
                 }
             };
@@ -244,7 +279,7 @@
         fileIndexGenerator.set(generation);
         sampleLatencyNanos = DatabaseDescriptor.getReadRpcTimeout() / 2;
 
-        Caching caching = metadata.getCaching();
+        CachingOptions caching = metadata.getCaching();
 
         logger.info("Initializing {}.{}", keyspace.getName(), name);
 
@@ -258,7 +293,7 @@
             data.addInitialSSTables(sstables);
         }
 
-        if (caching == Caching.ALL || caching == Caching.KEYS_ONLY)
+        if (caching.keyCache.isEnabled())
             CacheService.instance.keyCache.loadSaved(this);
 
         // compaction strategy should be created after the CFS has been prepared
@@ -328,7 +363,7 @@
         catch (Exception e)
         {
             // this shouldn't block anything.
-            logger.warn("Failed unregistering mbean: " + mbeanName, e);
+            logger.warn("Failed unregistering mbean: {}", mbeanName, e);
         }
 
         compactionStrategy.shutdown();
@@ -337,14 +372,7 @@
         data.unreferenceSSTables();
         indexManager.invalidate();
 
-        for (RowCacheKey key : CacheService.instance.rowCache.getKeySet())
-            if (key.cfId == metadata.cfId)
-                invalidateCachedRow(key);
-
-        String ksname = keyspace.getName();
-        for (KeyCacheKey key : CacheService.instance.keyCache.getKeySet())
-            if (key.getPathInfo().left.equals(ksname) && key.getPathInfo().right.equals(name))
-                CacheService.instance.keyCache.remove(key);
+        invalidateCaches();
     }
 
     /**
@@ -404,7 +432,7 @@
                                                                          boolean loadSSTables)
     {
         // get the max generation number, to prevent generation conflicts
-        Directories directories = Directories.create(keyspace.getName(), columnFamily);
+        Directories directories = new Directories(metadata);
         Directories.SSTableLister lister = directories.sstableLister().includeBackups(true);
         List<Integer> generations = new ArrayList<Integer>();
         for (Map.Entry<Descriptor, Set<Component>> entry : lister.list().entrySet())
@@ -412,7 +440,7 @@
             Descriptor desc = entry.getKey();
             generations.add(desc.generation);
             if (!desc.isCompatible())
-                throw new RuntimeException(String.format("Incompatible SSTable found.  Current version %s is unable to read file: %s.  Please run upgradesstables.",
+                throw new RuntimeException(String.format("Incompatible SSTable found. Current version %s is unable to read file: %s. Please run upgradesstables.",
                                                           Descriptor.Version.CURRENT, desc));
         }
         Collections.sort(generations);
@@ -425,9 +453,9 @@
      * Removes unnecessary files from the cf directory at startup: these include temp files, orphans, zero-length files
      * and compacted sstables. Files that cannot be recognized will be ignored.
      */
-    public static void scrubDataDirectories(String keyspaceName, String columnFamily)
+    public static void scrubDataDirectories(CFMetaData metadata)
     {
-        Directories directories = Directories.create(keyspaceName, columnFamily);
+        Directories directories = new Directories(metadata);
 
         // remove any left-behind SSTables from failed/stalled streaming
         FileFilter filter = new FileFilter()
@@ -453,14 +481,14 @@
             }
         }
 
-        logger.debug("Removing compacted SSTable files from {} (see http://wiki.apache.org/cassandra/MemtableSSTable)", columnFamily);
+        logger.debug("Removing compacted SSTable files from {} (see http://wiki.apache.org/cassandra/MemtableSSTable)", metadata.cfName);
 
         for (Map.Entry<Descriptor,Set<Component>> sstableFiles : directories.sstableLister().list().entrySet())
         {
             Descriptor desc = sstableFiles.getKey();
             Set<Component> components = sstableFiles.getValue();
 
-            if (components.contains(Component.COMPACTED_MARKER) || desc.temporary)
+            if (desc.type.isTemporary)
             {
                 SSTable.delete(desc, components);
                 continue;
@@ -480,7 +508,7 @@
         }
 
         // cleanup incomplete saved caches
-        Pattern tmpCacheFilePattern = Pattern.compile(keyspaceName + "-" + columnFamily + "-(Key|Row)Cache.*\\.tmp$");
+        Pattern tmpCacheFilePattern = Pattern.compile(metadata.ksName + "-" + metadata.cfName + "-(Key|Row)Cache.*\\.tmp$");
         File dir = new File(DatabaseDescriptor.getSavedCachesLocation());
 
         if (dir.exists())
@@ -489,15 +517,21 @@
             for (File file : dir.listFiles())
                 if (tmpCacheFilePattern.matcher(file.getName()).matches())
                     if (!file.delete())
-                        logger.warn("could not delete " + file.getAbsolutePath());
+                        logger.warn("could not delete {}", file.getAbsolutePath());
         }
 
         // also clean out any index leftovers.
-        CFMetaData cfm = Schema.instance.getCFMetaData(keyspaceName, columnFamily);
-        if (cfm != null) // secondary indexes aren't stored in DD.
+        for (ColumnDefinition def : metadata.allColumns())
         {
-            for (ColumnDefinition def : cfm.allColumns())
-                scrubDataDirectories(keyspaceName, cfm.indexColumnFamilyName(def));
+            if (def.isIndexed())
+            {
+                CellNameType indexComparator = SecondaryIndex.getIndexComparator(metadata, def);
+                if (indexComparator != null)
+                {
+                    CFMetaData indexMetadata = CFMetaData.newIndexMetadata(metadata, def, indexComparator);
+                    scrubDataDirectories(indexMetadata);
+                }
+            }
         }
     }
 
@@ -513,9 +547,9 @@
      * compactions, we remove the new ones (since those may be incomplete -- under LCS, we may create multiple
      * sstables from any given ancestor).
      */
-    public static void removeUnfinishedCompactionLeftovers(String keyspace, String columnfamily, Map<Integer, UUID> unfinishedCompactions)
+    public static void removeUnfinishedCompactionLeftovers(CFMetaData metadata, Map<Integer, UUID> unfinishedCompactions)
     {
-        Directories directories = Directories.create(keyspace, columnfamily);
+        Directories directories = new Directories(metadata);
 
         Set<Integer> allGenerations = new HashSet<>();
         for (Descriptor desc : directories.sstableLister().list().keySet())
@@ -528,20 +562,21 @@
             HashSet<Integer> missingGenerations = new HashSet<>(unfinishedGenerations);
             missingGenerations.removeAll(allGenerations);
             logger.debug("Unfinished compactions of {}.{} reference missing sstables of generations {}",
-                         keyspace, columnfamily, missingGenerations);
+                         metadata.ksName, metadata.cfName, missingGenerations);
         }
 
         // remove new sstables from compactions that didn't complete, and compute
         // set of ancestors that shouldn't exist anymore
         Set<Integer> completedAncestors = new HashSet<>();
-        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : directories.sstableLister().list().entrySet())
+        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : directories.sstableLister().skipTemporary(true).list().entrySet())
         {
             Descriptor desc = sstableFiles.getKey();
 
             Set<Integer> ancestors;
             try
             {
-                ancestors = SSTableMetadata.serializer.deserialize(desc).right;
+                CompactionMetadata compactionMetadata = (CompactionMetadata) desc.getMetadataSerializer().deserialize(desc, MetadataType.COMPACTION);
+                ancestors = compactionMetadata.ancestors;
             }
             catch (IOException e)
             {
@@ -591,13 +626,29 @@
 
         int cachedRowsRead = CacheService.instance.rowCache.loadSaved(this);
         if (cachedRowsRead > 0)
-            logger.info("completed loading ({} ms; {} keys) row cache for {}.{}",
+            logger.info("Completed loading ({} ms; {} keys) row cache for {}.{}",
                         TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start),
                         cachedRowsRead,
                         keyspace.getName(),
                         name);
     }
 
+    public void initCounterCache()
+    {
+        if (!metadata.isCounter() || CacheService.instance.counterCache.getCapacity() == 0)
+            return;
+
+        long start = System.nanoTime();
+
+        int cachedShardsRead = CacheService.instance.counterCache.loadSaved(this);
+        if (cachedShardsRead > 0)
+            logger.info("Completed loading ({} ms; {} shards) counter cache for {}.{}",
+                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start),
+                        cachedShardsRead,
+                        keyspace.getName(),
+                        name);
+    }
+
     /**
      * See #{@code StorageService.loadNewSSTables(String, String)} for more info
      *
@@ -616,7 +667,7 @@
      */
     public synchronized void loadNewSSTables()
     {
-        logger.info("Loading new SSTables for " + keyspace.getName() + "/" + name + "...");
+        logger.info("Loading new SSTables for {}/{}...", keyspace.getName(), name);
 
         Set<Descriptor> currentDescriptors = new HashSet<Descriptor>();
         for (SSTableReader sstable : data.getView().sstables)
@@ -630,7 +681,7 @@
 
             if (currentDescriptors.contains(descriptor))
                 continue; // old (initialized) SSTable found, skipping
-            if (descriptor.temporary) // in the process of being written
+            if (descriptor.type.isTemporary) // in the process of being written
                 continue;
 
             if (!descriptor.isCompatible())
@@ -642,10 +693,7 @@
             try
             {
                 if (new File(descriptor.filenameFor(Component.STATS)).exists())
-                {
-                    Pair<SSTableMetadata, Set<Integer>> oldMetadata = SSTableMetadata.serializer.deserialize(descriptor);
-                    LeveledManifest.mutateLevel(oldMetadata, descriptor, descriptor.filenameFor(Component.STATS), 0);
-                }
+                    descriptor.getMetadataSerializer().mutateLevel(descriptor, 0);
             }
             catch (IOException e)
             {
@@ -663,7 +711,7 @@
                                                descriptor.ksname,
                                                descriptor.cfname,
                                                fileIndexGenerator.incrementAndGet(),
-                                               false);
+                                               Descriptor.Type.FINAL);
             }
             while (new File(newDescriptor.filenameFor(Component.DATA)).exists());
 
@@ -685,11 +733,11 @@
 
         if (newSSTables.isEmpty())
         {
-            logger.info("No new SSTables were found for " + keyspace.getName() + "/" + name);
+            logger.info("No new SSTables were found for {}/{}", keyspace.getName(), name);
             return;
         }
 
-        logger.info("Loading new SSTables and building secondary indexes for " + keyspace.getName() + "/" + name + ": " + newSSTables);
+        logger.info("Loading new SSTables and building secondary indexes for {}/{}: {}", keyspace.getName(), name, newSSTables);
         SSTableReader.acquireReferences(newSSTables);
         data.addSSTables(newSSTables);
         try
@@ -701,7 +749,7 @@
             SSTableReader.releaseReferences(newSSTables);
         }
 
-        logger.info("Done loading load new SSTables for " + keyspace.getName() + "/" + name);
+        logger.info("Done loading new SSTables for {}/{}", keyspace.getName(), name);
     }
 
     public static void rebuildSecondaryIndex(String ksName, String cfName, String... idxNames)
@@ -742,133 +790,117 @@
                                          keyspace.getName(),
                                          name,
                                          fileIndexGenerator.incrementAndGet(),
-                                         true);
+                                         Descriptor.Type.TEMP);
         return desc.filenameFor(Component.DATA);
     }
 
     /**
-     * Switch and flush the current memtable, if it was dirty. The forceSwitch
-     * flag allow to force switching the memtable even if it is clean (though
-     * in that case we don't flush, as there is no point).
+     * Switches the memtable iff the live memtable is the one provided
+     *
+     * @param memtable
      */
-    public Future<?> switchMemtable(final boolean writeCommitLog, boolean forceSwitch)
+    public Future<?> switchMemtableIfCurrent(Memtable memtable)
     {
-        /*
-         * If we can get the writelock, that means no new updates can come in and
-         * all ongoing updates to memtables have completed. We can get the tail
-         * of the log and use it as the starting position for log replay on recovery.
-         *
-         * This is why we Keyspace.switchLock needs to be global instead of per-Keyspace:
-         * we need to schedule discardCompletedSegments calls in the same order as their
-         * contexts (commitlog position) were read, even though the flush executor
-         * is multithreaded.
-         */
-        Keyspace.switchLock.writeLock().lock();
-        try
+        synchronized (data)
         {
-            final Future<ReplayPosition> ctx = writeCommitLog ? CommitLog.instance.getContext() : Futures.immediateFuture(ReplayPosition.NONE);
-
-            // submit the memtable for any indexed sub-cfses, and our own.
-            final List<ColumnFamilyStore> icc = new ArrayList<ColumnFamilyStore>();
-            // don't assume that this.memtable is dirty; forceFlush can bring us here during index build even if it is not
-            for (ColumnFamilyStore cfs : concatWithIndexes())
-            {
-                if (forceSwitch || !cfs.getMemtableThreadSafe().isClean())
-                    icc.add(cfs);
-            }
-
-            final CountDownLatch latch = new CountDownLatch(icc.size());
-            for (ColumnFamilyStore cfs : icc)
-            {
-                Memtable memtable = cfs.data.switchMemtable();
-                // With forceSwitch it's possible to get a clean memtable here.
-                // In that case, since we've switched it already, just remove
-                // it from the memtable pending flush right away.
-                if (memtable.isClean())
-                {
-                    cfs.replaceFlushed(memtable, null);
-                    latch.countDown();
-                }
-                else
-                {
-                    logger.info("Enqueuing flush of {}", memtable);
-                    memtable.flushAndSignal(latch, ctx);
-                }
-            }
-
-            if (metric.memtableSwitchCount.count() == Long.MAX_VALUE)
-                metric.memtableSwitchCount.clear();
-            metric.memtableSwitchCount.inc();
-
-            // when all the memtables have been written, including for indexes, mark the flush in the commitlog header.
-            // a second executor makes sure the onMemtableFlushes get called in the right order,
-            // while keeping the wait-for-flush (future.get) out of anything latency-sensitive.
-            return postFlushExecutor.submit(new WrappedRunnable()
-            {
-                public void runMayThrow() throws InterruptedException, ExecutionException
-                {
-                    latch.await();
-
-                    if (!icc.isEmpty())
-                    {
-                        //only valid when memtables exist
-
-                        for (SecondaryIndex index : indexManager.getIndexesNotBackedByCfs())
-                        {
-                            // flush any non-cfs backed indexes
-                            logger.info("Flushing SecondaryIndex {}", index);
-                            index.forceBlockingFlush();
-                        }
-                    }
-
-                    if (writeCommitLog)
-                    {
-                        // if we're not writing to the commit log, we are replaying the log, so marking
-                        // the log header with "you can discard anything written before the context" is not valid
-                        CommitLog.instance.discardCompletedSegments(metadata.cfId, ctx.get());
-                    }
-                }
-            });
+            if (data.getView().getCurrentMemtable() == memtable)
+                return switchMemtable();
         }
-        finally
+        return Futures.immediateFuture(null);
+    }
+
+    /*
+     * switchMemtable puts Memtable.getSortedContents on the writer executor.  When the write is complete,
+     * we turn the writer into an SSTableReader and add it to ssTables where it is available for reads.
+     * This method does not block except for synchronizing on DataTracker, but the Future it returns will
+     * not complete until the Memtable (and all prior Memtables) have been successfully flushed, and the CL
+     * marked clean up to the position owned by the Memtable.
+     */
+    public ListenableFuture<?> switchMemtable()
+    {
+        synchronized (data)
         {
-            Keyspace.switchLock.writeLock().unlock();
+            logFlush();
+            Flush flush = new Flush(false);
+            flushExecutor.execute(flush);
+            ListenableFutureTask<?> task = ListenableFutureTask.create(flush.postFlush, null);
+            postFlushExecutor.submit(task);
+            return task;
         }
     }
 
-    private boolean isClean()
+    // print out size of all memtables we're enqueuing
+    private void logFlush()
     {
-        // during index build, 2ary index memtables can be dirty even if parent is not.  if so,
-        // we want flushLargestMemtables to flush the 2ary index ones too.
-        for (ColumnFamilyStore cfs : concatWithIndexes())
-            if (!cfs.getMemtableThreadSafe().isClean())
-                return false;
+        // reclaiming includes that which we are GC-ing;
+        float onHeapRatio = 0, offHeapRatio = 0;
+        long onHeapTotal = 0, offHeapTotal = 0;
+        Memtable memtable = getDataTracker().getView().getCurrentMemtable();
+        onHeapRatio +=  memtable.getAllocator().onHeap().ownershipRatio();
+        offHeapRatio += memtable.getAllocator().offHeap().ownershipRatio();
+        onHeapTotal += memtable.getAllocator().onHeap().owns();
+        offHeapTotal += memtable.getAllocator().offHeap().owns();
 
-        return true;
+        for (SecondaryIndex index : indexManager.getIndexes())
+        {
+            if (index.getIndexCfs() != null)
+            {
+                MemtableAllocator allocator = index.getIndexCfs().getDataTracker().getView().getCurrentMemtable().getAllocator();
+                onHeapRatio += allocator.onHeap().ownershipRatio();
+                offHeapRatio += allocator.offHeap().ownershipRatio();
+                onHeapTotal += allocator.onHeap().owns();
+                offHeapTotal += allocator.offHeap().owns();
+            }
+        }
+
+        logger.info("Enqueuing flush of {}: {}", name, String.format("%d (%.0f%%) on-heap, %d (%.0f%%) off-heap",
+                                                                     onHeapTotal, onHeapRatio * 100, offHeapTotal, offHeapRatio * 100));
+    }
+
+
+    public ListenableFuture<?> forceFlush()
+    {
+        return forceFlush(null);
     }
 
     /**
-     * @return a future, with a guarantee that any data inserted prior to the forceFlush() call is fully flushed
-     *         by the time future.get() returns. Never returns null.
+     * Flush if there is unflushed data that was written to the CommitLog before {@code flushIfDirtyBefore}
+     * (inclusive).  If {@code flushIfDirtyBefore} is null, flush if there is any unflushed data.
+     *
+     * @return a Future such that, when it completes, all data inserted before forceFlush was called
+     * will have been flushed.
      */
-    public Future<?> forceFlush()
+    public ListenableFuture<?> forceFlush(ReplayPosition flushIfDirtyBefore)
     {
-        if (isClean())
+        // we synchronize on the data tracker to ensure we don't race against other calls to switchMemtable(),
+        // unnecessarily queueing memtables that are about to be made clean
+        synchronized (data)
         {
-            // We could have a memtable for this column family that is being
-            // flushed. Make sure the future returned wait for that so callers can
-            // assume that any data inserted prior to the call are fully flushed
-            // when the future returns (see #5241).
-            return postFlushExecutor.submit(new Runnable()
-            {
-                public void run()
-                {
-                    logger.debug("forceFlush requested but everything is clean in {}", name);
-                }
-            });
-        }
+            // during index build, 2ary index memtables can be dirty even if parent is not.  if so,
+            // we want to flush the 2ary index ones too.
+            boolean clean = true;
+            for (ColumnFamilyStore cfs : concatWithIndexes())
+                clean &= cfs.data.getView().getCurrentMemtable().isCleanAfter(flushIfDirtyBefore);
 
-        return switchMemtable(true, false);
+            if (clean)
+            {
+                // We could have a memtable for this column family that is being
+                // flushed. Make sure the future returned waits for that so callers can
+                // assume that any data inserted prior to the call is fully flushed
+                // when the future returns (see #5241).
+                ListenableFutureTask<?> task = ListenableFutureTask.create(new Runnable()
+                {
+                    public void run()
+                    {
+                        logger.debug("forceFlush requested but everything is clean in {}", name);
+                    }
+                }, null);
+                postFlushExecutor.execute(task);
+                return task;
+            }
+
+            return switchMemtable();
+        }
     }
 
     public void forceBlockingFlush()
@@ -876,6 +908,216 @@
         FBUtilities.waitOnFuture(forceFlush());
     }
 
+    /**
+     * Both synchronises custom secondary indexes and provides ordering guarantees for futures on switchMemtable/flush
+     * etc., which expect to be able to wait until the requested flush (and all prior flushes) have completed.
+     */
+    private final class PostFlush implements Runnable
+    {
+        final boolean flushSecondaryIndexes;
+        final OpOrder.Barrier writeBarrier;
+        final CountDownLatch latch = new CountDownLatch(1);
+        volatile ReplayPosition lastReplayPosition;
+
+        private PostFlush(boolean flushSecondaryIndexes, OpOrder.Barrier writeBarrier)
+        {
+            this.writeBarrier = writeBarrier;
+            this.flushSecondaryIndexes = flushSecondaryIndexes;
+        }
+
+        public void run()
+        {
+            writeBarrier.await();
+
+            /**
+             * we can flush 2is as soon as the barrier completes, as they will be consistent with (or ahead of) the
+             * flushed memtables and CL position, which is as good as we can guarantee.
+             * TODO: SecondaryIndex should support setBarrier(), so custom implementations can co-ordinate exactly
+             * with CL as we do with memtables/CFS-backed SecondaryIndexes.
+             */
+
+            if (flushSecondaryIndexes)
+            {
+                for (SecondaryIndex index : indexManager.getIndexesNotBackedByCfs())
+                {
+                    // flush any non-cfs backed indexes
+                    logger.info("Flushing SecondaryIndex {}", index);
+                    index.forceBlockingFlush();
+                }
+            }
+
+            try
+            {
+                // we wait on the latch for the lastReplayPosition to be set, and so that waiters
+                // on this task can rely on all prior flushes being complete
+                latch.await();
+            }
+            catch (InterruptedException e)
+            {
+                throw new IllegalStateException();
+            }
+
+            // must check lastReplayPosition != null because Flush may find that all memtables are clean
+            // and so not set a lastReplayPosition
+            if (lastReplayPosition != null)
+            {
+                CommitLog.instance.discardCompletedSegments(metadata.cfId, lastReplayPosition);
+            }
+
+            metric.pendingFlushes.dec();
+        }
+    }
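+
+    /*
+     * Illustrative sketch only, not part of this change (the helper name is hypothetical): the ordering
+     * guarantee described above relies on postFlushExecutor being single-threaded, so PostFlush tasks run in
+     * submission order and each blocks on its own latch; a waiter on flush N therefore implicitly waits for
+     * flushes 1..N-1 as well.
+     */
+    @VisibleForTesting
+    static void postFlushOrderingSketch() throws InterruptedException, ExecutionException
+    {
+        final java.util.concurrent.ExecutorService sketchExecutor = java.util.concurrent.Executors.newSingleThreadExecutor();
+        final CountDownLatch firstFlushDone = new CountDownLatch(1);
+        sketchExecutor.submit(new WrappedRunnable()
+        {
+            public void runMayThrow() throws InterruptedException
+            {
+                firstFlushDone.await(); // the first post-flush task blocks until its flush signals completion
+            }
+        });
+        java.util.concurrent.Future<?> second = sketchExecutor.submit(new WrappedRunnable()
+        {
+            public void runMayThrow() { } // the second post-flush task has nothing left to wait for
+        });
+        firstFlushDone.countDown(); // once the first flush signals, both tasks complete, in submission order
+        second.get();               // so waiting on the later future implies the earlier flush has finished
+        sketchExecutor.shutdown();
+    }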
+
+    /**
+     * Should only be constructed/used from switchMemtable() or truncate(), with ownership of the DataTracker monitor.
+     * In the constructor the current memtable(s) are swapped, and a barrier on outstanding writes is issued;
+     * when run by the flushExecutor the barrier is waited on to ensure all outstanding writes have completed
+     * before all memtables are written, and the CL is either immediately marked clean or, if there are
+     * custom secondary indexes, the post-flush cleanup is left to update those indexes and mark the CL clean.
+     */
+    private final class Flush implements Runnable
+    {
+        final OpOrder.Barrier writeBarrier;
+        final List<Memtable> memtables;
+        final PostFlush postFlush;
+        final boolean truncate;
+
+        private Flush(boolean truncate)
+        {
+            // if true, we won't flush, we'll just wait for any outstanding writes, switch the memtable, and discard
+            this.truncate = truncate;
+
+            metric.pendingFlushes.inc();
+            /**
+             * To ensure correctness of the switch without blocking writes, run() needs to wait for all write operations
+             * started prior to the switch to complete. We do this by creating a Barrier on the writeOrdering
+             * that all write operations register themselves with, and assigning this barrier to the memtables,
+             * after which we issue() the barrier. This barrier is used to direct write operations started prior
+             * to the barrier.issue() into the memtable we have switched out, and any started after to its replacement.
+             * In doing so it also tells the write operations to update the lastReplayPosition of the memtable, so
+             * that we know the CL position we are dirty to, which can be marked clean when we complete.
+             */
+            writeBarrier = keyspace.writeOrder.newBarrier();
+            memtables = new ArrayList<>();
+
+            // submit flushes for the memtable for any indexed sub-cfses, and our own
+            final ReplayPosition minReplayPosition = CommitLog.instance.getContext();
+            for (ColumnFamilyStore cfs : concatWithIndexes())
+            {
+                // switch all memtables, regardless of their dirty status, setting the barrier
+                // so that we can reach a coordinated decision about cleanliness once they
+                // can no longer be modified
+                Memtable mt = cfs.data.switchMemtable(truncate);
+                mt.setDiscarding(writeBarrier, minReplayPosition);
+                memtables.add(mt);
+            }
+
+            writeBarrier.issue();
+            postFlush = new PostFlush(!truncate, writeBarrier);
+        }
+
+        public void run()
+        {
+            // mark writes older than the barrier as blocking progress, permitting them to exceed our memory limit
+            // if they are stuck waiting on it, then wait for them all to complete
+            writeBarrier.markBlocking();
+            writeBarrier.await();
+
+            // mark all memtables as flushing, removing them from the live memtable list, and
+            // remove any memtables that are already clean from the set we need to flush
+            Iterator<Memtable> iter = memtables.iterator();
+            while (iter.hasNext())
+            {
+                Memtable memtable = iter.next();
+                memtable.cfs.data.markFlushing(memtable);
+                if (memtable.isClean() || truncate)
+                {
+                    memtable.cfs.replaceFlushed(memtable, null);
+                    memtable.setDiscarded();
+                    iter.remove();
+                }
+            }
+
+            if (memtables.isEmpty())
+            {
+                postFlush.latch.countDown();
+                return;
+            }
+
+            metric.memtableSwitchCount.inc();
+
+            for (final Memtable memtable : memtables)
+            {
+                // flush the memtable
+                MoreExecutors.sameThreadExecutor().execute(memtable.flushRunnable());
+
+                // issue a read barrier for reclaiming the memory, and offload the wait to another thread
+                final OpOrder.Barrier readBarrier = readOrdering.newBarrier();
+                readBarrier.issue();
+                reclaimExecutor.execute(new WrappedRunnable()
+                {
+                    public void runMayThrow() throws InterruptedException, ExecutionException
+                    {
+                        readBarrier.await();
+                        memtable.setDiscarded();
+                    }
+                });
+            }
+
+            // signal the post-flush we've done our work
+            postFlush.lastReplayPosition = memtables.get(0).getLastReplayPosition();
+            postFlush.latch.countDown();
+        }
+    }
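+
+    /*
+     * Illustrative sketch only, not part of this change (the helper name is hypothetical), assuming OpOrder
+     * behaves as it is used above: operations that start() before a barrier is issue()d are "prior" operations
+     * that await() blocks on, while operations started after issue() are directed at the replacement memtable
+     * and are not waited for.
+     */
+    @VisibleForTesting
+    static void writeBarrierSketch(OpOrder order)
+    {
+        OpOrder.Group before = order.start();          // a write started before the switch; the flush must wait for it
+        OpOrder.Barrier barrier = order.newBarrier();
+        barrier.issue();                               // partitions writes between the old and the new memtable
+        OpOrder.Group after = order.start();           // a write started after the switch; not waited for
+        after.close();
+        before.close();                                // only once all prior groups close can await() return
+        barrier.await();
+    }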
+
+    /**
+     * Finds the largest memtable, as a percentage of *either* on- or off-heap memory limits, and immediately
+     * queues it for flushing. If the memtable selected is flushed before this completes, no work is done.
+     */
+    public static class FlushLargestColumnFamily implements Runnable
+    {
+        public void run()
+        {
+            float largestRatio = 0f;
+            Memtable largest = null;
+            for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
+            {
+                // we take a reference to the current main memtable for the CF prior to snapping its ownership ratios
+                // to ensure we have some ordering guarantee for performing switchMemtableIfCurrent(), i.e. we will only
+                // swap if the memtables we are measuring here haven't already been swapped by the time we try to swap them
+                Memtable current = cfs.getDataTracker().getView().getCurrentMemtable();
+
+                // find the total ownership ratio for the memtable and all SecondaryIndexes owned by this CF,
+                // both on- and off-heap, and select the larger of the two ratios to weight this CF
+                float onHeap = 0f, offHeap = 0f;
+                onHeap += current.getAllocator().onHeap().ownershipRatio();
+                offHeap += current.getAllocator().offHeap().ownershipRatio();
+
+                for (SecondaryIndex index : cfs.indexManager.getIndexes())
+                {
+                    if (index.getIndexCfs() != null)
+                    {
+                        MemtableAllocator allocator = index.getIndexCfs().getDataTracker().getView().getCurrentMemtable().getAllocator();
+                        onHeap += allocator.onHeap().ownershipRatio();
+                        offHeap += allocator.offHeap().ownershipRatio();
+                    }
+                }
+
+                float ratio = Math.max(onHeap, offHeap);
+
+                if (ratio > largestRatio)
+                {
+                    largest = current;
+                    largestRatio = ratio;
+                }
+            }
+
+            if (largest != null)
+                largest.cfs.switchMemtableIfCurrent(largest);
+        }
+    }
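+
+    /*
+     * Illustrative sketch only, not part of this change (the helper name is hypothetical): the weight used
+     * above is the larger of a CF's on-heap and off-heap ownership ratios, so e.g. a CF owning 30% of the
+     * on-heap limit and 45% of the off-heap limit weighs 0.45 and is preferred over one owning 40% on-heap
+     * and 10% off-heap.
+     */
+    @VisibleForTesting
+    static float flushWeightSketch(float onHeapRatio, float offHeapRatio)
+    {
+        return Math.max(onHeapRatio, offHeapRatio); // e.g. max(0.30f, 0.45f) == 0.45f
+    }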
+
     public void maybeUpdateRowCache(DecoratedKey key)
     {
         if (!isRowCacheEnabled())
@@ -892,15 +1134,14 @@
      * param @ key - key for update/insert
      * param @ columnFamily - columnFamily changes
      */
-    public void apply(DecoratedKey key, ColumnFamily columnFamily, SecondaryIndexManager.Updater indexer)
+    public void apply(DecoratedKey key, ColumnFamily columnFamily, SecondaryIndexManager.Updater indexer, OpOrder.Group opGroup, ReplayPosition replayPosition)
     {
         long start = System.nanoTime();
 
-        Memtable mt = getMemtableThreadSafe();
-        mt.put(key, columnFamily, indexer);
+        Memtable mt = data.getMemtableFor(opGroup);
+        mt.put(key, columnFamily, indexer, opGroup, replayPosition);
         maybeUpdateRowCache(key);
         metric.writeLatency.addNano(System.nanoTime() - start);
-        mt.maybeUpdateLiveRatio();
     }
 
     /**
@@ -914,7 +1155,7 @@
         cf.purgeTombstones(gcBefore);
 
         // if there are no columns or tombstones left, return null
-        return cf.getColumnCount() == 0 && !cf.isMarkedForDelete() ? null : cf;
+        return !cf.hasColumns() && !cf.isMarkedForDelete() ? null : cf;
     }
 
     /**
@@ -927,7 +1168,7 @@
     }
 
     /*
-     This is complicated because we need to preserve deleted columns, supercolumns, and columnfamilies
+     This is complicated because we need to preserve deleted columns and columnfamilies
      until they have been deleted for at least GC_GRACE_IN_SECONDS.  But, we do not need to preserve
      their contents; just the object itself as a "tombstone" that can be used to repair other
      replicas that do not know about the deletion.
@@ -939,8 +1180,7 @@
             return null;
         }
 
-        removeDeletedColumnsOnly(cf, gcBefore, indexer);
-        return removeDeletedCF(cf, gcBefore);
+        return removeDeletedCF(removeDeletedColumnsOnly(cf, gcBefore, indexer), gcBefore);
     }
 
     /**
@@ -948,15 +1188,14 @@
      * columns that have been dropped from the schema (for CQL3 tables only).
      * @return the updated ColumnFamily
      */
-    public static long removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
+    public static ColumnFamily removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
     {
-        Iterator<Column> iter = cf.iterator();
+        Iterator<Cell> iter = cf.iterator();
         DeletionInfo.InOrderTester tester = cf.inOrderDeletionTester();
         boolean hasDroppedColumns = !cf.metadata.getDroppedColumns().isEmpty();
-        long removedBytes = 0;
         while (iter.hasNext())
         {
-            Column c = iter.next();
+            Cell c = iter.next();
             // remove columns if
             // (a) the column itself is gcable or
             // (b) the column is shadowed by a CF tombstone
@@ -965,23 +1204,18 @@
             {
                 iter.remove();
                 indexer.remove(c);
-                removedBytes += c.dataSize();
             }
         }
-        return removedBytes;
-    }
 
-    public static long removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore)
-    {
-        return removeDeletedColumnsOnly(cf, gcBefore, SecondaryIndexManager.nullUpdater);
+        return cf;
     }
 
     // returns true if
     // 1. this column has been dropped from schema and
     // 2. if it has been re-added since then, this particular column was inserted before the last drop
-    private static boolean isDroppedColumn(Column c, CFMetaData meta)
+    private static boolean isDroppedColumn(Cell c, CFMetaData meta)
     {
-        Long droppedAt = meta.getDroppedColumns().get(((CompositeType) meta.comparator).extractLastComponent(c.name()));
+        Long droppedAt = meta.getDroppedColumns().get(c.name().cql3ColumnName(meta));
         return droppedAt != null && c.timestamp() <= droppedAt;
     }
 
@@ -990,7 +1224,7 @@
         if (cf == null || cf.metadata.getDroppedColumns().isEmpty())
             return;
 
-        Iterator<Column> iter = cf.iterator();
+        Iterator<Cell> iter = cf.iterator();
         while (iter.hasNext())
             if (isDroppedColumn(iter.next(), metadata))
                 iter.remove();
@@ -1074,7 +1308,7 @@
     {
         if (operation != OperationType.CLEANUP || isIndex())
         {
-            return SSTable.getTotalBytes(sstables);
+            return SSTableReader.getTotalBytes(sstables);
         }
 
         // cleanup size estimation only counts bytes for keys local to this node
@@ -1107,22 +1341,22 @@
         return maxFile;
     }
 
-    public void forceCleanup(CounterId.OneShotRenewer renewer) throws ExecutionException, InterruptedException
+    public CompactionManager.AllSSTableOpStatus forceCleanup() throws ExecutionException, InterruptedException
     {
-        CompactionManager.instance.performCleanup(ColumnFamilyStore.this, renewer);
+        return CompactionManager.instance.performCleanup(ColumnFamilyStore.this);
     }
 
-    public void scrub(boolean disableSnapshot, boolean skipCorrupted) throws ExecutionException, InterruptedException
+    public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted) throws ExecutionException, InterruptedException
     {
         // skip snapshot creation during scrub, SEE JIRA 5891
         if(!disableSnapshot)
             snapshotWithoutFlush("pre-scrub-" + System.currentTimeMillis());
-        CompactionManager.instance.performScrub(ColumnFamilyStore.this, skipCorrupted);
+        return CompactionManager.instance.performScrub(ColumnFamilyStore.this, skipCorrupted);
     }
 
-    public void sstablesRewrite(boolean excludeCurrentVersion) throws ExecutionException, InterruptedException
+    public CompactionManager.AllSSTableOpStatus sstablesRewrite(boolean excludeCurrentVersion) throws ExecutionException, InterruptedException
     {
-        CompactionManager.instance.performSSTableRewrite(ColumnFamilyStore.this, excludeCurrentVersion);
+        return CompactionManager.instance.performSSTableRewrite(ColumnFamilyStore.this, excludeCurrentVersion);
     }
 
     public void markObsolete(Collection<SSTableReader> sstables, OperationType compactionType)
@@ -1131,11 +1365,6 @@
         data.markObsolete(sstables, compactionType);
     }
 
-    public void replaceCompactedSSTables(Collection<SSTableReader> sstables, Collection<SSTableReader> replacements, OperationType compactionType)
-    {
-        data.replaceCompactedSSTables(sstables, replacements, compactionType);
-    }
-
     void replaceFlushed(Memtable memtable, SSTableReader sstable)
     {
         compactionStrategy.replaceFlushed(memtable, sstable);
@@ -1153,35 +1382,7 @@
 
     public long getMemtableDataSize()
     {
-        return metric.memtableDataSize.value();
-    }
-
-    public long getTotalMemtableLiveSize()
-    {
-        return getMemtableDataSize() + indexManager.getTotalLiveSize();
-    }
-
-    /**
-     * @return the live size of all the memtables (the current active one and pending flush).
-     */
-    public long getAllMemtablesLiveSize()
-    {
-        long size = 0;
-        for (Memtable mt : getDataTracker().getAllMemtables())
-            size += mt.getLiveSize();
-        return size;
-    }
-
-    /**
-     * @return the size of all the memtables, including the pending flush ones and 2i memtables, if any.
-     */
-    public long getTotalAllMemtablesLiveSize()
-    {
-        long size = getAllMemtablesLiveSize();
-        if (indexManager.hasIndexes())
-            for (ColumnFamilyStore index : indexManager.getIndexesBackedByCfs())
-                size += index.getAllMemtablesLiveSize();
-        return size;
+        return metric.memtableOnHeapSize.value();
     }
 
     public int getMemtableSwitchCount()
@@ -1189,11 +1390,6 @@
         return (int) metric.memtableSwitchCount.count();
     }
 
-    Memtable getMemtableThreadSafe()
-    {
-        return data.getMemtable();
-    }
-
     /**
      * Package protected for access from the CompactionManager.
      */
@@ -1249,7 +1445,7 @@
 
     public int getPendingTasks()
     {
-        return metric.pendingTasks.value();
+        return (int) metric.pendingFlushes.count();
     }
 
     public long getWriteCount()
@@ -1278,8 +1474,8 @@
     }
 
     public ColumnFamily getColumnFamily(DecoratedKey key,
-                                        ByteBuffer start,
-                                        ByteBuffer finish,
+                                        Composite start,
+                                        Composite finish,
                                         boolean reversed,
                                         int limit,
                                         long timestamp)
@@ -1288,12 +1484,16 @@
     }
 
     /**
-     * fetch the row given by filter.key if it is in the cache; if not, read it from disk and cache it
+     * Fetch the row and columns given by filter.key if it is in the cache; if not, read it from disk and cache it.
+     *
+     * If the row is cached and the filter given is within its bounds, we return from cache; otherwise from disk.
+     *
+     * If the row is not cached, we figure out which filter is "biggest", read that from disk, then
+     * filter the result and either cache that or return it.
+     *
      * @param cfId the column family to read the row from
-     * @param filter the columns being queried.  Note that we still cache entire rows, but if a row is uncached
-     *               and we race to cache it, only the winner will read the entire row
-     * @return the entire row for filter.key, if present in the cache (or we can cache it), or just the column
-     *         specified by filter otherwise
+     * @param filter the columns being queried.
+     * @return the requested data for the filter provided
      */
     private ColumnFamily getThroughCache(UUID cfId, QueryFilter filter)
     {
@@ -1304,6 +1504,7 @@
 
         // attempt a sentinel-read-cache sequence.  if a write invalidates our sentinel, we'll return our
         // (now potentially obsolete) data, but won't cache it. see CASSANDRA-3862
+        // TODO: don't evict entire rows on writes (#2864)
         IRowCacheEntry cached = CacheService.instance.rowCache.get(key);
         if (cached != null)
         {
@@ -1311,32 +1512,135 @@
             {
                 // Some other read is trying to cache the value, just do a normal non-caching read
                 Tracing.trace("Row cache miss (race)");
+                metric.rowCacheMiss.inc();
                 return getTopLevelColumns(filter, Integer.MIN_VALUE);
             }
-            Tracing.trace("Row cache hit");
-            return (ColumnFamily) cached;
+
+            ColumnFamily cachedCf = (ColumnFamily)cached;
+            if (isFilterFullyCoveredBy(filter.filter, cachedCf, filter.timestamp))
+            {
+                metric.rowCacheHit.inc();
+                Tracing.trace("Row cache hit");
+                return filterColumnFamily(cachedCf, filter);
+            }
+
+            metric.rowCacheHitOutOfRange.inc();
+            Tracing.trace("Ignoring row cache as cached value could not satisfy query");
+            return getTopLevelColumns(filter, Integer.MIN_VALUE);
         }
 
+        metric.rowCacheMiss.inc();
         Tracing.trace("Row cache miss");
         RowCacheSentinel sentinel = new RowCacheSentinel();
         boolean sentinelSuccess = CacheService.instance.rowCache.putIfAbsent(key, sentinel);
-
+        ColumnFamily data = null;
+        ColumnFamily toCache = null;
         try
         {
-            ColumnFamily data = getTopLevelColumns(QueryFilter.getIdentityFilter(filter.key, name, filter.timestamp),
-                                                   Integer.MIN_VALUE);
-            if (sentinelSuccess && data != null)
-                CacheService.instance.rowCache.replace(key, sentinel, data);
+            // If we are explicitly asked to fill the cache with full partitions, we go ahead and query the whole thing
+            if (metadata.getCaching().rowCache.cacheFullPartitions())
+            {
+                data = getTopLevelColumns(QueryFilter.getIdentityFilter(filter.key, name, filter.timestamp), Integer.MIN_VALUE);
+                toCache = data;
+                Tracing.trace("Populating row cache with the whole partition");
+                if (sentinelSuccess && toCache != null)
+                    CacheService.instance.rowCache.replace(key, sentinel, toCache);
+                return filterColumnFamily(data, filter);
+            }
 
-            return data;
+            // Otherwise, if we want to cache the result of the query we're about to do, we must make sure this query
+            // covers what needs to be cached. And if the user filter does not satisfy that, we sometimes extend said
+            // filter so we can populate the cache but only if:
+            //   1) we can guarantee it is a strict extension, i.e. that we will still fetch the data asked by the user.
+            //   2) the extension does not make us query more than getRowsPerPartitionToCache() (as a means to limit the
+            //      amount of extra work we'll do on a user query for the purpose of populating the cache).
+            //
+            // In practice, we can only guarantee those 2 points if the filter is one that queries the head of the
+            // partition (and if that filter actually counts CQL3 rows since that's what we cache and it would be
+            // bogus to compare the filter count to the 'rows to cache' otherwise).
+            if (filter.filter.isHeadFilter() && filter.filter.countCQL3Rows(metadata.comparator))
+            {
+                SliceQueryFilter sliceFilter = (SliceQueryFilter)filter.filter;
+                int rowsToCache = metadata.getCaching().rowCache.rowsToCache;
+
+                SliceQueryFilter cacheSlice = readFilterForCache();
+                QueryFilter cacheFilter = new QueryFilter(filter.key, name, cacheSlice, filter.timestamp);
+
+                // If the filter count is less than the number of rows to cache, we simply extend it to make sure we do cover the
+                // number of rows to cache, and if that count is greater than the number of rows to cache, we simply filter what
+                // needs to be cached afterwards.
+                if (sliceFilter.count < rowsToCache)
+                {
+                    toCache = getTopLevelColumns(cacheFilter, Integer.MIN_VALUE);
+                    if (toCache != null)
+                    {
+                        Tracing.trace("Populating row cache ({} rows cached)", cacheSlice.lastCounted());
+                        data = filterColumnFamily(toCache, filter);
+                    }
+                }
+                else
+                {
+                    data = getTopLevelColumns(filter, Integer.MIN_VALUE);
+                    if (data != null)
+                    {
+                        // The filter limit was greater than the number of rows to cache. But, if the filter had a non-empty
+                        // finish bound, we may have gotten less than what needs to be cached, in which case we shouldn't cache it
+                        // (otherwise a cache hit would assume the whole partition is cached which is not the case).
+                        if (sliceFilter.finish().isEmpty() || sliceFilter.lastCounted() >= rowsToCache)
+                        {
+                            toCache = filterColumnFamily(data, cacheFilter);
+                            Tracing.trace("Caching {} rows (out of {} requested)", cacheSlice.lastCounted(), sliceFilter.count);
+                        }
+                        else
+                        {
+                            Tracing.trace("Not populating row cache, not enough rows fetched ({} fetched but {} required for the cache)", sliceFilter.lastCounted(), rowsToCache);
+                        }
+                    }
+                }
+
+                if (sentinelSuccess && toCache != null)
+                    CacheService.instance.rowCache.replace(key, sentinel, toCache);
+                return data;
+            }
+            else
+            {
+                Tracing.trace("Fetching data but not populating cache as query does not query from the start of the partition");
+                return getTopLevelColumns(filter, Integer.MIN_VALUE);
+            }
         }
         finally
         {
-            if (sentinelSuccess && data == null)
+            if (sentinelSuccess && toCache == null)
                 invalidateCachedRow(key);
         }
     }
 
+    public SliceQueryFilter readFilterForCache()
+    {
+        // We create a new filter every time because, for now, SliceQueryFilter is unfortunately mutable.
+        return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, metadata.getCaching().rowCache.rowsToCache, metadata.clusteringColumns().size());
+    }
+
+    public boolean isFilterFullyCoveredBy(IDiskAtomFilter filter, ColumnFamily cachedCf, long now)
+    {
+        // We can use the cached value only if we know that no data it doesn't contain could be covered
+        // by the query filter, that is if:
+        //   1) either the whole partition is cached
+        //   2) or we can ensure that any data the filter selects is in the cached partition
+
+        // When counting rows to decide if the whole partition is cached, we should be careful with expiring
+        // columns: if we use a timestamp newer than the one that was used when populating the cache, we might
+        // end up deciding the whole partition is cached when it's really not (just some rows expired since the
+        // cf was cached). This is the reason for Integer.MIN_VALUE below.
+        boolean wholePartitionCached = cachedCf.liveCQL3RowCount(Integer.MIN_VALUE) < metadata.getCaching().rowCache.rowsToCache;
+
+        // Unlike the "wholePartitionCached" check above, we do want isFullyCoveredBy to take the
+        // timestamp of the query into account when dealing with expired columns. Otherwise, we could think
+        // the cached partition has enough live rows to satisfy the filter when it doesn't because some
+        // are now expired.
+        return wholePartitionCached || filter.isFullyCoveredBy(cachedCf, now);
+    }
+
     public int gcBefore(long now)
     {
         return (int) (now / 1000) - metadata.getGcGraceSeconds();
@@ -1369,7 +1673,7 @@
                     return null;
                 }
 
-                result = filterColumnFamily(cached, filter);
+                result = cached;
             }
             else
             {
@@ -1405,11 +1709,12 @@
      */
     ColumnFamily filterColumnFamily(ColumnFamily cached, QueryFilter filter)
     {
-        ColumnFamily cf = cached.cloneMeShallow(ArrayBackedSortedColumns.factory, filter.filter.isReversed());
-        OnDiskAtomIterator ci = filter.getColumnFamilyIterator(cached);
+        if (cached == null)
+            return null;
 
+        ColumnFamily cf = cached.cloneMeShallow(ArrayBackedSortedColumns.factory, filter.filter.isReversed());
         int gcBefore = gcBefore(filter.timestamp);
-        filter.collateOnDiskAtom(cf, ci, gcBefore);
+        filter.collateOnDiskAtom(cf, filter.getIterator(cached), gcBefore);
         return removeDeletedCF(cf, gcBefore);
     }
 
@@ -1445,68 +1750,90 @@
         return markCurrentViewReferenced().sstables;
     }
 
-    private ViewFragment markReferenced(Function<DataTracker.View, List<SSTableReader>> filter)
+    public Set<SSTableReader> getUnrepairedSSTables()
     {
-        List<SSTableReader> sstables;
-        DataTracker.View view;
+        Set<SSTableReader> unRepairedSSTables = new HashSet<>(getSSTables());
+        Iterator<SSTableReader> sstableIterator = unRepairedSSTables.iterator();
+        while(sstableIterator.hasNext())
+        {
+            SSTableReader sstable = sstableIterator.next();
+            if (sstable.isRepaired())
+                sstableIterator.remove();
+        }
+        return unRepairedSSTables;
+    }
 
+    public Set<SSTableReader> getRepairedSSTables()
+    {
+        Set<SSTableReader> repairedSSTables = new HashSet<>(getSSTables());
+        Iterator<SSTableReader> sstableIterator = repairedSSTables.iterator();
+        while(sstableIterator.hasNext())
+        {
+            SSTableReader sstable = sstableIterator.next();
+            if (!sstable.isRepaired())
+                sstableIterator.remove();
+        }
+        return repairedSSTables;
+    }
+
+    public ViewFragment selectAndReference(Function<DataTracker.View, List<SSTableReader>> filter)
+    {
         while (true)
         {
-            view = data.getView();
-
-            if (view.intervalTree.isEmpty())
-            {
-                sstables = Collections.emptyList();
-                break;
-            }
-
-            sstables = filter.apply(view);
-            if (SSTableReader.acquireReferences(sstables))
-                break;
-            // retry w/ new view
+            ViewFragment view = select(filter);
+            if (view.sstables.isEmpty() || SSTableReader.acquireReferences(view.sstables))
+                return view;
         }
-
-        return new ViewFragment(sstables, Iterables.concat(Collections.singleton(view.memtable), view.memtablesPendingFlush));
     }
 
+    public ViewFragment select(Function<DataTracker.View, List<SSTableReader>> filter)
+    {
+        DataTracker.View view = data.getView();
+        List<SSTableReader> sstables = view.intervalTree.isEmpty()
+                                       ? Collections.<SSTableReader>emptyList()
+                                       : filter.apply(view);
+        return new ViewFragment(sstables, view.getAllMemtables());
+    }
+
+
     /**
      * @return a ViewFragment containing the sstables and memtables that may need to be merged
      * for the given @param key, according to the interval tree
      */
-    public ViewFragment markReferenced(final DecoratedKey key)
+    public Function<DataTracker.View, List<SSTableReader>> viewFilter(final DecoratedKey key)
     {
         assert !key.isMinimum(partitioner);
-        return markReferenced(new Function<DataTracker.View, List<SSTableReader>>()
+        return new Function<DataTracker.View, List<SSTableReader>>()
         {
             public List<SSTableReader> apply(DataTracker.View view)
             {
                 return compactionStrategy.filterSSTablesForReads(view.intervalTree.search(key));
             }
-        });
+        };
     }
 
     /**
      * @return a ViewFragment containing the sstables and memtables that may need to be merged
      * for rows within @param rowBounds, inclusive, according to the interval tree.
      */
-    public ViewFragment markReferenced(final AbstractBounds<RowPosition> rowBounds)
+    public Function<DataTracker.View, List<SSTableReader>> viewFilter(final AbstractBounds<RowPosition> rowBounds)
     {
-        return markReferenced(new Function<DataTracker.View, List<SSTableReader>>()
+        return new Function<DataTracker.View, List<SSTableReader>>()
         {
             public List<SSTableReader> apply(DataTracker.View view)
             {
                 return compactionStrategy.filterSSTablesForReads(view.sstablesInBounds(rowBounds));
             }
-        });
+        };
     }
 
     /**
      * @return a ViewFragment containing the sstables and memtables that may need to be merged
      * for rows for all of @param rowBoundsCollection, inclusive, according to the interval tree.
      */
-    public ViewFragment markReferenced(final Collection<AbstractBounds<RowPosition>> rowBoundsCollection)
+    public Function<DataTracker.View, List<SSTableReader>> viewFilter(final Collection<AbstractBounds<RowPosition>> rowBoundsCollection)
     {
-        return markReferenced(new Function<DataTracker.View, List<SSTableReader>>()
+        return new Function<DataTracker.View, List<SSTableReader>>()
         {
             public List<SSTableReader> apply(DataTracker.View view)
             {
@@ -1516,17 +1843,16 @@
 
                 return ImmutableList.copyOf(sstables);
             }
-        });
+        };
     }
 
     public List<String> getSSTablesForKey(String key)
     {
         DecoratedKey dk = partitioner.decorateKey(metadata.getKeyValidator().fromString(key));
-        ViewFragment view = markReferenced(dk);
-        try
+        try (OpOrder.Group op = readOrdering.start())
         {
-            List<String> files = new ArrayList<String>();
-            for (SSTableReader sstr : view.sstables)
+            List<String> files = new ArrayList<>();
+            for (SSTableReader sstr : select(viewFilter(dk)).sstables)
             {
                 // check if the key actually exists in this sstable, without updating cache and stats
                 if (sstr.getPosition(dk, SSTableReader.Operator.EQ, false) != null)
@@ -1534,17 +1860,17 @@
             }
             return files;
         }
-        finally
-        {
-            SSTableReader.releaseReferences(view.sstables);
-        }
     }
 
     public ColumnFamily getTopLevelColumns(QueryFilter filter, int gcBefore)
     {
         Tracing.trace("Executing single-partition query on {}", name);
         CollationController controller = new CollationController(this, filter, gcBefore);
-        ColumnFamily columns = controller.getTopLevelColumns();
+        ColumnFamily columns;
+        try (OpOrder.Group op = readOrdering.start())
+        {
+            columns = controller.getTopLevelColumns(Memtable.MEMORY_POOL.needToCopyOnHeap());
+        }
         metric.updateSSTableIterated(controller.getSstablesIterated());
         return columns;
     }
@@ -1556,9 +1882,19 @@
         for (RowCacheKey key : CacheService.instance.rowCache.getKeySet())
         {
             DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.key));
-            if (key.cfId == metadata.cfId && !Range.isInRanges(dk.token, ranges))
+            if (key.cfId == metadata.cfId && !Range.isInRanges(dk.getToken(), ranges))
                 invalidateCachedRow(dk);
         }
+
+        if (metadata.isCounter())
+        {
+            for (CounterCacheKey key : CacheService.instance.counterCache.getKeySet())
+            {
+                DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.partitionKey));
+                if (key.cfId == metadata.cfId && !Range.isInRanges(dk.getToken(), ranges))
+                    CacheService.instance.counterCache.remove(key);
+            }
+        }
     }
 
     public static abstract class AbstractScanIterator extends AbstractIterator<Row> implements CloseableIterator<Row>
@@ -1578,51 +1914,41 @@
     {
         assert !(range.keyRange() instanceof Range) || !((Range)range.keyRange()).isWrapAround() || range.keyRange().right.isMinimum(partitioner) : range.keyRange();
 
-        final ViewFragment view = markReferenced(range.keyRange());
+        final ViewFragment view = select(viewFilter(range.keyRange()));
         Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(), range.keyRange().getString(metadata.getKeyValidator()));
 
-        try
-        {
-            final CloseableIterator<Row> iterator = RowIteratorFactory.getIterator(view.memtables, view.sstables, range, this, now);
+        final CloseableIterator<Row> iterator = RowIteratorFactory.getIterator(view.memtables, view.sstables, range, this, now);
 
-            // todo this could be pushed into SSTableScanner
-            return new AbstractScanIterator()
+        // todo this could be pushed into SSTableScanner
+        return new AbstractScanIterator()
+        {
+            protected Row computeNext()
             {
-                protected Row computeNext()
-                {
-                    // pull a row out of the iterator
-                    if (!iterator.hasNext())
-                        return endOfData();
+                // pull a row out of the iterator
+                if (!iterator.hasNext())
+                    return endOfData();
 
-                    Row current = iterator.next();
-                    DecoratedKey key = current.key;
+                Row current = iterator.next();
+                DecoratedKey key = current.key;
 
-                    if (!range.stopKey().isMinimum(partitioner) && range.stopKey().compareTo(key) < 0)
-                        return endOfData();
+                if (!range.stopKey().isMinimum(partitioner) && range.stopKey().compareTo(key) < 0)
+                    return endOfData();
 
-                    // skipping outside of assigned range
-                    if (!range.contains(key))
-                        return computeNext();
+                // skipping outside of assigned range
+                if (!range.contains(key))
+                    return computeNext();
 
-                    if (logger.isTraceEnabled())
-                        logger.trace("scanned {}", metadata.getKeyValidator().getString(key.key));
+                if (logger.isTraceEnabled())
+                    logger.trace("scanned {}", metadata.getKeyValidator().getString(key.getKey()));
 
-                    return current;
-                }
+                return current;
+            }
 
-                public void close() throws IOException
-                {
-                    SSTableReader.releaseReferences(view.sstables);
-                    iterator.close();
-                }
-            };
-        }
-        catch (RuntimeException e)
-        {
-            // In case getIterator() throws, otherwise the iteror close method releases the references.
-            SSTableReader.releaseReferences(view.sstables);
-            throw e;
-        }
+            public void close() throws IOException
+            {
+                iterator.close();
+            }
+        };
     }
 
     @VisibleForTesting
@@ -1655,8 +1981,8 @@
      */
     public ExtendedFilter makeExtendedFilter(AbstractBounds<RowPosition> keyRange,
                                              SliceQueryFilter columnRange,
-                                             ByteBuffer columnStart,
-                                             ByteBuffer columnStop,
+                                             Composite columnStart,
+                                             Composite columnStop,
                                              List<IndexExpression> rowFilter,
                                              int maxResults,
                                              boolean countCQL3Rows,
@@ -1704,7 +2030,7 @@
     public List<Row> getRangeSlice(ExtendedFilter filter)
     {
         long start = System.nanoTime();
-        try
+        try (OpOrder.Group op = readOrdering.start())
         {
             return filter(getSequentialIterator(filter.dataRange, filter.timestamp), filter);
         }
@@ -1761,12 +2087,12 @@
                     {
                         ColumnFamily cf = filter.cfs.getColumnFamily(new QueryFilter(rawRow.key, name, extraFilter, filter.timestamp));
                         if (cf != null)
-                            data.addAll(cf, HeapAllocator.instance);
+                            data.addAll(cf);
                     }
 
                     removeDroppedColumns(data);
 
-                    if (!filter.isSatisfiedBy(rawRow.key, data, null))
+                    if (!filter.isSatisfiedBy(rawRow.key, data, null, null))
                         continue;
 
                     logger.trace("{} satisfies all filter expressions", data);
@@ -1803,30 +2129,39 @@
         }
     }
 
-    public AbstractType<?> getComparator()
+    public CellNameType getComparator()
     {
         return metadata.comparator;
     }
 
     public void snapshotWithoutFlush(String snapshotName)
     {
+        snapshotWithoutFlush(snapshotName, null);
+    }
+
+    public void snapshotWithoutFlush(String snapshotName, Predicate<SSTableReader> predicate)
+    {
         for (ColumnFamilyStore cfs : concatWithIndexes())
         {
             DataTracker.View currentView = cfs.markCurrentViewReferenced();
-
+            final JSONArray filesJSONArr = new JSONArray();
             try
             {
                 for (SSTableReader ssTable : currentView.sstables)
                 {
+                    if (ssTable.openReason == SSTableReader.OpenReason.EARLY || (predicate != null && !predicate.apply(ssTable)))
+                    {
+                        continue;
+                    }
+
                     File snapshotDirectory = Directories.getSnapshotDirectory(ssTable.descriptor, snapshotName);
                     ssTable.createLinks(snapshotDirectory.getPath()); // hard links
+                    filesJSONArr.add(ssTable.descriptor.relativeFilenameFor(Component.DATA));
                     if (logger.isDebugEnabled())
-                        logger.debug("Snapshot for " + keyspace + " keyspace data file " + ssTable.getFilename() +
-                                     " created in " + snapshotDirectory);
+                        logger.debug("Snapshot for {} keyspace data file {} created in {}", keyspace, ssTable.getFilename(), snapshotDirectory);
                 }
 
-                if (cfs.compactionStrategy instanceof LeveledCompactionStrategy)
-                    cfs.directories.snapshotLeveledManifest(snapshotName);
+                writeSnapshotManifest(filesJSONArr, snapshotName);
             }
             finally
             {
@@ -1835,6 +2170,26 @@
         }
     }
 
+    private void writeSnapshotManifest(final JSONArray filesJSONArr, final String snapshotName)
+    {
+        final File manifestFile = directories.getSnapshotManifestFile(snapshotName);
+        final JSONObject manifestJSON = new JSONObject();
+        manifestJSON.put("files", filesJSONArr);
+
+        try
+        {
+            if (!manifestFile.getParentFile().exists())
+                manifestFile.getParentFile().mkdirs();
+            try (PrintStream out = new PrintStream(manifestFile))
+            {
+                out.println(manifestJSON.toJSONString());
+            }
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, manifestFile);
+        }
+    }
+
     public List<SSTableReader> getSnapshotSSTableReader(String tag) throws IOException
     {
         Map<Descriptor, Set<Component>> snapshots = directories.sstableLister().snapshots(tag).list();
@@ -1851,8 +2206,13 @@
      */
     public void snapshot(String snapshotName)
     {
+        snapshot(snapshotName, null);
+    }
+
+    public void snapshot(String snapshotName, Predicate<SSTableReader> predicate)
+    {
         forceBlockingFlush();
-        snapshotWithoutFlush(snapshotName);
+        snapshotWithoutFlush(snapshotName, predicate);
     }
 
     public boolean snapshotExists(String snapshotName)
@@ -1876,6 +2236,15 @@
         List<File> snapshotDirs = directories.getCFDirectories();
         Directories.clearSnapshot(snapshotName, snapshotDirs);
     }
+    /**
+     *
+     * @return a map of all snapshots to the space being used; the pair for a snapshot
+     * holds the true size and the size on disk.
+     */
+    public Map<String, Pair<Long,Long>> getSnapshotDetails()
+    {
+        return directories.getSnapshotDetails();
+    }
 
     public boolean hasUnreclaimedSpace()
     {
@@ -1911,7 +2280,15 @@
             return null;
 
         IRowCacheEntry cached = CacheService.instance.rowCache.getInternal(new RowCacheKey(metadata.cfId, key));
-        return cached == null || cached instanceof RowCacheSentinel ? null : (ColumnFamily) cached;
+        return cached == null || cached instanceof RowCacheSentinel ? null : (ColumnFamily)cached;
+    }
+
+    private void invalidateCaches()
+    {
+        CacheService.instance.invalidateKeyCacheForCf(metadata.cfId);
+        CacheService.instance.invalidateRowCacheForCf(metadata.cfId);
+        if (metadata.isCounter())
+            CacheService.instance.invalidateCounterCacheForCf(metadata.cfId);
     }
 
     /**
@@ -1936,6 +2313,20 @@
         invalidateCachedRow(new RowCacheKey(cfId, key));
     }
 
+    public ClockAndCount getCachedCounter(ByteBuffer partitionKey, CellName cellName)
+    {
+        if (CacheService.instance.counterCache.getCapacity() == 0L) // counter cache disabled.
+            return null;
+        return CacheService.instance.counterCache.get(CounterCacheKey.create(metadata.cfId, partitionKey, cellName));
+    }
+
+    public void putCachedCounter(ByteBuffer partitionKey, CellName cellName, ClockAndCount clockAndCount)
+    {
+        if (CacheService.instance.counterCache.getCapacity() == 0L) // counter cache disabled.
+            return;
+        CacheService.instance.counterCache.put(CounterCacheKey.create(metadata.cfId, partitionKey, cellName), clockAndCount);
+    }
+
     public void forceMajorCompaction() throws InterruptedException, ExecutionException
     {
         CompactionManager.instance.performMaximal(this);
@@ -1953,14 +2344,37 @@
 
     public Iterable<DecoratedKey> keySamples(Range<Token> range)
     {
-        Collection<SSTableReader> sstables = getSSTables();
-        Iterable<DecoratedKey>[] samples = new Iterable[sstables.size()];
-        int i = 0;
-        for (SSTableReader sstable: sstables)
+        Collection<SSTableReader> sstables = markCurrentSSTablesReferenced();
+        try
         {
-            samples[i++] = sstable.getKeySamples(range);
+            Iterable<DecoratedKey>[] samples = new Iterable[sstables.size()];
+            int i = 0;
+            for (SSTableReader sstable: sstables)
+            {
+                samples[i++] = sstable.getKeySamples(range);
+            }
+            return Iterables.concat(samples);
         }
-        return Iterables.concat(samples);
+        finally
+        {
+            SSTableReader.releaseReferences(sstables);
+        }
+    }
+
+    public long estimatedKeysForRange(Range<Token> range)
+    {
+        Collection<SSTableReader> sstables = markCurrentSSTablesReferenced();
+        try
+        {
+            long count = 0;
+            for (SSTableReader sstable : sstables)
+                count += sstable.estimatedKeysForRanges(Collections.singleton(range));
+            return count;
+        }
+        finally
+        {
+            SSTableReader.releaseReferences(sstables);
+        }
     }
 
     /**
@@ -2013,18 +2427,12 @@
         }
         else
         {
-            Keyspace.switchLock.writeLock().lock();
-            try
+            // just nuke the memtable data w/o writing to disk first
+            synchronized (data)
             {
-                for (ColumnFamilyStore cfs : concatWithIndexes())
-                {
-                    Memtable mt = cfs.getMemtableThreadSafe();
-                    if (!mt.isClean())
-                        mt.cfs.data.renewMemtable();
-                }
-            } finally
-            {
-                Keyspace.switchLock.writeLock().unlock();
+                final Flush flush = new Flush(true);
+                flushExecutor.execute(flush);
+                postFlushExecutor.submit(flush.postFlush);
             }
         }
 
@@ -2046,13 +2454,8 @@
                     index.truncateBlocking(truncatedAt);
 
                 SystemKeyspace.saveTruncationRecord(ColumnFamilyStore.this, truncatedAt, replayAfter);
-
                 logger.debug("cleaning out row cache");
-                for (RowCacheKey key : CacheService.instance.rowCache.getKeySet())
-                {
-                    if (key.cfId == metadata.cfId)
-                        invalidateCachedRow(key);
-                }
+                invalidateCaches();
             }
         };
 
@@ -2100,7 +2503,8 @@
                 {
                     if (!cfs.getDataTracker().getCompacting().isEmpty())
                     {
-                        logger.warn("Unable to cancel in-progress compactions for {}.  Probably there is an unusually large row in progress somewhere.  It is also possible that buggy code left some sstables compacting after it was done with them", metadata.cfName);
+                        logger.warn("Unable to cancel in-progress compactions for {}.  Perhaps there is an unusually large row in progress somewhere, or the system is simply overloaded.", metadata.cfName);
+                        return null;
                     }
                 }
                 logger.debug("Compactions successfully cancelled");
@@ -2132,7 +2536,7 @@
                 assert data.getCompacting().isEmpty() : data.getCompacting();
                 Iterable<SSTableReader> sstables = Lists.newArrayList(AbstractCompactionStrategy.filterSuspectSSTables(getSSTables()));
                 if (Iterables.isEmpty(sstables))
-                    return null;
+                    return Collections.emptyList();
                 boolean success = data.markCompacting(sstables);
                 assert success : "something marked things compacting while compactions are disabled";
                 return sstables;
@@ -2304,43 +2708,11 @@
         return partitioner instanceof LocalPartitioner;
     }
 
-    private ByteBuffer intern(ByteBuffer name)
-    {
-        ByteBuffer internedName = internedNames.get(name);
-        if (internedName == null)
-        {
-            internedName = ByteBufferUtil.clone(name);
-            ByteBuffer concurrentName = internedNames.putIfAbsent(internedName, internedName);
-            if (concurrentName != null)
-                internedName = concurrentName;
-        }
-        return internedName;
-    }
-
-    public ByteBuffer internOrCopy(ByteBuffer name, Allocator allocator)
-    {
-        if (internedNames.size() >= INTERN_CUTOFF)
-            return allocator.clone(name);
-
-        return intern(name);
-    }
-
-    public ByteBuffer maybeIntern(ByteBuffer name)
-    {
-        if (internedNames.size() >= INTERN_CUTOFF)
-            return null;
-
-        return intern(name);
-    }
-
     public Iterable<ColumnFamilyStore> concatWithIndexes()
     {
-        return Iterables.concat(indexManager.getIndexesBackedByCfs(), Collections.singleton(this));
-    }
-
-    public Set<Memtable> getMemtablesPendingFlush()
-    {
-        return data.getMemtablesPendingFlush();
+        // we return the main CFS first; switchMemtable() relies on this ordering for simplicity
+        // when getting the latest replay position
+        return Iterables.concat(Collections.singleton(this), indexManager.getIndexesBackedByCfs());
     }
 
     public List<String> getBuiltIndexes()
@@ -2379,24 +2751,18 @@
      */
     public long oldestUnflushedMemtable()
     {
-        DataTracker.View view = data.getView();
-        long oldest = view.memtable.creationTime();
-        for (Memtable memtable : view.memtablesPendingFlush)
-            oldest = Math.min(oldest, memtable.creationTime());
-        return oldest;
+        return data.getView().getOldestMemtable().creationTime();
     }
 
     public boolean isEmpty()
     {
         DataTracker.View view = data.getView();
-        return view.sstables.isEmpty() && view.memtable.getOperations() == 0 && view.memtablesPendingFlush.isEmpty();
+        return view.sstables.isEmpty() && view.getCurrentMemtable().getOperations() == 0 && view.getCurrentMemtable() == view.getOldestMemtable();
     }
 
     private boolean isRowCacheEnabled()
     {
-        return !(metadata.getCaching() == Caching.NONE
-              || metadata.getCaching() == Caching.KEYS_ONLY
-              || CacheService.instance.rowCache.getCapacity() == 0);
+        return metadata.getCaching().rowCache.isEnabled() && CacheService.instance.rowCache.getCapacity() > 0;
     }
 
     /**
@@ -2433,6 +2799,11 @@
         return getDataTracker().getDroppableTombstoneRatio();
     }
 
+    public long trueSnapshotsSize()
+    {
+        return directories.trueSnapshotsSize();
+    }
+
     @VisibleForTesting
     void resetFileIndexGenerator()
     {
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
index 90c9f1f..6fa208c 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
@@ -35,7 +35,7 @@
      * Returns the total amount of data stored in the memtable, including
      * column related overhead.
      *
-     * @see org.apache.cassandra.metrics.ColumnFamilyMetrics#memtableDataSize
+     * @see org.apache.cassandra.metrics.ColumnFamilyMetrics#memtableOnHeapSize
      * @return The size in bytes.
      * @deprecated
      */
@@ -146,7 +146,7 @@
     public double getRecentWriteLatencyMicros();
 
     /**
-     * @see org.apache.cassandra.metrics.ColumnFamilyMetrics#pendingTasks
+     * @see org.apache.cassandra.metrics.ColumnFamilyMetrics#pendingFlushes
      * @return the estimated number of tasks pending for this column family
      */
     @Deprecated
@@ -344,4 +344,9 @@
      * @return ratio
      */
     public double getDroppableTombstoneRatio();
+
+    /**
+     * @return the size of SSTables in the "snapshots" subdirectory which are no longer live
+     */
+    public long trueSnapshotsSize();
 }
diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/db/ColumnIndex.java
index eda275d..d9d6a9c 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/db/ColumnIndex.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -25,7 +24,9 @@
 import com.google.common.annotations.VisibleForTesting;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.sstable.IndexHelper;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ColumnIndex
@@ -53,8 +54,6 @@
      */
     public static class Builder
     {
-        private static final OnDiskAtom.Serializer atomSerializer = Column.onDiskSerializer();
-
         private final ColumnIndex result;
         private final long indexOffset;
         private long startPosition = -1;
@@ -63,15 +62,17 @@
         private OnDiskAtom firstColumn;
         private OnDiskAtom lastColumn;
         private OnDiskAtom lastBlockClosing;
-        private final DataOutput output;
+        private final DataOutputPlus output;
         private final RangeTombstone.Tracker tombstoneTracker;
         private int atomCount;
         private final ByteBuffer key;
         private final DeletionInfo deletionInfo; // only used for serializing and calculating row header size
 
+        private final OnDiskAtom.Serializer atomSerializer;
+
         public Builder(ColumnFamily cf,
                        ByteBuffer key,
-                       DataOutput output)
+                       DataOutputPlus output)
         {
             assert cf != null;
             assert key != null;
@@ -83,6 +84,7 @@
             this.result = new ColumnIndex(new ArrayList<IndexHelper.IndexInfo>());
             this.output = output;
             this.tombstoneTracker = new RangeTombstone.Tracker(cf.getComparator());
+            this.atomSerializer = cf.getComparator().onDiskAtomSerializer();
         }
 
         /**
@@ -119,18 +121,25 @@
         public ColumnIndex build(ColumnFamily cf) throws IOException
         {
             // cf has disentangled the columns and range tombstones, we need to re-interleave them in comparator order
+            Comparator<Composite> comparator = cf.getComparator();
+            DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester();
             Iterator<RangeTombstone> rangeIter = cf.deletionInfo().rangeIterator();
             RangeTombstone tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
-            Comparator<ByteBuffer> comparator = cf.getComparator();
 
-            for (Column c : cf)
+            for (Cell c : cf)
             {
                 while (tombstone != null && comparator.compare(c.name(), tombstone.min) >= 0)
                 {
-                    add(tombstone);
+                    // skip range tombstones that are shadowed by partition tombstones
+                    if (!cf.deletionInfo().getTopLevelDeletion().isDeleted(tombstone))
+                        add(tombstone);
                     tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
                 }
-                add(c);
+
+                // We can skip any cell if it's shadowed by a tombstone already. This is a more
+                // general case than was handled by CASSANDRA-2589.
+                if (!tester.isDeleted(c))
+                    add(c);
             }
 
             while (tombstone != null)
@@ -154,7 +163,11 @@
         public ColumnIndex buildForCompaction(Iterator<OnDiskAtom> columns) throws IOException
         {
             while (columns.hasNext())
-                add(columns.next());
+            {
+                OnDiskAtom c = columns.next();
+                add(c);
+            }
+
             return build();
         }
 
@@ -172,7 +185,7 @@
                                // where we wouldn't make any progress because a block is filled by said marker
             }
 
-            long size = column.serializedSizeForSSTable();
+            long size = atomSerializer.serializedSizeForSSTable(column);
             endPosition += size;
             blockSize += size;
 
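The reworked ColumnIndex.Builder.build() above re-interleaves a partition's cells with its range tombstones in comparator order, while dropping anything already shadowed by a higher-level deletion. A minimal, self-contained sketch of that merge loop, using hypothetical simplified integer "names" and plain timestamps instead of Cassandra's Composite/DeletionInfo types, looks like this:

import java.util.*;

final class InterleaveSketch
{
    static final class Cell      { final int name; final long ts; Cell(int n, long t)      { name = n; ts = t; } }
    static final class Tombstone { final int min;  final long ts; Tombstone(int m, long t) { min = m;  ts = t; } }

    /** Merge sorted cells and sorted range tombstones, dropping anything the partition deletion shadows. */
    static List<Object> interleave(List<Cell> cells, List<Tombstone> tombstones, long partitionDeletionTs)
    {
        List<Object> out = new ArrayList<>();
        Iterator<Tombstone> rangeIter = tombstones.iterator();
        Tombstone tombstone = rangeIter.hasNext() ? rangeIter.next() : null;

        for (Cell c : cells)
        {
            // emit every tombstone whose range starts at or before this cell,
            // unless the partition-level deletion already covers it
            while (tombstone != null && c.name >= tombstone.min)
            {
                if (tombstone.ts > partitionDeletionTs)
                    out.add(tombstone);
                tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
            }
            // skip cells shadowed by the partition deletion (the real code also
            // consults an in-order tester that covers range tombstones)
            if (c.ts > partitionDeletionTs)
                out.add(c);
        }

        // flush any tombstones that sort past the last cell
        while (tombstone != null)
        {
            if (tombstone.ts > partitionDeletionTs)
                out.add(tombstone);
            tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
        }
        return out;
    }
}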
diff --git a/src/java/org/apache/cassandra/db/ColumnSerializer.java b/src/java/org/apache/cassandra/db/ColumnSerializer.java
index fb38b5f..8e7026c 100644
--- a/src/java/org/apache/cassandra/db/ColumnSerializer.java
+++ b/src/java/org/apache/cassandra/db/ColumnSerializer.java
@@ -18,16 +18,19 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public class ColumnSerializer implements ISerializer<Column>
+public class ColumnSerializer implements ISerializer<Cell>
 {
     public final static int DELETION_MASK        = 0x01;
     public final static int EXPIRATION_MASK      = 0x02;
@@ -51,24 +54,31 @@
         LOCAL, FROM_REMOTE, PRESERVE_SIZE;
     }
 
-    public void serialize(Column column, DataOutput out) throws IOException
+    private final CellNameType type;
+
+    public ColumnSerializer(CellNameType type)
     {
-        assert column.name().remaining() > 0;
-        ByteBufferUtil.writeWithShortLength(column.name(), out);
+        this.type = type;
+    }
+
+    public void serialize(Cell cell, DataOutputPlus out) throws IOException
+    {
+        assert !cell.name().isEmpty();
+        type.cellSerializer().serialize(cell.name(), out);
         try
         {
-            out.writeByte(column.serializationFlags());
-            if (column instanceof CounterColumn)
+            out.writeByte(cell.serializationFlags());
+            if (cell instanceof CounterCell)
             {
-                out.writeLong(((CounterColumn)column).timestampOfLastDelete());
+                out.writeLong(((CounterCell) cell).timestampOfLastDelete());
             }
-            else if (column instanceof ExpiringColumn)
+            else if (cell instanceof ExpiringCell)
             {
-                out.writeInt(((ExpiringColumn) column).getTimeToLive());
-                out.writeInt(column.getLocalDeletionTime());
+                out.writeInt(((ExpiringCell) cell).getTimeToLive());
+                out.writeInt(cell.getLocalDeletionTime());
             }
-            out.writeLong(column.timestamp());
-            ByteBufferUtil.writeWithLength(column.value(), out);
+            out.writeLong(cell.timestamp());
+            ByteBufferUtil.writeWithLength(cell.value(), out);
         }
         catch (IOException e)
         {
@@ -76,7 +86,7 @@
         }
     }
 
-    public Column deserialize(DataInput in) throws IOException
+    public Cell deserialize(DataInput in) throws IOException
     {
         return deserialize(in, Flag.LOCAL);
     }
@@ -86,29 +96,27 @@
      * deserialize comes from a remote host. If it does, then we must clear
      * the delta.
      */
-    public Column deserialize(DataInput in, ColumnSerializer.Flag flag) throws IOException
+    public Cell deserialize(DataInput in, ColumnSerializer.Flag flag) throws IOException
     {
         return deserialize(in, flag, Integer.MIN_VALUE);
     }
 
-    public Column deserialize(DataInput in, ColumnSerializer.Flag flag, int expireBefore) throws IOException
+    public Cell deserialize(DataInput in, ColumnSerializer.Flag flag, int expireBefore) throws IOException
     {
-        ByteBuffer name = ByteBufferUtil.readWithShortLength(in);
-        if (name.remaining() <= 0)
-            throw CorruptColumnException.create(in, name);
+        CellName name = type.cellSerializer().deserialize(in);
 
         int b = in.readUnsignedByte();
         return deserializeColumnBody(in, name, b, flag, expireBefore);
     }
 
-    Column deserializeColumnBody(DataInput in, ByteBuffer name, int mask, ColumnSerializer.Flag flag, int expireBefore) throws IOException
+    Cell deserializeColumnBody(DataInput in, CellName name, int mask, ColumnSerializer.Flag flag, int expireBefore) throws IOException
     {
         if ((mask & COUNTER_MASK) != 0)
         {
             long timestampOfLastDelete = in.readLong();
             long ts = in.readLong();
             ByteBuffer value = ByteBufferUtil.readWithLength(in);
-            return CounterColumn.create(name, value, ts, timestampOfLastDelete, flag);
+            return BufferCounterCell.create(name, value, ts, timestampOfLastDelete, flag);
         }
         else if ((mask & EXPIRATION_MASK) != 0)
         {
@@ -116,23 +124,36 @@
             int expiration = in.readInt();
             long ts = in.readLong();
             ByteBuffer value = ByteBufferUtil.readWithLength(in);
-            return ExpiringColumn.create(name, value, ts, ttl, expiration, expireBefore, flag);
+            return BufferExpiringCell.create(name, value, ts, ttl, expiration, expireBefore, flag);
         }
         else
         {
             long ts = in.readLong();
             ByteBuffer value = ByteBufferUtil.readWithLength(in);
             return (mask & COUNTER_UPDATE_MASK) != 0
-                   ? new CounterUpdateColumn(name, value, ts)
+                   ? new BufferCounterUpdateCell(name, value, ts)
                    : ((mask & DELETION_MASK) == 0
-                      ? new Column(name, value, ts)
-                      : new DeletedColumn(name, value, ts));
+                      ? new BufferCell(name, value, ts)
+                      : new BufferDeletedCell(name, value, ts));
         }
     }
 
-    public long serializedSize(Column column, TypeSizes type)
+    void skipColumnBody(DataInput in, int mask) throws IOException
     {
-        return column.serializedSize(type);
+        if ((mask & COUNTER_MASK) != 0)
+            FileUtils.skipBytesFully(in, 16);
+        else if ((mask & EXPIRATION_MASK) != 0)
+            FileUtils.skipBytesFully(in, 16);
+        else
+            FileUtils.skipBytesFully(in, 8);
+
+        int length = in.readInt();
+        FileUtils.skipBytesFully(in, length);
+    }
+
+    public long serializedSize(Cell cell, TypeSizes typeSizes)
+    {
+        return cell.serializedSize(type, typeSizes);
     }
 
     public static class CorruptColumnException extends IOException
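The serializer above writes each cell as its name, a flags byte, a flag-dependent fixed-size header (16 bytes for counter and expiring cells, 8 otherwise) and a length-prefixed value; the new skipColumnBody() exploits that layout to step over a cell without materializing it. A rough stand-alone sketch of that skip logic over a plain java.io.DataInput (flag constants copied from ColumnSerializer, the skip helper is an assumption) follows:

import java.io.DataInput;
import java.io.IOException;

final class SkipCellBodySketch
{
    static final int DELETION_MASK   = 0x01;
    static final int EXPIRATION_MASK = 0x02;
    static final int COUNTER_MASK    = 0x04;

    /** Skip a serialized cell body after the cell name and flags byte have been read. */
    static void skipBody(DataInput in, int mask) throws IOException
    {
        if ((mask & COUNTER_MASK) != 0)
            skipFully(in, 16);          // timestampOfLastDelete (8) + timestamp (8)
        else if ((mask & EXPIRATION_MASK) != 0)
            skipFully(in, 16);          // ttl (4) + localDeletionTime (4) + timestamp (8)
        else
            skipFully(in, 8);           // timestamp (8)

        int valueLength = in.readInt(); // value is length-prefixed
        skipFully(in, valueLength);
    }

    private static void skipFully(DataInput in, int bytes) throws IOException
    {
        while (bytes > 0)
        {
            int skipped = in.skipBytes(bytes);
            if (skipped <= 0)
                throw new IOException("unexpected end of input");
            bytes -= skipped;
        }
    }
}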
diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index 3ef3217..e322968 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
@@ -25,6 +25,7 @@
 import java.util.Map;
 
 import com.google.common.collect.Iterables;
+import net.nicoulaj.compilecommand.annotations.Inline;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -347,17 +348,10 @@
     public void validateCounterForWrite(CFMetaData metadata) throws InvalidRequestException
     {
         if (this == ConsistencyLevel.ANY)
-        {
             throw new InvalidRequestException("Consistency level ANY is not yet supported for counter columnfamily " + metadata.cfName);
-        }
-        else if (!metadata.getReplicateOnWrite() && !(this == ConsistencyLevel.ONE || this == ConsistencyLevel.LOCAL_ONE))
-        {
-            throw new InvalidRequestException("cannot achieve CL > CL.ONE without replicate_on_write on columnfamily " + metadata.cfName);
-        }
-        else if (isSerialConsistency())
-        {
+
+        if (isSerialConsistency())
             throw new InvalidRequestException("Counter operations are inherently non-serializable");
-        }
     }
 
     private void requireNetworkTopologyStrategy(String keyspaceName) throws InvalidRequestException
diff --git a/src/java/org/apache/cassandra/db/CounterCell.java b/src/java/org/apache/cassandra/db/CounterCell.java
new file mode 100644
index 0000000..cda1200
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/CounterCell.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+/**
+ * A column that represents a partitioned counter.
+ */
+public interface CounterCell extends Cell
+{
+    static final CounterContext contextManager = CounterContext.instance();
+
+    public long timestampOfLastDelete();
+
+    public long total();
+
+    public boolean hasLegacyShards();
+
+    public Cell markLocalToBeCleared();
+
+    CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
+
+    CounterCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
+}
diff --git a/src/java/org/apache/cassandra/db/CounterColumn.java b/src/java/org/apache/cassandra/db/CounterColumn.java
deleted file mode 100644
index 28a2ba5..0000000
--- a/src/java/org/apache/cassandra/db/CounterColumn.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.context.IContext.ContextRelationship;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.Allocator;
-import org.apache.cassandra.utils.*;
-
-/**
- * A column that represents a partitioned counter.
- */
-public class CounterColumn extends Column
-{
-    private static final Logger logger = LoggerFactory.getLogger(CounterColumn.class);
-
-    protected static final CounterContext contextManager = CounterContext.instance();
-
-    private final long timestampOfLastDelete;
-
-    public CounterColumn(ByteBuffer name, long value, long timestamp)
-    {
-        this(name, contextManager.createLocal(value, HeapAllocator.instance), timestamp);
-    }
-
-    public CounterColumn(ByteBuffer name, long value, long timestamp, long timestampOfLastDelete)
-    {
-        this(name, contextManager.createLocal(value, HeapAllocator.instance), timestamp, timestampOfLastDelete);
-    }
-
-    public CounterColumn(ByteBuffer name, ByteBuffer value, long timestamp)
-    {
-        this(name, value, timestamp, Long.MIN_VALUE);
-    }
-
-    public CounterColumn(ByteBuffer name, ByteBuffer value, long timestamp, long timestampOfLastDelete)
-    {
-        super(name, value, timestamp);
-        this.timestampOfLastDelete = timestampOfLastDelete;
-    }
-
-    public static CounterColumn create(ByteBuffer name, ByteBuffer value, long timestamp, long timestampOfLastDelete, ColumnSerializer.Flag flag)
-    {
-        if (flag == ColumnSerializer.Flag.FROM_REMOTE || (flag == ColumnSerializer.Flag.LOCAL && contextManager.shouldClearLocal(value)))
-            value = contextManager.clearAllLocal(value);
-        return new CounterColumn(name, value, timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public Column withUpdatedName(ByteBuffer newName)
-    {
-        return new CounterColumn(newName, value, timestamp, timestampOfLastDelete);
-    }
-
-    public long timestampOfLastDelete()
-    {
-        return timestampOfLastDelete;
-    }
-
-    public long total()
-    {
-        return contextManager.total(value);
-    }
-
-    @Override
-    public int dataSize()
-    {
-        /*
-         * A counter column adds to a Column :
-         *  + 8 bytes for timestampOfLastDelete
-         */
-        return super.dataSize() + TypeSizes.NATIVE.sizeof(timestampOfLastDelete);
-    }
-
-    @Override
-    public int serializedSize(TypeSizes typeSizes)
-    {
-        return super.serializedSize(typeSizes) + typeSizes.sizeof(timestampOfLastDelete);
-    }
-
-    @Override
-    public Column diff(Column column)
-    {
-        assert (column instanceof CounterColumn) || (column instanceof DeletedColumn) : "Wrong class type: " + column.getClass();
-
-        if (timestamp() < column.timestamp())
-            return column;
-
-        // Note that if at that point, column can't be a tombstone. Indeed,
-        // column is the result of merging us with other nodes results, and
-        // merging a CounterColumn with a tombstone never return a tombstone
-        // unless that tombstone timestamp is greater that the CounterColumn
-        // one.
-        assert !(column instanceof DeletedColumn) : "Wrong class type: " + column.getClass();
-
-        if (timestampOfLastDelete() < ((CounterColumn)column).timestampOfLastDelete())
-            return column;
-        ContextRelationship rel = contextManager.diff(column.value(), value());
-        if (ContextRelationship.GREATER_THAN == rel || ContextRelationship.DISJOINT == rel)
-            return column;
-        return null;
-    }
-
-    /*
-     * We have to special case digest creation for counter column because
-     * we don't want to include the information about which shard of the
-     * context is a delta or not, since this information differs from node to
-     * node.
-     */
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name.duplicate());
-        // We don't take the deltas into account in a digest
-        contextManager.updateDigest(digest, value);
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        try
-        {
-            buffer.writeLong(timestamp);
-            buffer.writeByte(serializationFlags());
-            buffer.writeLong(timestampOfLastDelete);
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        digest.update(buffer.getData(), 0, buffer.getLength());
-    }
-
-    @Override
-    public Column reconcile(Column column, Allocator allocator)
-    {
-        assert (column instanceof CounterColumn) || (column instanceof DeletedColumn) : "Wrong class type: " + column.getClass();
-
-        // live + tombstone: track last tombstone
-        if (column.isMarkedForDelete(Long.MIN_VALUE)) // cannot be an expired column, so the current time is irrelevant
-        {
-            // live < tombstone
-            if (timestamp() < column.timestamp())
-            {
-                return column;
-            }
-            // live last delete >= tombstone
-            if (timestampOfLastDelete() >= column.timestamp())
-            {
-                return this;
-            }
-            // live last delete < tombstone
-            return new CounterColumn(name(), value(), timestamp(), column.timestamp());
-        }
-        // live < live last delete
-        if (timestamp() < ((CounterColumn)column).timestampOfLastDelete())
-            return column;
-        // live last delete > live
-        if (timestampOfLastDelete() > column.timestamp())
-            return this;
-        // live + live: merge clocks; update value
-        return new CounterColumn(
-            name(),
-            contextManager.merge(value(), column.value(), allocator),
-            Math.max(timestamp(), column.timestamp()),
-            Math.max(timestampOfLastDelete(), ((CounterColumn)column).timestampOfLastDelete()));
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        // super.equals() returns false if o is not a CounterColumn
-        return super.equals(o) && timestampOfLastDelete == ((CounterColumn)o).timestampOfLastDelete;
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int result = super.hashCode();
-        result = 31 * result + (int)(timestampOfLastDelete ^ (timestampOfLastDelete >>> 32));
-        return result;
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs)
-    {
-        return new CounterColumn(cfs.internOrCopy(name, HeapAllocator.instance), ByteBufferUtil.clone(value), timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs, Allocator allocator)
-    {
-        return new CounterColumn(cfs.internOrCopy(name, allocator), allocator.clone(value), timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public String getString(AbstractType<?> comparator)
-    {
-        StringBuilder sb = new StringBuilder();
-        sb.append(comparator.getString(name));
-        sb.append(":");
-        sb.append(false);
-        sb.append(":");
-        sb.append(contextManager.toString(value));
-        sb.append("@");
-        sb.append(timestamp());
-        sb.append("!");
-        sb.append(timestampOfLastDelete);
-        return sb.toString();
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.COUNTER_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        // We cannot use the value validator as for other columns as the CounterColumnType validate a long,
-        // which is not the internal representation of counters
-        contextManager.validateContext(value());
-    }
-
-    /**
-     * Check if a given counterId is found in this CounterColumn context.
-     */
-    public boolean hasCounterId(CounterId id)
-    {
-        return contextManager.hasCounterId(value(), id);
-    }
-
-    public Column markLocalToBeCleared()
-    {
-        ByteBuffer marked = contextManager.markLocalToBeCleared(value);
-        return marked == value ? this : new CounterColumn(name, marked, timestamp, timestampOfLastDelete);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/CounterMutation.java b/src/java/org/apache/cassandra/db/CounterMutation.java
index 3caeda5..58717b4 100644
--- a/src/java/org/apache/cassandra/db/CounterMutation.java
+++ b/src/java/org/apache/cassandra/db/CounterMutation.java
@@ -18,62 +18,68 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.UUID;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
 
+import com.google.common.base.Function;
+import com.google.common.base.Objects;
 import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.Striped;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.*;
 
 public class CounterMutation implements IMutation
 {
     public static final CounterMutationSerializer serializer = new CounterMutationSerializer();
 
-    private final RowMutation rowMutation;
+    private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(DatabaseDescriptor.getConcurrentCounterWriters() * 1024);
+
+    private final Mutation mutation;
     private final ConsistencyLevel consistency;
 
-    public CounterMutation(RowMutation rowMutation, ConsistencyLevel consistency)
+    public CounterMutation(Mutation mutation, ConsistencyLevel consistency)
     {
-        this.rowMutation = rowMutation;
+        this.mutation = mutation;
         this.consistency = consistency;
     }
 
     public String getKeyspaceName()
     {
-        return rowMutation.getKeyspaceName();
+        return mutation.getKeyspaceName();
     }
 
     public Collection<UUID> getColumnFamilyIds()
     {
-        return rowMutation.getColumnFamilyIds();
+        return mutation.getColumnFamilyIds();
     }
 
     public Collection<ColumnFamily> getColumnFamilies()
     {
-        return rowMutation.getColumnFamilies();
+        return mutation.getColumnFamilies();
+    }
+
+    public Mutation getMutation()
+    {
+        return mutation;
     }
 
     public ByteBuffer key()
     {
-        return rowMutation.key();
-    }
-
-    public RowMutation rowMutation()
-    {
-        return rowMutation;
+        return mutation.key();
     }
 
     public ConsistencyLevel consistency()
@@ -81,79 +87,219 @@
         return consistency;
     }
 
-    public RowMutation makeReplicationMutation()
-    {
-        List<ReadCommand> readCommands = new LinkedList<ReadCommand>();
-        long timestamp = System.currentTimeMillis();
-        for (ColumnFamily columnFamily : rowMutation.getColumnFamilies())
-        {
-            if (!columnFamily.metadata().getReplicateOnWrite())
-                continue;
-            addReadCommandFromColumnFamily(rowMutation.getKeyspaceName(), rowMutation.key(), columnFamily, timestamp, readCommands);
-        }
-
-        // create a replication RowMutation
-        RowMutation replicationMutation = new RowMutation(rowMutation.getKeyspaceName(), rowMutation.key());
-        for (ReadCommand readCommand : readCommands)
-        {
-            Keyspace keyspace = Keyspace.open(readCommand.ksName);
-            Row row = readCommand.getRow(keyspace);
-            if (row == null || row.cf == null)
-                continue;
-
-            ColumnFamily cf = row.cf;
-            replicationMutation.add(cf);
-        }
-
-        return replicationMutation.isEmpty() ? null : replicationMutation;
-    }
-
-    private void addReadCommandFromColumnFamily(String keyspaceName, ByteBuffer key, ColumnFamily columnFamily, long timestamp, List<ReadCommand> commands)
-    {
-        SortedSet<ByteBuffer> s = new TreeSet<ByteBuffer>(columnFamily.metadata().comparator);
-        Iterables.addAll(s, columnFamily.getColumnNames());
-        commands.add(new SliceByNamesReadCommand(keyspaceName, key, columnFamily.metadata().cfName, timestamp, new NamesQueryFilter(s)));
-    }
-
     public MessageOut<CounterMutation> makeMutationMessage()
     {
-        return new MessageOut<CounterMutation>(MessagingService.Verb.COUNTER_MUTATION, this, serializer);
+        return new MessageOut<>(MessagingService.Verb.COUNTER_MUTATION, this, serializer);
     }
 
-    public boolean shouldReplicateOnWrite()
+    /**
+     * Applies the counter mutation and returns the resulting Mutation (for replication to other nodes).
+     *
+     * 1. Grabs the striped cell-level locks in the proper order
+     * 2. Gets the current values of the counters-to-be-modified from the counter cache
+     * 3. Reads the rest of the current values (cache misses) from the CF
+     * 4. Writes the updated counter values
+     * 5. Updates the counter cache
+     * 6. Releases the lock(s)
+     *
+     * See CASSANDRA-4775 and CASSANDRA-6504 for further details.
+     *
+     * @return the applied resulting Mutation
+     */
+    public Mutation apply() throws WriteTimeoutException
     {
-        for (ColumnFamily cf : rowMutation.getColumnFamilies())
-            if (cf.metadata().getReplicateOnWrite())
-                return true;
-        return false;
-    }
+        Mutation result = new Mutation(getKeyspaceName(), key());
+        Keyspace keyspace = Keyspace.open(getKeyspaceName());
 
-    public void apply()
-    {
-        // transform all CounterUpdateColumn to CounterColumn: accomplished by localCopy
-        RowMutation rm = new RowMutation(rowMutation.getKeyspaceName(), ByteBufferUtil.clone(rowMutation.key()));
-        Keyspace keyspace = Keyspace.open(rm.getKeyspaceName());
+        int count = 0;
+        for (ColumnFamily cf : getColumnFamilies())
+            count += cf.getColumnCount();
 
-        for (ColumnFamily cf_ : rowMutation.getColumnFamilies())
+        List<Lock> locks = new ArrayList<>(count);
+        Tracing.trace("Acquiring {} counter locks", count);
+        try
         {
-            ColumnFamily cf = cf_.cloneMeShallow();
-            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cf.id());
-            for (Column column : cf_)
-            {
-                cf.addColumn(column.localCopy(cfs), HeapAllocator.instance);
-            }
-            rm.add(cf);
+            grabCounterLocks(keyspace, locks);
+            for (ColumnFamily cf : getColumnFamilies())
+                result.add(processModifications(cf));
+            result.apply();
+            updateCounterCache(result, keyspace);
+            return result;
         }
-        rm.apply();
+        finally
+        {
+            for (Lock lock : locks)
+                lock.unlock();
+        }
+    }
+
+    private void grabCounterLocks(Keyspace keyspace, List<Lock> locks) throws WriteTimeoutException
+    {
+        long startTime = System.nanoTime();
+
+        for (Lock lock : LOCKS.bulkGet(getCounterLockKeys()))
+        {
+            long timeout = TimeUnit.MILLISECONDS.toNanos(getTimeout()) - (System.nanoTime() - startTime);
+            try
+            {
+                if (!lock.tryLock(timeout, TimeUnit.NANOSECONDS))
+                    throw new WriteTimeoutException(WriteType.COUNTER, consistency(), 0, consistency().blockFor(keyspace));
+                locks.add(lock);
+            }
+            catch (InterruptedException e)
+            {
+                throw new WriteTimeoutException(WriteType.COUNTER, consistency(), 0, consistency().blockFor(keyspace));
+            }
+        }
+    }
+
+    /**
+     * Returns a wrapper for the Striped#bulkGet() call (via Keyspace#counterLocksFor()).
+     * Striped#bulkGet() depends on Object#hashCode(), so here we make sure that the cf id and the partition key
+     * also get to be part of the hashCode() calculation, not just the cell name.
+     */
+    private Iterable<Object> getCounterLockKeys()
+    {
+        return Iterables.concat(Iterables.transform(getColumnFamilies(), new Function<ColumnFamily, Iterable<Object>>()
+        {
+            public Iterable<Object> apply(final ColumnFamily cf)
+            {
+                return Iterables.transform(cf, new Function<Cell, Object>()
+                {
+                    public Object apply(Cell cell)
+                    {
+                        return Objects.hashCode(cf.id(), key(), cell.name());
+                    }
+                });
+            }
+        }));
+    }
+
+    // Replaces all the CounterUpdateCell-s with updated regular CounterCell-s
+    private ColumnFamily processModifications(ColumnFamily changesCF)
+    {
+        ColumnFamilyStore cfs = Keyspace.open(getKeyspaceName()).getColumnFamilyStore(changesCF.id());
+
+        ColumnFamily resultCF = changesCF.cloneMeShallow();
+
+        List<CounterUpdateCell> counterUpdateCells = new ArrayList<>(changesCF.getColumnCount());
+        for (Cell cell : changesCF)
+        {
+            if (cell instanceof CounterUpdateCell)
+                counterUpdateCells.add((CounterUpdateCell)cell);
+            else
+                resultCF.addColumn(cell);
+        }
+
+        if (counterUpdateCells.isEmpty())
+            return resultCF; // only DELETEs
+
+        ClockAndCount[] currentValues = getCurrentValues(counterUpdateCells, cfs);
+        for (int i = 0; i < counterUpdateCells.size(); i++)
+        {
+            ClockAndCount currentValue = currentValues[i];
+            CounterUpdateCell update = counterUpdateCells.get(i);
+
+            long clock = currentValue.clock + 1L;
+            long count = currentValue.count + update.delta();
+
+            resultCF.addColumn(new BufferCounterCell(update.name(),
+                                                     CounterContext.instance().createGlobal(CounterId.getLocalId(), clock, count),
+                                                     update.timestamp()));
+        }
+
+        return resultCF;
+    }
+
+    // Attempt to load the current value(s) from cache. If that fails, read the rest from the cfs.
+    private ClockAndCount[] getCurrentValues(List<CounterUpdateCell> counterUpdateCells, ColumnFamilyStore cfs)
+    {
+        ClockAndCount[] currentValues = new ClockAndCount[counterUpdateCells.size()];
+        int remaining = counterUpdateCells.size();
+
+        if (CacheService.instance.counterCache.getCapacity() != 0)
+        {
+            Tracing.trace("Fetching {} counter values from cache", counterUpdateCells.size());
+            remaining = getCurrentValuesFromCache(counterUpdateCells, cfs, currentValues);
+            if (remaining == 0)
+                return currentValues;
+        }
+
+        Tracing.trace("Reading {} counter values from the CF", remaining);
+        getCurrentValuesFromCFS(counterUpdateCells, cfs, currentValues);
+
+        return currentValues;
+    }
+
+    // Returns the count of cache misses.
+    private int getCurrentValuesFromCache(List<CounterUpdateCell> counterUpdateCells,
+                                          ColumnFamilyStore cfs,
+                                          ClockAndCount[] currentValues)
+    {
+        int cacheMisses = 0;
+        for (int i = 0; i < counterUpdateCells.size(); i++)
+        {
+            ClockAndCount cached = cfs.getCachedCounter(key(), counterUpdateCells.get(i).name());
+            if (cached != null)
+                currentValues[i] = cached;
+            else
+                cacheMisses++;
+        }
+        return cacheMisses;
+    }
+
+    // Reads the missing current values from the CFS.
+    private void getCurrentValuesFromCFS(List<CounterUpdateCell> counterUpdateCells,
+                                         ColumnFamilyStore cfs,
+                                         ClockAndCount[] currentValues)
+    {
+        SortedSet<CellName> names = new TreeSet<>(cfs.metadata.comparator);
+        for (int i = 0; i < currentValues.length; i++)
+            if (currentValues[i] == null)
+                names.add(counterUpdateCells.get(i).name());
+
+        ReadCommand cmd = new SliceByNamesReadCommand(getKeyspaceName(), key(), cfs.metadata.cfName, Long.MIN_VALUE, new NamesQueryFilter(names));
+        Row row = cmd.getRow(cfs.keyspace);
+        ColumnFamily cf = row == null ? null : row.cf;
+
+        for (int i = 0; i < currentValues.length; i++)
+        {
+            if (currentValues[i] != null)
+                continue;
+
+            Cell cell = cf == null ? null : cf.getColumn(counterUpdateCells.get(i).name());
+            if (cell == null || !cell.isLive()) // absent or a tombstone.
+                currentValues[i] = ClockAndCount.BLANK;
+            else
+                currentValues[i] = CounterContext.instance().getLocalClockAndCount(cell.value());
+        }
+    }
+
+    private void updateCounterCache(Mutation applied, Keyspace keyspace)
+    {
+        if (CacheService.instance.counterCache.getCapacity() == 0)
+            return;
+
+        for (ColumnFamily cf : applied.getColumnFamilies())
+        {
+            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cf.id());
+            for (Cell cell : cf)
+                if (cell instanceof CounterCell)
+                    cfs.putCachedCounter(key(), cell.name(), CounterContext.instance().getLocalClockAndCount(cell.value()));
+        }
     }
 
     public void addAll(IMutation m)
     {
         if (!(m instanceof CounterMutation))
             throw new IllegalArgumentException();
-
         CounterMutation cm = (CounterMutation)m;
-        rowMutation.addAll(cm.rowMutation);
+        mutation.addAll(cm.mutation);
+    }
+
+    public long getTimeout()
+    {
+        return DatabaseDescriptor.getCounterWriteRpcTimeout();
     }
 
     @Override
@@ -164,31 +310,28 @@
 
     public String toString(boolean shallow)
     {
-        StringBuilder buff = new StringBuilder("CounterMutation(");
-        buff.append(rowMutation.toString(shallow));
-        buff.append(", ").append(consistency.toString());
-        return buff.append(")").toString();
-    }
-}
-
-class CounterMutationSerializer implements IVersionedSerializer<CounterMutation>
-{
-    public void serialize(CounterMutation cm, DataOutput out, int version) throws IOException
-    {
-        RowMutation.serializer.serialize(cm.rowMutation(), out, version);
-        out.writeUTF(cm.consistency().name());
+        return String.format("CounterMutation(%s, %s)", mutation.toString(shallow), consistency);
     }
 
-    public CounterMutation deserialize(DataInput in, int version) throws IOException
+    public static class CounterMutationSerializer implements IVersionedSerializer<CounterMutation>
     {
-        RowMutation rm = RowMutation.serializer.deserialize(in, version);
-        ConsistencyLevel consistency = Enum.valueOf(ConsistencyLevel.class, in.readUTF());
-        return new CounterMutation(rm, consistency);
-    }
+        public void serialize(CounterMutation cm, DataOutputPlus out, int version) throws IOException
+        {
+            Mutation.serializer.serialize(cm.mutation, out, version);
+            out.writeUTF(cm.consistency.name());
+        }
 
-    public long serializedSize(CounterMutation cm, int version)
-    {
-        return RowMutation.serializer.serializedSize(cm.rowMutation(), version)
-             + TypeSizes.NATIVE.sizeof(cm.consistency().name());
+        public CounterMutation deserialize(DataInput in, int version) throws IOException
+        {
+            Mutation m = Mutation.serializer.deserialize(in, version);
+            ConsistencyLevel consistency = Enum.valueOf(ConsistencyLevel.class, in.readUTF());
+            return new CounterMutation(m, consistency);
+        }
+
+        public long serializedSize(CounterMutation cm, int version)
+        {
+            return Mutation.serializer.serializedSize(cm.mutation, version)
+                 + TypeSizes.NATIVE.sizeof(cm.consistency.name());
+        }
     }
 }
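The rewritten counter write path above serializes concurrent updates to the same counter cells: apply() takes striped, cell-level locks under a single timeout budget, reads the current clock-and-count (counter cache first, then the column family), bumps the clock by one and adds the delta, applies the resulting Mutation, and refreshes the cache. The lock-acquisition step is the least obvious part; a small stand-alone sketch of that pattern with Guava's Striped locks (class and method names, and the TimeoutException choice, are illustrative rather than the Cassandra code) might look like:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.Lock;

import com.google.common.util.concurrent.Striped;

final class StripedLockSketch
{
    // lazyWeakLock() creates stripes on demand and lets unused ones be garbage collected
    private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(1024);

    /** Acquire one lock per key, all of them within a single timeout budget. */
    static List<Lock> acquire(Iterable<Object> keys, long timeoutMillis) throws InterruptedException, TimeoutException
    {
        long start = System.nanoTime();
        List<Lock> held = new ArrayList<>();
        // bulkGet() hands back the stripes in a consistent order, so concurrent
        // writers acquiring overlapping key sets cannot deadlock on each other
        for (Lock lock : LOCKS.bulkGet(keys))
        {
            long remaining = TimeUnit.MILLISECONDS.toNanos(timeoutMillis) - (System.nanoTime() - start);
            if (!lock.tryLock(remaining, TimeUnit.NANOSECONDS))
            {
                release(held);          // give back whatever we already hold
                throw new TimeoutException("timed out acquiring counter locks");
            }
            held.add(lock);
        }
        return held;
    }

    static void release(List<Lock> held)
    {
        for (Lock lock : held)
            lock.unlock();
    }
}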
diff --git a/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java b/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java
index c1cc95f..d65fbd7 100644
--- a/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java
@@ -37,8 +37,7 @@
         try
         {
             final CounterMutation cm = message.payload;
-            if (logger.isDebugEnabled())
-              logger.debug("Applying forwarded " + cm);
+            logger.debug("Applying forwarded {}", cm);
 
             String localDataCenter = DatabaseDescriptor.getEndpointSnitch().getDatacenter(FBUtilities.getBroadcastAddress());
             // We should not wait for the result of the write in this thread,
@@ -48,11 +47,11 @@
             // will not be called if the request timeout, but this is ok
             // because the coordinator of the counter mutation will timeout on
             // it's own in that case.
-            StorageProxy.applyCounterMutationOnLeader(cm, localDataCenter, new Runnable(){
+            StorageProxy.applyCounterMutationOnLeader(cm, localDataCenter, new Runnable()
+            {
                 public void run()
                 {
-                    WriteResponse response = new WriteResponse();
-                    MessagingService.instance().sendReply(response.createMessage(), id, message.from);
+                    MessagingService.instance().sendReply(new WriteResponse().createMessage(), id, message.from);
                 }
             });
         }
diff --git a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java b/src/java/org/apache/cassandra/db/CounterUpdateCell.java
similarity index 65%
copy from src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
copy to src/java/org/apache/cassandra/db/CounterUpdateCell.java
index afd268d..58ac365 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
+++ b/src/java/org/apache/cassandra/db/CounterUpdateCell.java
@@ -15,15 +15,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.db;
 
-import java.io.IOException;
-
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.db.OnDiskAtom;
-
-public abstract class SimpleAbstractColumnIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
+/**
+ * A counter update that has not yet been applied by the leader replica.
+ *
+ * Contains a single counter update. When applied by the leader replica, this
+ * is transformed into the relevant CounterCell. This Cell is a temporary data
+ * structure that should never be stored inside a memtable or an sstable.
+ */
+public interface CounterUpdateCell extends Cell
 {
-    public void close() throws IOException {}
+    public long delta();
 }
diff --git a/src/java/org/apache/cassandra/db/CounterUpdateColumn.java b/src/java/org/apache/cassandra/db/CounterUpdateColumn.java
deleted file mode 100644
index df90625..0000000
--- a/src/java/org/apache/cassandra/db/CounterUpdateColumn.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.utils.Allocator;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
-
-/**
- * A counter update while it hasn't been applied yet by the leader replica.
- *
- * Contains a single counter update. When applied by the leader replica, this
- * is transformed to a relevant CounterColumn. This Column is a temporary data
- * structure that should never be stored inside a memtable or an sstable.
- */
-public class CounterUpdateColumn extends Column
-{
-    public CounterUpdateColumn(ByteBuffer name, long value, long timestamp)
-    {
-        this(name, ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    public CounterUpdateColumn(ByteBuffer name, ByteBuffer value, long timestamp)
-    {
-        super(name, value, timestamp);
-    }
-
-    public long delta()
-    {
-        return value().getLong(value().position());
-    }
-
-    @Override
-    public Column diff(Column column)
-    {
-        // Diff is used during reads, but we should never read those columns
-        throw new UnsupportedOperationException("This operation is unsupported on CounterUpdateColumn.");
-    }
-
-    @Override
-    public CounterUpdateColumn withUpdatedName(ByteBuffer newName)
-    {
-        return new CounterUpdateColumn(newName, value, timestamp);
-    }
-
-    @Override
-    public Column reconcile(Column column, Allocator allocator)
-    {
-        // The only time this could happen is if a batchAdd ships two
-        // increment for the same column. Hence we simply sums the delta.
-
-        assert (column instanceof CounterUpdateColumn) || (column instanceof DeletedColumn) : "Wrong class type.";
-
-        // tombstones take precedence
-        if (column.isMarkedForDelete(Long.MIN_VALUE)) // can't be an expired column, so the current time is irrelevant
-            return timestamp() > column.timestamp() ? this : column;
-
-        // neither is tombstoned
-        CounterUpdateColumn c = (CounterUpdateColumn)column;
-        return new CounterUpdateColumn(name(), delta() + c.delta(), Math.max(timestamp(), c.timestamp()));
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.COUNTER_UPDATE_MASK;
-    }
-
-    @Override
-    public CounterColumn localCopy(ColumnFamilyStore cfs)
-    {
-        return new CounterColumn(cfs.internOrCopy(name, HeapAllocator.instance),
-                                 CounterContext.instance().createLocal(delta(), HeapAllocator.instance),
-                                 timestamp(),
-                                 Long.MIN_VALUE);
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs, Allocator allocator)
-    {
-        return new CounterColumn(cfs.internOrCopy(name, allocator),
-                                 CounterContext.instance().createLocal(delta(), allocator),
-                                 timestamp(),
-                                 Long.MIN_VALUE);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/DataRange.java b/src/java/org/apache/cassandra/db/DataRange.java
index b8e0bf5..31a9370 100644
--- a/src/java/org/apache/cassandra/db/DataRange.java
+++ b/src/java/org/apache/cassandra/db/DataRange.java
@@ -23,8 +23,9 @@
 import java.util.List;
 
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.*;
 
 /**
@@ -57,8 +58,8 @@
     public static boolean isFullRowSlice(SliceQueryFilter filter)
     {
         return filter.slices.length == 1
-            && filter.start().remaining() == 0
-            && filter.finish().remaining() == 0
+            && filter.start().isEmpty()
+            && filter.finish().isEmpty()
             && filter.count == Integer.MAX_VALUE;
     }
 
@@ -124,11 +125,11 @@
     public static class Paging extends DataRange
     {
         private final SliceQueryFilter sliceFilter;
-        private final Comparator<ByteBuffer> comparator;
-        private final ByteBuffer columnStart;
-        private final ByteBuffer columnFinish;
+        private final Comparator<Composite> comparator;
+        private final Composite columnStart;
+        private final Composite columnFinish;
 
-        private Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, ByteBuffer columnStart, ByteBuffer columnFinish, Comparator<ByteBuffer> comparator)
+        private Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, Composite columnStart, Composite columnFinish, Comparator<Composite> comparator)
         {
             super(range, filter);
 
@@ -142,9 +143,9 @@
             this.columnFinish = columnFinish;
         }
 
-        public Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, ByteBuffer columnStart, ByteBuffer columnFinish, AbstractType<?> comparator)
+        public Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, Composite columnStart, Composite columnFinish, CellNameType comparator)
         {
-            this(range, filter, columnStart, columnFinish, filter.isReversed() ? comparator.reverseComparator : comparator);
+            this(range, filter, columnStart, columnFinish, filter.isReversed() ? comparator.reverseComparator() : comparator);
         }
 
         @Override
@@ -162,7 +163,7 @@
 
         private boolean equals(RowPosition pos, ByteBuffer rowKey)
         {
-            return pos instanceof DecoratedKey && ((DecoratedKey)pos).key.equals(rowKey);
+            return pos instanceof DecoratedKey && ((DecoratedKey)pos).getKey().equals(rowKey);
         }
 
         @Override
@@ -184,11 +185,10 @@
         private ColumnSlice[] slicesForKey(ByteBuffer key)
         {
             // We don't call that until it's necessary, so assume we have to do some hard work
-            // Also note that columnStart and columnFinish, when used, only "restrict" the filter slices,
             // it doesn't expand on them. As such, we can ignore the case where they are empty, and we do,
             // as it screws up the logic below (see #6592)
-            ByteBuffer newStart = equals(startKey(), key) && columnStart.hasRemaining() ? columnStart : null;
-            ByteBuffer newFinish = equals(stopKey(), key) && columnFinish.hasRemaining() ? columnFinish : null;
+            Composite newStart = equals(startKey(), key) && !columnStart.isEmpty() ? columnStart : null;
+            Composite newFinish = equals(stopKey(), key) && !columnFinish.isEmpty() ? columnFinish : null;
 
             List<ColumnSlice> newSlices = new ArrayList<ColumnSlice>(sliceFilter.slices.length); // in the common case, we'll have the same number of slices
 
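In the Paging range above, columnStart and columnFinish only ever narrow the configured slices on the first and last partition of the range; they never widen them, and empty bounds are ignored. A toy illustration of that restriction step (integer column positions standing in for Composites; not the actual slicesForKey() implementation) could be:

import java.util.ArrayList;
import java.util.List;

final class PagingSliceSketch
{
    static final class Slice { final int start, finish; Slice(int s, int f) { start = s; finish = f; } }

    /** Narrow each slice by the paging bounds; null stands in for an empty (ignored) bound. */
    static List<Slice> restrict(List<Slice> slices, Integer pagingStart, Integer pagingFinish)
    {
        List<Slice> restricted = new ArrayList<>(slices.size());
        for (Slice s : slices)
        {
            int start  = pagingStart  == null ? s.start  : Math.max(s.start,  pagingStart);
            int finish = pagingFinish == null ? s.finish : Math.min(s.finish, pagingFinish);
            if (start <= finish)        // drop slices emptied by the restriction
                restricted.add(new Slice(start, finish));
        }
        return restricted;
    }
}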
diff --git a/src/java/org/apache/cassandra/db/DataTracker.java b/src/java/org/apache/cassandra/db/DataTracker.java
index f6d2c75..2ff040c 100644
--- a/src/java/org/apache/cassandra/db/DataTracker.java
+++ b/src/java/org/apache/cassandra/db/DataTracker.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.notifications.*;
 import org.apache.cassandra.utils.Interval;
 import org.apache.cassandra.utils.IntervalTree;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public class DataTracker
 {
@@ -52,23 +53,22 @@
         this.init();
     }
 
-    public Memtable getMemtable()
+    // get the Memtable that the ordered writeOp should be directed to
+    public Memtable getMemtableFor(OpOrder.Group opGroup)
     {
-        return view.get().memtable;
-    }
+        // since any new memtables appended to the list after we fetch it will be for operations started
+        // after us, we can safely assume that we will always find the memtable that 'accepts' us;
+        // if the barrier for any memtable is set whilst we are reading the list, it must accept us.
 
-    public Set<Memtable> getMemtablesPendingFlush()
-    {
-        return view.get().memtablesPendingFlush;
-    }
-
-    /**
-     * @return the active memtable and all the memtables that are pending flush.
-     */
-    public Iterable<Memtable> getAllMemtables()
-    {
-        View snapshot = view.get();
-        return Iterables.concat(snapshot.memtablesPendingFlush, Collections.singleton(snapshot.memtable));
+        // there may be multiple memtables in the list that would 'accept' us; however, we only ever choose
+        // the oldest such memtable, as accepts() only prevents us falling behind (i.e. ensures we don't
+        // assign operations to a memtable that was retired/queued before we started)
+        for (Memtable memtable : view.get().liveMemtables)
+        {
+            if (memtable.accepts(opGroup))
+                return memtable;
+        }
+        throw new AssertionError(view.get().liveMemtables.toString());
     }
 
     public Set<SSTableReader> getSSTables()
@@ -99,48 +99,41 @@
     }
 
     /**
-     * Switch the current memtable.
-     * This atomically adds the current memtable to the memtables pending
-     * flush and replace it with a fresh memtable.
+     * Switch the current memtable. This atomically appends a new memtable to the end of the list of active memtables,
+     * returning the previously last memtable. It leaves the previous Memtable in the list of live memtables until
+     * discarding(memtable) is called. These two methods must be synchronized/paired, i.e. m = switchMemtable
+     * must be followed by discarding(m); they cannot be interleaved.
      *
-     * @return the previous current memtable (the one added to the pending
-     * flush)
+     * @return the previously active memtable
      */
-    public Memtable switchMemtable()
+    public Memtable switchMemtable(boolean truncating)
     {
-        // atomically change the current memtable
         Memtable newMemtable = new Memtable(cfstore);
         Memtable toFlushMemtable;
         View currentView, newView;
         do
         {
             currentView = view.get();
-            toFlushMemtable = currentView.memtable;
+            toFlushMemtable = currentView.getCurrentMemtable();
             newView = currentView.switchMemtable(newMemtable);
         }
         while (!view.compareAndSet(currentView, newView));
 
+        if (truncating)
+            notifyRenewed(newMemtable);
+
         return toFlushMemtable;
     }
 
-    /**
-     * Renew the current memtable without putting the old one for a flush.
-     * Used when we flush but a memtable is clean (in which case we must
-     * change it because it was frozen).
-     */
-    public void renewMemtable()
+    public void markFlushing(Memtable memtable)
     {
-        assert !cfstore.keyspace.metadata.durableWrites;
-
-        Memtable newMemtable = new Memtable(cfstore);
         View currentView, newView;
         do
         {
             currentView = view.get();
-            newView = currentView.renewMemtable(newMemtable);
+            newView = currentView.markFlushing(memtable);
         }
         while (!view.compareAndSet(currentView, newView));
-        notifyRenewed(currentView.memtable);
     }
 
     public void replaceFlushed(Memtable memtable, SSTableReader sstable)
@@ -199,26 +192,30 @@
     public boolean markCompacting(Iterable<SSTableReader> sstables)
     {
         assert sstables != null && !Iterables.isEmpty(sstables);
-
-        View currentView = view.get();
-        Set<SSTableReader> inactive = Sets.difference(ImmutableSet.copyOf(sstables), currentView.compacting);
-        if (inactive.size() < Iterables.size(sstables))
-            return false;
-
-        if (Iterables.any(sstables, new Predicate<SSTableReader>()
+        while (true)
         {
-            @Override
-            public boolean apply(SSTableReader sstable)
+            View currentView = view.get();
+            Set<SSTableReader> set = ImmutableSet.copyOf(sstables);
+            Set<SSTableReader> inactive = Sets.difference(set, currentView.compacting);
+            if (inactive.size() < set.size())
+                return false;
+
+            if (Iterables.any(set, new Predicate<SSTableReader>()
             {
-                return sstable.isMarkedCompacted();
+                @Override
+                public boolean apply(SSTableReader sstable)
+                {
+                    return sstable.isMarkedCompacted();
+                }
+            }))
+            {
+                return false;
             }
-        }))
-        {
-            return false;
-        }
 
-        View newView = currentView.markCompacting(inactive);
-        return view.compareAndSet(currentView, newView);
+            View newView = currentView.markCompacting(set);
+            if (view.compareAndSet(currentView, newView))
+                return true;
+        }
     }
 
     /**
@@ -261,10 +258,18 @@
         notifySSTablesChanged(sstables, Collections.<SSTableReader>emptyList(), compactionType);
     }
 
-    public void replaceCompactedSSTables(Collection<SSTableReader> sstables, Collection<SSTableReader> replacements, OperationType compactionType)
+    // note that this DOES NOT insert the replacement sstables; it only removes the old sstables and notifies any listeners
+    // that they have been replaced by the provided sstables, whose insertion must have been performed by an earlier replaceReaders() call
+    public void markCompactedSSTablesReplaced(Collection<SSTableReader> sstables, Collection<SSTableReader> allReplacements, OperationType compactionType)
     {
-        replace(sstables, replacements);
-        notifySSTablesChanged(sstables, replacements, compactionType);
+        replace(sstables, Collections.<SSTableReader>emptyList());
+        notifySSTablesChanged(sstables, allReplacements, compactionType);
+        for (SSTableReader sstable : allReplacements)
+        {
+            long bytesOnDisk = sstable.bytesOnDisk();
+            cfstore.metric.totalDiskSpaceUsed.inc(bytesOnDisk);
+            cfstore.metric.liveDiskSpaceUsed.inc(bytesOnDisk);
+        }
     }
 
     public void addInitialSSTables(Collection<SSTableReader> sstables)
@@ -315,7 +320,7 @@
     void removeUnreadableSSTables(File directory)
     {
         View currentView, newView;
-        List<SSTableReader> remaining = new ArrayList<>();
+        Set<SSTableReader> remaining = new HashSet<>();
         do
         {
             currentView = view.get();
@@ -329,17 +334,48 @@
             newView = currentView.replace(currentView.sstables, remaining);
         }
         while (!view.compareAndSet(currentView, newView));
+        for (SSTableReader sstable : currentView.sstables)
+            if (!remaining.contains(sstable))
+                sstable.releaseReference();
         notifySSTablesChanged(remaining, Collections.<SSTableReader>emptySet(), OperationType.UNKNOWN);
     }
 
     /** (Re)initializes the tracker, purging all references. */
     void init()
     {
-        view.set(new View(new Memtable(cfstore),
-                          Collections.<Memtable>emptySet(),
-                          Collections.<SSTableReader>emptySet(),
-                          Collections.<SSTableReader>emptySet(),
-                          SSTableIntervalTree.empty()));
+        view.set(new View(
+                         ImmutableList.of(new Memtable(cfstore)),
+                         ImmutableList.<Memtable>of(),
+                         Collections.<SSTableReader>emptySet(),
+                         Collections.<SSTableReader>emptySet(),
+                         SSTableIntervalTree.empty()));
+    }
+
+    /**
+     * A special kind of replacement for SSTableReaders that were cloned with a new index summary sampling level (see
+     * SSTableReader.cloneWithNewSummarySamplingLevel and CASSANDRA-5519).  This does not mark the old reader
+     * as compacted.
+     * @param oldSSTables replaced readers
+     * @param newSSTables replacement readers
+     */
+    public void replaceReaders(Collection<SSTableReader> oldSSTables, Collection<SSTableReader> newSSTables)
+    {
+        View currentView, newView;
+        do
+        {
+            currentView = view.get();
+            newView = currentView.replace(oldSSTables, newSSTables);
+        }
+        while (!view.compareAndSet(currentView, newView));
+
+        if (!oldSSTables.isEmpty())
+            notifySSTablesChanged(oldSSTables, newSSTables, OperationType.COMPACTION);
+
+        for (SSTableReader sstable : newSSTables)
+            sstable.setTrackedBy(this);
+
+        for (SSTableReader sstable : oldSSTables)
+            sstable.releaseReference();
     }
 
     private void replace(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
@@ -440,11 +476,7 @@
             allDroppable += sstable.getDroppableTombstonesBefore(localTime - sstable.metadata.getGcGraceSeconds());
             allColumns += sstable.getEstimatedColumnCount().mean() * sstable.getEstimatedColumnCount().count();
         }
-        if (allColumns > 0)
-        {
-            return allDroppable / allColumns;
-        }
-        return 0;
+        return allColumns > 0 ? allDroppable / allColumns : 0;
     }
 
     public void notifySSTablesChanged(Collection<SSTableReader> removed, Collection<SSTableReader> added, OperationType compactionType)
@@ -461,6 +493,14 @@
             subscriber.handleNotification(notification, this);
     }
 
+    public void notifySSTableRepairedStatusChanged(Collection<SSTableReader> repairStatusesChanged)
+    {
+        INotification notification = new SSTableRepairStatusChanged(repairStatusesChanged);
+        for (INotificationConsumer subscriber : subscribers)
+            subscriber.handleNotification(notification, this);
+
+    }
+
     public void notifyDeleting(SSTableReader deleting)
     {
         INotification notification = new SSTableDeletingNotification(deleting);
@@ -525,72 +565,134 @@
      * flush, the sstables for a column family, and the sstables that are active
      * in compaction (a subset of the sstables).
      */
-    static class View
+    public static class View
     {
-        public final Memtable memtable;
-        public final Set<Memtable> memtablesPendingFlush;
+        /**
+         * ordinarily a list of size 1, but when preparing to flush will contain both the memtable we will flush
+         * and the new replacement memtable, until all outstanding write operations on the old table complete.
+         * The last item in the list is always the "current" memtable.
+         */
+        private final List<Memtable> liveMemtables;
+        /**
+         * contains all memtables that are no longer referenced for writing and are queued for / in the process of being
+         * flushed. In chronologically ascending order.
+         */
+        private final List<Memtable> flushingMemtables;
         public final Set<SSTableReader> compacting;
         public final Set<SSTableReader> sstables;
         public final SSTableIntervalTree intervalTree;
 
-        View(Memtable memtable, Set<Memtable> pendingFlush, Set<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree)
+        View(List<Memtable> liveMemtables, List<Memtable> flushingMemtables, Set<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree)
         {
-            assert memtable != null;
-            assert pendingFlush != null;
+            assert liveMemtables != null;
+            assert flushingMemtables != null;
             assert sstables != null;
             assert compacting != null;
             assert intervalTree != null;
 
-            this.memtable = memtable;
-            this.memtablesPendingFlush = pendingFlush;
+            this.liveMemtables = liveMemtables;
+            this.flushingMemtables = flushingMemtables;
             this.sstables = sstables;
             this.compacting = compacting;
             this.intervalTree = intervalTree;
         }
 
+        public Memtable getOldestMemtable()
+        {
+            if (!flushingMemtables.isEmpty())
+                return flushingMemtables.get(0);
+            return liveMemtables.get(0);
+        }
+
+        public Memtable getCurrentMemtable()
+        {
+            return liveMemtables.get(liveMemtables.size() - 1);
+        }
+
+        public Iterable<Memtable> getMemtablesPendingFlush()
+        {
+            if (liveMemtables.size() == 1)
+                return flushingMemtables;
+            return Iterables.concat(liveMemtables.subList(0, 1), flushingMemtables);
+        }
+
+        /**
+         * @return the active memtable and all the memtables that are pending flush.
+         */
+        public Iterable<Memtable> getAllMemtables()
+        {
+            return Iterables.concat(flushingMemtables, liveMemtables);
+        }
+
         public Sets.SetView<SSTableReader> nonCompactingSStables()
         {
             return Sets.difference(ImmutableSet.copyOf(sstables), compacting);
         }
 
-        public View switchMemtable(Memtable newMemtable)
+        View switchMemtable(Memtable newMemtable)
         {
-            Set<Memtable> newPending = ImmutableSet.<Memtable>builder().addAll(memtablesPendingFlush).add(memtable).build();
-            return new View(newMemtable, newPending, sstables, compacting, intervalTree);
+            List<Memtable> newLiveMemtables = ImmutableList.<Memtable>builder().addAll(liveMemtables).add(newMemtable).build();
+            return new View(newLiveMemtables, flushingMemtables, sstables, compacting, intervalTree);
         }
 
-        public View renewMemtable(Memtable newMemtable)
+        View markFlushing(Memtable toFlushMemtable)
         {
-            return new View(newMemtable, memtablesPendingFlush, sstables, compacting, intervalTree);
+            List<Memtable> live = liveMemtables, flushing = flushingMemtables;
+
+            // since we can have multiple flushes queued, we may occasionally race and start a flush out of order,
+            // so must locate it in the list to remove, rather than just removing from the beginning
+            int i = live.indexOf(toFlushMemtable);
+            assert i < live.size() - 1;
+            List<Memtable> newLive = ImmutableList.<Memtable>builder()
+                                                  .addAll(live.subList(0, i))
+                                                  .addAll(live.subList(i + 1, live.size()))
+                                                  .build();
+
+            // similarly, if we markFlushing out of order once, we may afterwards need to insert a memtable into the
+            // flushing list at a position other than the end, though this will be rare
+            i = flushing.size();
+            while (i > 0 && flushing.get(i - 1).creationTime() > toFlushMemtable.creationTime())
+                i--;
+            List<Memtable> newFlushing = ImmutableList.<Memtable>builder()
+                                                      .addAll(flushing.subList(0, i))
+                                                      .add(toFlushMemtable)
+                                                      .addAll(flushing.subList(i, flushing.size()))
+                                                      .build();
+
+            return new View(newLive, newFlushing, sstables, compacting, intervalTree);
         }
 
-        public View replaceFlushed(Memtable flushedMemtable, SSTableReader newSSTable)
+        View replaceFlushed(Memtable flushedMemtable, SSTableReader newSSTable)
         {
-            Set<Memtable> newPending = ImmutableSet.copyOf(Sets.difference(memtablesPendingFlush, Collections.singleton(flushedMemtable)));
+            int index = flushingMemtables.indexOf(flushedMemtable);
+            List<Memtable> newQueuedMemtables = ImmutableList.<Memtable>builder()
+                                                             .addAll(flushingMemtables.subList(0, index))
+                                                             .addAll(flushingMemtables.subList(index + 1, flushingMemtables.size()))
+                                                             .build();
             Set<SSTableReader> newSSTables = newSSTable == null
-                                           ? sstables
-                                           : newSSTables(newSSTable);
+                                             ? sstables
+                                             : newSSTables(newSSTable);
             SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables);
-            return new View(memtable, newPending, newSSTables, compacting, intervalTree);
+            return new View(liveMemtables, newQueuedMemtables, newSSTables, compacting, intervalTree);
         }
 
-        public View replace(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
+        View replace(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
         {
             Set<SSTableReader> newSSTables = newSSTables(oldSSTables, replacements);
             SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables);
-            return new View(memtable, memtablesPendingFlush, newSSTables, compacting, intervalTree);
+            return new View(liveMemtables, flushingMemtables, newSSTables, compacting, intervalTree);
         }
 
-        public View markCompacting(Collection<SSTableReader> tomark)
+        View markCompacting(Collection<SSTableReader> tomark)
         {
             Set<SSTableReader> compactingNew = ImmutableSet.<SSTableReader>builder().addAll(compacting).addAll(tomark).build();
-            return new View(memtable, memtablesPendingFlush, sstables, compactingNew, intervalTree);
+            return new View(liveMemtables, flushingMemtables, sstables, compactingNew, intervalTree);
         }
 
-        public View unmarkCompacting(Iterable<SSTableReader> tounmark)
+        View unmarkCompacting(Iterable<SSTableReader> tounmark)
         {
             Set<SSTableReader> compactingNew = ImmutableSet.copyOf(Sets.difference(compacting, ImmutableSet.copyOf(tounmark)));
-            return new View(memtable, memtablesPendingFlush, sstables, compactingNew, intervalTree);
+            return new View(liveMemtables, flushingMemtables, sstables, compactingNew, intervalTree);
         }
 
         private Set<SSTableReader> newSSTables(SSTableReader newSSTable)
@@ -619,12 +721,12 @@
         @Override
         public String toString()
         {
-            return String.format("View(pending_count=%d, sstables=%s, compacting=%s)", memtablesPendingFlush.size(), sstables, compacting);
+            return String.format("View(pending_count=%d, sstables=%s, compacting=%s)", liveMemtables.size() + flushingMemtables.size() - 1, sstables, compacting);
         }
 
         public List<SSTableReader> sstablesInBounds(AbstractBounds<RowPosition> rowBounds)
         {
-            RowPosition stopInTree = rowBounds.right.isMinimum(memtable.cfs.partitioner) ? intervalTree.max() : rowBounds.right;
+            RowPosition stopInTree = rowBounds.right.isMinimum(liveMemtables.get(0).cfs.partitioner) ? intervalTree.max() : rowBounds.right;
             return intervalTree.search(Interval.<RowPosition, SSTableReader>create(rowBounds.left, stopInTree));
         }
     }
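
The markCompacting() change above replaces a single compare-and-set attempt with a retry loop over the immutable View snapshot, so a concurrent view swap no longer causes a spurious failure. Below is a minimal, self-contained sketch of that copy-on-write plus CAS-retry pattern, using a hypothetical miniature Snapshot class rather than the real DataTracker.View:

import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

// Hypothetical miniature of the copy-on-write snapshot + CAS retry used by markCompacting().
public class MarkCompactingSketch
{
    static final class Snapshot
    {
        final ImmutableSet<String> compacting;
        Snapshot(ImmutableSet<String> compacting) { this.compacting = compacting; }

        Snapshot withCompacting(Set<String> toMark)
        {
            return new Snapshot(ImmutableSet.<String>builder().addAll(compacting).addAll(toMark).build());
        }
    }

    private final AtomicReference<Snapshot> view = new AtomicReference<>(new Snapshot(ImmutableSet.<String>of()));

    // returns true only if *all* candidates were marked atomically; false if any was already marked
    public boolean markCompacting(Set<String> candidates)
    {
        while (true)
        {
            Snapshot current = view.get();
            // reject the whole set if any member is already compacting in this snapshot
            if (!Sets.intersection(candidates, current.compacting).isEmpty())
                return false;
            // publish a new immutable snapshot; if another thread won the race, loop and re-check
            if (view.compareAndSet(current, current.withCompacting(candidates)))
                return true;
        }
    }
}
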
diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java
index 8f7a22b..604cbb7 100644
--- a/src/java/org/apache/cassandra/db/DecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/DecoratedKey.java
@@ -20,9 +20,13 @@
 import java.nio.ByteBuffer;
 import java.util.Comparator;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.memory.MemoryUtil;
 
 /**
  * Represents a decorated key, handy for certain operations
@@ -33,7 +37,7 @@
  * if this matters, you can subclass RP to use a stronger hash, or use a non-lossy tokenization scheme (as in the
  * OrderPreservingPartitioner classes).
  */
-public class DecoratedKey extends RowPosition
+public abstract class DecoratedKey implements RowPosition
 {
     public static final Comparator<DecoratedKey> comparator = new Comparator<DecoratedKey>()
     {
@@ -43,20 +47,18 @@
         }
     };
 
-    public final Token token;
-    public final ByteBuffer key;
+    private final Token token;
 
-    public DecoratedKey(Token token, ByteBuffer key)
+    public DecoratedKey(Token token)
     {
-        assert token != null && key != null;
+        assert token != null;
         this.token = token;
-        this.key = key;
     }
 
     @Override
     public int hashCode()
     {
-        return key.hashCode(); // hash of key is enough
+        return getKey().hashCode(); // hash of key is enough
     }
 
     @Override
@@ -64,12 +66,11 @@
     {
         if (this == obj)
             return true;
-        if (obj == null || this.getClass() != obj.getClass())
+        if (obj == null || !(obj instanceof DecoratedKey))
             return false;
 
         DecoratedKey other = (DecoratedKey)obj;
-
-        return ByteBufferUtil.compareUnsigned(key, other.key) == 0; // we compare faster than BB.equals for array backed BB
+        return ByteBufferUtil.compareUnsigned(getKey(), other.getKey()) == 0; // we compare faster than BB.equals for array backed BB
     }
 
     public int compareTo(RowPosition pos)
@@ -82,8 +83,8 @@
             return -pos.compareTo(this);
 
         DecoratedKey otherKey = (DecoratedKey) pos;
-        int cmp = token.compareTo(otherKey.getToken());
-        return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.key) : cmp;
+        int cmp = getToken().compareTo(otherKey.getToken());
+        return cmp == 0 ? ByteBufferUtil.compareUnsigned(getKey(), otherKey.getKey()) : cmp;
     }
 
     public static int compareTo(IPartitioner partitioner, ByteBuffer key, RowPosition position)
@@ -94,7 +95,7 @@
 
         DecoratedKey otherKey = (DecoratedKey) position;
         int cmp = partitioner.getToken(key).compareTo(otherKey.getToken());
-        return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.key) : cmp;
+        return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.getKey()) : cmp;
     }
 
     public boolean isMinimum(IPartitioner partitioner)
@@ -103,6 +104,11 @@
         return false;
     }
 
+    public boolean isMinimum()
+    {
+        return isMinimum(StorageService.getPartitioner());
+    }
+
     public RowPosition.Kind kind()
     {
         return RowPosition.Kind.ROW_KEY;
@@ -111,12 +117,14 @@
     @Override
     public String toString()
     {
-        String keystring = key == null ? "null" : ByteBufferUtil.bytesToHex(key);
-        return "DecoratedKey(" + token + ", " + keystring + ")";
+        String keystring = getKey() == null ? "null" : ByteBufferUtil.bytesToHex(getKey());
+        return "DecoratedKey(" + getToken() + ", " + keystring + ")";
     }
 
     public Token getToken()
     {
         return token;
     }
+
+    public abstract ByteBuffer getKey();
 }
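
The DecoratedKey refactoring above removes the key field from the base class and exposes it only through the abstract getKey(), so subclasses can decide how the key bytes are stored (a heap ByteBuffer, or memory managed elsewhere) while token ordering and key comparison stay in one place. A minimal sketch of a heap-backed subclass under that assumption; the class name here is illustrative, not the concrete implementation in the tree:

import java.nio.ByteBuffer;

import org.apache.cassandra.dht.Token;

// Illustrative heap-backed subclass: getKey() simply returns the ByteBuffer the key was constructed with.
public class HeapDecoratedKeySketch extends DecoratedKey
{
    private final ByteBuffer key;

    public HeapDecoratedKeySketch(Token token, ByteBuffer key)
    {
        super(token);
        assert key != null;
        this.key = key;
    }

    @Override
    public ByteBuffer getKey()
    {
        return key;
    }
}
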
diff --git a/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java b/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java
index c4b9f84..5cb62ed 100644
--- a/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java
@@ -32,15 +32,15 @@
  * Called when a node receives updated schema state from the schema migration coordinator node.
  * This happens when a user makes a local schema migration on one of the nodes in the ring
  * (which is going to act as the coordinator) and that node sends (pushes) its updated schema state
- * (in form of row mutations) to all the alive nodes in the cluster.
+ * (in form of mutations) to all the alive nodes in the cluster.
  */
-public class DefinitionsUpdateVerbHandler implements IVerbHandler<Collection<RowMutation>>
+public class DefinitionsUpdateVerbHandler implements IVerbHandler<Collection<Mutation>>
 {
     private static final Logger logger = LoggerFactory.getLogger(DefinitionsUpdateVerbHandler.class);
 
-    public void doVerb(final MessageIn<Collection<RowMutation>> message, int id)
+    public void doVerb(final MessageIn<Collection<Mutation>> message, int id)
     {
-        logger.debug("Received schema mutation push from " + message.from);
+        logger.debug("Received schema mutation push from {}", message.from);
 
         StageManager.getStage(Stage.MIGRATION).submit(new WrappedRunnable()
         {
diff --git a/src/java/org/apache/cassandra/db/DefsTables.java b/src/java/org/apache/cassandra/db/DefsTables.java
index 35eecc0..59f2e20 100644
--- a/src/java/org/apache/cassandra/db/DefsTables.java
+++ b/src/java/org/apache/cassandra/db/DefsTables.java
@@ -25,6 +25,7 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.MapDifference;
 import com.google.common.collect.Maps;
+import org.apache.cassandra.db.commitlog.CommitLog;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -32,10 +33,11 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.UTMetaData;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.StorageService;
@@ -72,7 +74,7 @@
  *
  * Where <key> is a name of keyspace e.g. "ks".
  *
- * Column names where made composite to support 3-level nesting which represents following structure:
+ * Cell names were made composite to support 3-level nesting, which represents the following structure:
  * "ColumnFamily name":"column name":"column attribute" => "value"
  *
  * Example of schema (using CLI):
@@ -121,32 +123,19 @@
     {
         List<Row> serializedSchema = SystemKeyspace.serializedSchema(SystemKeyspace.SCHEMA_KEYSPACES_CF);
 
-        List<KSMetaData> keyspaces = new ArrayList<KSMetaData>(serializedSchema.size());
+        List<KSMetaData> keyspaces = new ArrayList<>(serializedSchema.size());
 
         for (Row row : serializedSchema)
         {
             if (Schema.invalidSchemaRow(row) || Schema.ignoredSchemaRow(row))
                 continue;
 
-            keyspaces.add(KSMetaData.fromSchema(row, serializedColumnFamilies(row.key)));
+            keyspaces.add(KSMetaData.fromSchema(row, serializedColumnFamilies(row.key), serializedUserTypes(row.key)));
         }
 
         return keyspaces;
     }
 
-    public static ByteBuffer searchComposite(String name, boolean start)
-    {
-        assert name != null;
-        ByteBuffer nameBytes = UTF8Type.instance.decompose(name);
-        int length = nameBytes.remaining();
-        byte[] bytes = new byte[2 + length + 1];
-        bytes[0] = (byte)((length >> 8) & 0xFF);
-        bytes[1] = (byte)(length & 0xFF);
-        ByteBufferUtil.arrayCopy(nameBytes, 0, bytes, 2, length);
-        bytes[bytes.length - 1] = (byte)(start ? 0 : 1);
-        return ByteBuffer.wrap(bytes);
-    }
-
     private static Row serializedColumnFamilies(DecoratedKey ksNameKey)
     {
         ColumnFamilyStore cfsStore = SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF);
@@ -155,8 +144,16 @@
                                                                                          System.currentTimeMillis())));
     }
 
+    private static Row serializedUserTypes(DecoratedKey ksNameKey)
+    {
+        ColumnFamilyStore cfsStore = SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_USER_TYPES_CF);
+        return new Row(ksNameKey, cfsStore.getColumnFamily(QueryFilter.getIdentityFilter(ksNameKey,
+                                                                                         SystemKeyspace.SCHEMA_USER_TYPES_CF,
+                                                                                         System.currentTimeMillis())));
+    }
+
     /**
-     * Merge remote schema in form of row mutations with local and mutate ks/cf metadata objects
+     * Merge remote schema, in the form of mutations, with the local schema and mutate ks/cf metadata objects
      * (which also involves fs operations on add/drop ks/cf)
      *
      * @param mutations the schema changes to apply
@@ -164,30 +161,42 @@
      * @throws ConfigurationException If one of metadata attributes has invalid value
      * @throws IOException If data was corrupted during transportation or failed to apply fs operations
      */
-    public static synchronized void mergeSchema(Collection<RowMutation> mutations) throws ConfigurationException, IOException
+    public static synchronized void mergeSchema(Collection<Mutation> mutations) throws ConfigurationException, IOException
     {
-        // current state of the schema
-        Map<DecoratedKey, ColumnFamily> oldKeyspaces = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_KEYSPACES_CF);
-        Map<DecoratedKey, ColumnFamily> oldColumnFamilies = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF);
+        mergeSchemaInternal(mutations, true);
+        Schema.instance.updateVersionAndAnnounce();
+    }
 
-        for (RowMutation mutation : mutations)
+    public static synchronized void mergeSchemaInternal(Collection<Mutation> mutations, boolean doFlush) throws ConfigurationException, IOException
+    {
+        // compare before/after schemas of the affected keyspaces only
+        Set<String> keyspaces = new HashSet<>(mutations.size());
+        for (Mutation mutation : mutations)
+            keyspaces.add(ByteBufferUtil.string(mutation.key()));
+
+        // current state of the schema
+        Map<DecoratedKey, ColumnFamily> oldKeyspaces = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_KEYSPACES_CF, keyspaces);
+        Map<DecoratedKey, ColumnFamily> oldColumnFamilies = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, keyspaces);
+        Map<DecoratedKey, ColumnFamily> oldTypes = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_USER_TYPES_CF, keyspaces);
+
+        for (Mutation mutation : mutations)
             mutation.apply();
 
-        if (!StorageService.instance.isClientMode())
+        if (doFlush && !StorageService.instance.isClientMode())
             flushSchemaCFs();
 
         // with new data applied
-        Map<DecoratedKey, ColumnFamily> newKeyspaces = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_KEYSPACES_CF);
-        Map<DecoratedKey, ColumnFamily> newColumnFamilies = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF);
+        Map<DecoratedKey, ColumnFamily> newKeyspaces = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_KEYSPACES_CF, keyspaces);
+        Map<DecoratedKey, ColumnFamily> newColumnFamilies = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, keyspaces);
+        Map<DecoratedKey, ColumnFamily> newTypes = SystemKeyspace.getSchema(SystemKeyspace.SCHEMA_USER_TYPES_CF, keyspaces);
 
         Set<String> keyspacesToDrop = mergeKeyspaces(oldKeyspaces, newKeyspaces);
         mergeColumnFamilies(oldColumnFamilies, newColumnFamilies);
+        mergeTypes(oldTypes, newTypes);
 
         // it is safe to drop a keyspace only when all nested ColumnFamilies were deleted
         for (String keyspaceToDrop : keyspacesToDrop)
             dropKeyspace(keyspaceToDrop);
-
-        Schema.instance.updateVersionAndAnnounce();
     }
 
     private static Set<String> mergeKeyspaces(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
@@ -209,26 +218,26 @@
         MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
 
         for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().getColumnCount() > 0)
+            if (entry.getValue().hasColumns())
                 created.add(new Row(entry.getKey(), entry.getValue()));
 
         for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
         {
-            String keyspaceName = AsciiType.instance.compose(entry.getKey().key);
+            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
 
             ColumnFamily pre  = entry.getValue().leftValue();
             ColumnFamily post = entry.getValue().rightValue();
 
-            if (pre.getColumnCount() > 0 && post.getColumnCount() > 0)
+            if (pre.hasColumns() && post.hasColumns())
                 altered.add(keyspaceName);
-            else if (pre.getColumnCount() > 0)
+            else if (pre.hasColumns())
                 dropped.add(keyspaceName);
-            else if (post.getColumnCount() > 0) // a (re)created keyspace
+            else if (post.hasColumns()) // a (re)created keyspace
                 created.add(new Row(entry.getKey(), post));
         }
 
         for (Row row : created)
-            addKeyspace(KSMetaData.fromSchema(row, Collections.<CFMetaData>emptyList()));
+            addKeyspace(KSMetaData.fromSchema(row, Collections.<CFMetaData>emptyList(), new UTMetaData()));
         for (String name : altered)
             updateKeyspace(name);
         return dropped;
@@ -244,21 +253,21 @@
         MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
 
         for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().getColumnCount() > 0)
+            if (entry.getValue().hasColumns())
                 created.addAll(KSMetaData.deserializeColumnFamilies(new Row(entry.getKey(), entry.getValue())).values());
 
         for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
         {
-            String keyspaceName = AsciiType.instance.compose(entry.getKey().key);
+            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
 
             ColumnFamily pre  = entry.getValue().leftValue();
             ColumnFamily post = entry.getValue().rightValue();
 
-            if (pre.getColumnCount() > 0 && post.getColumnCount() > 0)
+            if (pre.hasColumns() && post.hasColumns())
             {
                 MapDifference<String, CFMetaData> delta =
-                    Maps.difference(Schema.instance.getKSMetaData(keyspaceName).cfMetaData(),
-                                    KSMetaData.deserializeColumnFamilies(new Row(entry.getKey(), post)));
+                        Maps.difference(Schema.instance.getKSMetaData(keyspaceName).cfMetaData(),
+                                        KSMetaData.deserializeColumnFamilies(new Row(entry.getKey(), post)));
 
                 dropped.addAll(delta.entriesOnlyOnLeft().values());
                 created.addAll(delta.entriesOnlyOnRight().values());
@@ -270,11 +279,11 @@
                     }
                 }));
             }
-            else if (pre.getColumnCount() > 0)
+            else if (pre.hasColumns())
             {
                 dropped.addAll(Schema.instance.getKSMetaData(keyspaceName).cfMetaData().values());
             }
-            else if (post.getColumnCount() > 0)
+            else if (post.hasColumns())
             {
                 created.addAll(KSMetaData.deserializeColumnFamilies(new Row(entry.getKey(), post)).values());
             }
@@ -288,6 +297,61 @@
             dropColumnFamily(cfm.ksName, cfm.cfName);
     }
 
+    // see the comments for mergeKeyspaces()
+    private static void mergeTypes(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
+    {
+        List<UserType> created = new ArrayList<>();
+        List<UserType> altered = new ArrayList<>();
+        List<UserType> dropped = new ArrayList<>();
+
+        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
+
+        // New keyspace with types
+        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
+            if (entry.getValue().hasColumns())
+                created.addAll(UTMetaData.fromSchema(new Row(entry.getKey(), entry.getValue())).values());
+
+        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
+        {
+            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
+
+            ColumnFamily pre  = entry.getValue().leftValue();
+            ColumnFamily post = entry.getValue().rightValue();
+
+            if (pre.hasColumns() && post.hasColumns())
+            {
+                MapDifference<ByteBuffer, UserType> delta =
+                        Maps.difference(Schema.instance.getKSMetaData(keyspaceName).userTypes.getAllTypes(),
+                                        UTMetaData.fromSchema(new Row(entry.getKey(), post)));
+
+                dropped.addAll(delta.entriesOnlyOnLeft().values());
+                created.addAll(delta.entriesOnlyOnRight().values());
+                Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function<MapDifference.ValueDifference<UserType>, UserType>()
+                {
+                    public UserType apply(MapDifference.ValueDifference<UserType> pair)
+                    {
+                        return pair.rightValue();
+                    }
+                }));
+            }
+            else if (pre.hasColumns())
+            {
+                dropped.addAll(Schema.instance.getKSMetaData(keyspaceName).userTypes.getAllTypes().values());
+            }
+            else if (post.hasColumns())
+            {
+                created.addAll(UTMetaData.fromSchema(new Row(entry.getKey(), post)).values());
+            }
+        }
+
+        for (UserType type : created)
+            addType(type);
+        for (UserType type : altered)
+            updateType(type);
+        for (UserType type : dropped)
+            dropType(type);
+    }
+
     private static void addKeyspace(KSMetaData ksm)
     {
         assert Schema.instance.getKSMetaData(ksm.name) == null;
@@ -306,7 +370,7 @@
         KSMetaData ksm = Schema.instance.getKSMetaData(cfm.ksName);
         ksm = KSMetaData.cloneWith(ksm, Iterables.concat(ksm.cfMetaData().values(), Collections.singleton(cfm)));
 
-        logger.info("Loading " + cfm);
+        logger.info("Loading {}", cfm);
 
         Schema.instance.load(cfm);
 
@@ -323,6 +387,19 @@
         }
     }
 
+    private static void addType(UserType ut)
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(ut.keyspace);
+        assert ksm != null;
+
+        logger.info("Loading {}", ut);
+
+        ksm.userTypes.addType(ut);
+
+        if (!StorageService.instance.isClientMode())
+            MigrationManager.instance.notifyCreateUserType(ut);
+    }
+
     private static void updateKeyspace(String ksName)
     {
         KSMetaData oldKsm = Schema.instance.getKSMetaData(ksName);
@@ -352,6 +429,19 @@
         }
     }
 
+    private static void updateType(UserType ut)
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(ut.keyspace);
+        assert ksm != null;
+
+        logger.info("Updating {}", ut);
+
+        ksm.userTypes.addType(ut);
+
+        if (!StorageService.instance.isClientMode())
+            MigrationManager.instance.notifyUpdateUserType(ut);
+    }
+
     private static void dropKeyspace(String ksName)
     {
         KSMetaData ksm = Schema.instance.getKSMetaData(ksName);
@@ -359,10 +449,13 @@
 
         CompactionManager.instance.interruptCompactionFor(ksm.cfMetaData().values(), true);
 
+        Keyspace keyspace = Keyspace.open(ksm.name);
+
         // remove all cfs from the keyspace instance.
+        List<UUID> droppedCfs = new ArrayList<>();
         for (CFMetaData cfm : ksm.cfMetaData().values())
         {
-            ColumnFamilyStore cfs = Keyspace.open(ksm.name).getColumnFamilyStore(cfm.cfName);
+            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfm.cfName);
 
             Schema.instance.purge(cfm);
 
@@ -372,11 +465,19 @@
                     cfs.snapshot(snapshotName);
                 Keyspace.open(ksm.name).dropCf(cfm.cfId);
             }
+
+            droppedCfs.add(cfm.cfId);
         }
 
         // remove the keyspace from the static instances.
         Keyspace.clear(ksm.name);
         Schema.instance.clearKeyspaceDefinition(ksm);
+
+        keyspace.writeOrder.awaitNewBarrier();
+
+        // force a new segment in the CL
+        CommitLog.instance.forceRecycleAllSegments(droppedCfs);
+
         if (!StorageService.instance.isClientMode())
         {
             MigrationManager.instance.notifyDropKeyspace(ksm);
@@ -404,13 +505,26 @@
                 cfs.snapshot(Keyspace.getTimestampedSnapshotName(cfs.name));
             Keyspace.open(ksm.name).dropCf(cfm.cfId);
             MigrationManager.instance.notifyDropColumnFamily(cfm);
+
+            CommitLog.instance.forceRecycleAllSegments(Collections.singleton(cfm.cfId));
         }
     }
 
+    private static void dropType(UserType ut)
+    {
+        KSMetaData ksm = Schema.instance.getKSMetaData(ut.keyspace);
+        assert ksm != null;
+
+        ksm.userTypes.removeType(ut);
+
+        if (!StorageService.instance.isClientMode())
+            MigrationManager.instance.notifyDropUserType(ut);
+    }
+
     private static KSMetaData makeNewKeyspaceDefinition(KSMetaData ksm, CFMetaData toExclude)
     {
         // clone ksm but do not include the new def
-        List<CFMetaData> newCfs = new ArrayList<CFMetaData>(ksm.cfMetaData().values());
+        List<CFMetaData> newCfs = new ArrayList<>(ksm.cfMetaData().values());
         newCfs.remove(toExclude);
         assert newCfs.size() == ksm.cfMetaData().size() - 1;
         return KSMetaData.cloneWith(ksm, newCfs);
@@ -418,10 +532,8 @@
 
     private static void flushSchemaCFs()
     {
-        SystemKeyspace.forceBlockingFlush(SystemKeyspace.SCHEMA_KEYSPACES_CF);
-        SystemKeyspace.forceBlockingFlush(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF);
-        SystemKeyspace.forceBlockingFlush(SystemKeyspace.SCHEMA_COLUMNS_CF);
-        SystemKeyspace.forceBlockingFlush(SystemKeyspace.SCHEMA_TRIGGERS_CF);
+        for (String cf : SystemKeyspace.allSchemaCfs)
+            SystemKeyspace.forceBlockingFlush(cf);
     }
 }
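
mergeSchemaInternal() above works by snapshotting the affected schema rows before and after the mutations are applied, then deriving created/altered/dropped keyspaces, tables and types from the two snapshots via Guava's Maps.difference(), which is also what the new mergeTypes() does. A small self-contained illustration of that diffing pattern on plain maps (the keys and values here are placeholders, not real schema rows):

import java.util.Map;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.MapDifference;
import com.google.common.collect.Maps;

public class SchemaDiffSketch
{
    public static void main(String[] args)
    {
        // "before" and "after" stand in for the schema snapshots taken around mutation.apply()
        Map<String, String> before = ImmutableMap.of("ks1", "v1", "ks2", "v1");
        Map<String, String> after  = ImmutableMap.of("ks2", "v2", "ks3", "v1");

        MapDifference<String, String> diff = Maps.difference(before, after);

        System.out.println("created: " + diff.entriesOnlyOnRight().keySet()); // [ks3]
        System.out.println("dropped: " + diff.entriesOnlyOnLeft().keySet());  // [ks1]
        System.out.println("altered: " + diff.entriesDiffering().keySet());   // [ks2]
    }
}
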
 
diff --git a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java b/src/java/org/apache/cassandra/db/DeletedCell.java
similarity index 62%
copy from src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
copy to src/java/org/apache/cassandra/db/DeletedCell.java
index afd268d..998c409 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
+++ b/src/java/org/apache/cassandra/db/DeletedCell.java
@@ -15,15 +15,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.db;
 
-import java.io.IOException;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.db.OnDiskAtom;
-
-public abstract class SimpleAbstractColumnIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
+public interface DeletedCell extends Cell
 {
-    public void close() throws IOException {}
+    DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
+
+    DeletedCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
 }
diff --git a/src/java/org/apache/cassandra/db/DeletedColumn.java b/src/java/org/apache/cassandra/db/DeletedColumn.java
deleted file mode 100644
index 377df27..0000000
--- a/src/java/org/apache/cassandra/db/DeletedColumn.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.Allocator;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
-
-public class DeletedColumn extends Column
-{
-    public DeletedColumn(ByteBuffer name, int localDeletionTime, long timestamp)
-    {
-        this(name, ByteBufferUtil.bytes(localDeletionTime), timestamp);
-    }
-
-    public DeletedColumn(ByteBuffer name, ByteBuffer value, long timestamp)
-    {
-        super(name, value, timestamp);
-    }
-
-    @Override
-    public Column withUpdatedName(ByteBuffer newName)
-    {
-        return new DeletedColumn(newName, value, timestamp);
-    }
-
-    @Override
-    public Column withUpdatedTimestamp(long newTimestamp)
-    {
-        return new DeletedColumn(name, value, newTimestamp);
-    }
-
-    @Override
-    public boolean isMarkedForDelete(long now)
-    {
-        return true;
-    }
-
-    @Override
-    public long getMarkedForDeleteAt()
-    {
-        return timestamp;
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name.duplicate());
-
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        try
-        {
-            buffer.writeLong(timestamp);
-            buffer.writeByte(serializationFlags());
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        digest.update(buffer.getData(), 0, buffer.getLength());
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-       return value.getInt(value.position());
-    }
-
-    @Override
-    public Column reconcile(Column column, Allocator allocator)
-    {
-        if (column instanceof DeletedColumn)
-            return super.reconcile(column, allocator);
-        return column.reconcile(this, allocator);
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs)
-    {
-        return new DeletedColumn(cfs.internOrCopy(name, HeapAllocator.instance), ByteBufferUtil.clone(value), timestamp);
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs, Allocator allocator)
-    {
-        return new DeletedColumn(cfs.internOrCopy(name, allocator), allocator.clone(value), timestamp);
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.DELETION_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        if (value().remaining() != 4)
-            throw new MarshalException("A tombstone value should be 4 bytes long");
-        if (getLocalDeletionTime() < 0)
-            throw new MarshalException("The local deletion time should not be negative");
-    }
-
-    public static DeletedColumn create(int localDeletionTime, long timestamp, String... names)
-    {
-        return new DeletedColumn(decomposeName(names), localDeletionTime, timestamp);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/DeletionInfo.java b/src/java/org/apache/cassandra/db/DeletionInfo.java
index 23f46bf..193f8b1 100644
--- a/src/java/org/apache/cassandra/db/DeletionInfo.java
+++ b/src/java/org/apache/cassandra/db/DeletionInfo.java
@@ -18,24 +18,31 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.security.MessageDigest;
+import java.util.Comparator;
+import java.util.Iterator;
 
 import com.google.common.base.Objects;
 import com.google.common.collect.Iterators;
 
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 /**
  * A combination of a top-level (or row) tombstone and range tombstones describing the deletions
  * within a {@link ColumnFamily} (or row).
  */
-public class DeletionInfo
+public class DeletionInfo implements IMeasurableMemory
 {
-    private static final Serializer serializer = new Serializer();
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new DeletionInfo(0, 0));
 
     /**
      * This represents a deletion of the entire row.  We can't represent this within the RangeTombstoneList, so it's
@@ -67,13 +74,13 @@
         this(topLevel, null);
     }
 
-    public DeletionInfo(ByteBuffer start, ByteBuffer end, Comparator<ByteBuffer> comparator, long markedForDeleteAt, int localDeletionTime)
+    public DeletionInfo(Composite start, Composite end, Comparator<Composite> comparator, long markedForDeleteAt, int localDeletionTime)
     {
         this(DeletionTime.LIVE, new RangeTombstoneList(comparator, 1));
         ranges.add(start, end, markedForDeleteAt, localDeletionTime);
     }
 
-    public DeletionInfo(RangeTombstone rangeTombstone, Comparator<ByteBuffer> comparator)
+    public DeletionInfo(RangeTombstone rangeTombstone, Comparator<Composite> comparator)
     {
         this(rangeTombstone.min, rangeTombstone.max, comparator, rangeTombstone.data.markedForDeleteAt, rangeTombstone.data.localDeletionTime);
     }
@@ -92,48 +99,50 @@
         return new DeletionInfo(DeletionTime.LIVE);
     }
 
-    public static Serializer serializer()
-    {
-        return serializer;
-    }
-
     public DeletionInfo copy()
     {
         return new DeletionInfo(topLevel, ranges == null ? null : ranges.copy());
     }
 
+    public DeletionInfo copy(AbstractAllocator allocator)
+    {
+
+        RangeTombstoneList rangesCopy = null;
+        if (ranges != null)
+             rangesCopy = ranges.copy(allocator);
+
+        return new DeletionInfo(topLevel, rangesCopy);
+    }
+
     /**
      * Returns whether this DeletionInfo is live, that is deletes no columns.
      */
     public boolean isLive()
     {
-        return topLevel.markedForDeleteAt == Long.MIN_VALUE
-            && topLevel.localDeletionTime == Integer.MAX_VALUE
-            && (ranges == null || ranges.isEmpty());
+        return topLevel.isLive() && (ranges == null || ranges.isEmpty());
     }
 
     /**
-     * Return whether a given column is deleted by the container having this deletion info.
+     * Return whether a given cell is deleted by the container having this deletion info.
      *
-     * @param column the column to check.
-     * @return true if the column is deleted, false otherwise
+     * @param cell the cell to check.
+     * @return true if the cell is deleted, false otherwise
      */
-    public boolean isDeleted(Column column)
-    {
-        return isDeleted(column.name(), column.timestamp());
-    }
-
-    public boolean isDeleted(ByteBuffer name, long timestamp)
+    public boolean isDeleted(Cell cell)
     {
         // We do rely on this test: if topLevel.markedForDeleteAt is MIN_VALUE, we should not
         // consider the column deleted even if timestamp=MIN_VALUE, otherwise this breaks QueryFilter.isRelevant
         if (isLive())
             return false;
 
-        if (timestamp <= topLevel.markedForDeleteAt)
+        if (cell.timestamp() <= topLevel.markedForDeleteAt)
             return true;
 
-        return ranges != null && ranges.isDeleted(name, timestamp);
+        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+        if (!topLevel.isLive() && cell instanceof CounterCell)
+            return true;
+
+        return ranges != null && ranges.isDeleted(cell);
     }
 
     /**
@@ -171,6 +180,35 @@
     }
 
     /**
+     * Evaluates difference between this deletion info and superset for read repair
+     *
+     * @return the difference between the two, or LIVE if no difference
+     */
+    public DeletionInfo diff(DeletionInfo superset)
+    {
+        RangeTombstoneList rangeDiff = superset.ranges == null || superset.ranges.isEmpty()
+                                     ? null
+                                     : ranges == null ? superset.ranges : ranges.diff(superset.ranges);
+
+        return topLevel.markedForDeleteAt != superset.topLevel.markedForDeleteAt || rangeDiff != null
+             ? new DeletionInfo(superset.topLevel, rangeDiff)
+             : DeletionInfo.live();
+    }
+
+
+    /**
+     * Digests deletion info. Used to trigger read repair on mismatch.
+     */
+    public void updateDigest(MessageDigest digest)
+    {
+        if (topLevel.markedForDeleteAt != Long.MIN_VALUE)
+            digest.update(ByteBufferUtil.bytes(topLevel.markedForDeleteAt));
+
+        if (ranges != null)
+            ranges.updateDigest(digest);
+    }
+
+    /**
      * Returns true if {@code purge} would remove the top-level tombstone or any of the range
      * tombstones, false otherwise.
      * @param gcBefore timestamp (in seconds) before which tombstones should be purged
@@ -194,7 +232,7 @@
             topLevel = newInfo;
     }
 
-    public void add(RangeTombstone tombstone, Comparator<ByteBuffer> comparator)
+    public void add(RangeTombstone tombstone, Comparator<Composite> comparator)
     {
         if (ranges == null)
             ranges = new RangeTombstoneList(comparator, 1);
@@ -255,7 +293,12 @@
         return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator();
     }
 
-    public DeletionTime rangeCovering(ByteBuffer name)
+    public Iterator<RangeTombstone> rangeIterator(Composite start, Composite finish)
+    {
+        return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator(start, finish);
+    }
+
+    public RangeTombstone rangeCovering(Composite name)
     {
         return ranges == null ? null : ranges.search(name);
     }
@@ -281,8 +324,7 @@
      */
     public boolean mayModify(DeletionInfo delInfo)
     {
-        return topLevel.markedForDeleteAt > delInfo.topLevel.markedForDeleteAt
-            || hasRanges();
+        return topLevel.compareTo(delInfo.topLevel) > 0 || hasRanges();
     }
 
     @Override
@@ -298,15 +340,15 @@
     {
         assert !ranges.isEmpty();
         StringBuilder sb = new StringBuilder();
-        AbstractType at = (AbstractType)ranges.comparator();
-        assert at != null;
+        CType type = (CType)ranges.comparator();
+        assert type != null;
         Iterator<RangeTombstone> iter = rangeIterator();
         while (iter.hasNext())
         {
             RangeTombstone i = iter.next();
             sb.append("[");
-            sb.append(at.getString(i.min)).append("-");
-            sb.append(at.getString(i.max)).append(", ");
+            sb.append(type.getString(i.min)).append("-");
+            sb.append(type.getString(i.max)).append(", ");
             sb.append(i.data);
             sb.append("]");
         }
@@ -338,34 +380,38 @@
         return Objects.hashCode(topLevel, ranges);
     }
 
+    @Override
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + topLevel.unsharedHeapSize() + (ranges == null ? 0 : ranges.unsharedHeapSize());
+    }
+
     public static class Serializer implements IVersionedSerializer<DeletionInfo>
     {
-        public void serialize(DeletionInfo info, DataOutput out, int version) throws IOException
+        private final RangeTombstoneList.Serializer rtlSerializer;
+
+        public Serializer(CType type)
+        {
+            this.rtlSerializer = new RangeTombstoneList.Serializer(type);
+        }
+
+        public void serialize(DeletionInfo info, DataOutputPlus out, int version) throws IOException
         {
             DeletionTime.serializer.serialize(info.topLevel, out);
-            RangeTombstoneList.serializer.serialize(info.ranges, out, version);
+            rtlSerializer.serialize(info.ranges, out, version);
         }
 
-        /*
-         * Range tombstones internally depend on the column family serializer, but it is not serialized.
-         * Thus deserialize(DataInput, int, Comparator<ByteBuffer>) should be used instead of this method.
-         */
         public DeletionInfo deserialize(DataInput in, int version) throws IOException
         {
-            throw new UnsupportedOperationException();
-        }
-
-        public DeletionInfo deserialize(DataInput in, int version, Comparator<ByteBuffer> comparator) throws IOException
-        {
             DeletionTime topLevel = DeletionTime.serializer.deserialize(in);
-            RangeTombstoneList ranges = RangeTombstoneList.serializer.deserialize(in, version, comparator);
+            RangeTombstoneList ranges = rtlSerializer.deserialize(in, version);
             return new DeletionInfo(topLevel, ranges);
         }
 
         public long serializedSize(DeletionInfo info, TypeSizes typeSizes, int version)
         {
             long size = DeletionTime.serializer.serializedSize(info.topLevel, typeSizes);
-            return size + RangeTombstoneList.serializer.serializedSize(info.ranges, typeSizes, version);
+            return size + rtlSerializer.serializedSize(info.ranges, typeSizes, version);
         }
 
         public long serializedSize(DeletionInfo info, int version)
@@ -398,14 +444,13 @@
             this.reversed = reversed;
         }
 
-        public boolean isDeleted(Column column)
+        public boolean isDeleted(Cell cell)
         {
-            return isDeleted(column.name(), column.timestamp());
-        }
+            if (cell.timestamp() <= topLevel.markedForDeleteAt)
+                return true;
 
-        public boolean isDeleted(ByteBuffer name, long timestamp)
-        {
-            if (timestamp <= topLevel.markedForDeleteAt)
+            // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+            if (!topLevel.isLive() && cell instanceof CounterCell)
                 return true;
 
             /*
@@ -413,13 +458,13 @@
              * is always in forward sorted order.
              */
             if (reversed)
-                 return DeletionInfo.this.isDeleted(name, timestamp);
+                 return DeletionInfo.this.isDeleted(cell);
 
             // Maybe create the tester if we hadn't yet and we now have some ranges (see above).
             if (tester == null && ranges != null)
                 tester = ranges.inOrderTester();
 
-            return tester != null && tester.isDeleted(name, timestamp);
+            return tester != null && tester.isDeleted(cell);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/DeletionTime.java b/src/java/org/apache/cassandra/db/DeletionTime.java
index dd2ccaf..0e5f13f 100644
--- a/src/java/org/apache/cassandra/db/DeletionTime.java
+++ b/src/java/org/apache/cassandra/db/DeletionTime.java
@@ -18,21 +18,25 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
 
+import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.codehaus.jackson.annotate.JsonIgnore;
 
 /**
  * A top-level (row) tombstone.
  */
-public class DeletionTime implements Comparable<DeletionTime>
+public class DeletionTime implements Comparable<DeletionTime>, IMeasurableMemory
 {
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new DeletionTime(0, 0));
+
     /**
      * A special DeletionTime that signifies that there is no top-level (row) tombstone.
      */
@@ -51,7 +55,7 @@
      */
     public final int localDeletionTime;
 
-    public static final ISerializer<DeletionTime> serializer = new Serializer();
+    public static final Serializer serializer = new Serializer();
 
     @VisibleForTesting
     public DeletionTime(long markedForDeleteAt, int localDeletionTime)
@@ -109,20 +113,19 @@
         return localDeletionTime < gcBefore;
     }
 
-    public boolean isDeleted(Column column)
+    public boolean isDeleted(OnDiskAtom atom)
     {
-        return column.timestamp() <= markedForDeleteAt;
+        return atom.timestamp() <= markedForDeleteAt;
     }
 
-    public long memorySize()
+    public long unsharedHeapSize()
     {
-        long fields = TypeSizes.NATIVE.sizeof(markedForDeleteAt) + TypeSizes.NATIVE.sizeof(localDeletionTime);
-        return ObjectSizes.getFieldSize(fields);
+        return EMPTY_SIZE;
     }
 
-    private static class Serializer implements ISerializer<DeletionTime>
+    public static class Serializer implements ISerializer<DeletionTime>
     {
-        public void serialize(DeletionTime delTime, DataOutput out) throws IOException
+        public void serialize(DeletionTime delTime, DataOutputPlus out) throws IOException
         {
             out.writeInt(delTime.localDeletionTime);
             out.writeLong(delTime.markedForDeleteAt);
@@ -137,6 +140,11 @@
                  : new DeletionTime(mfda, ldt);
         }
 
+        public void skip(DataInput in) throws IOException
+        {
+            FileUtils.skipBytesFully(in, 4 + 8);
+        }
+
         public long serializedSize(DeletionTime delTime, TypeSizes typeSizes)
         {
             return typeSizes.sizeof(delTime.localDeletionTime)
diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java
index e118f86..4319481 100644
--- a/src/java/org/apache/cassandra/db/Directories.java
+++ b/src/java/org/apache/cassandra/db/Directories.java
@@ -17,16 +17,28 @@
  */
 package org.apache.cassandra.db;
 
+import static com.google.common.collect.Sets.newHashSet;
+
 import java.io.File;
 import java.io.FileFilter;
 import java.io.IOError;
 import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Predicate;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableSet.Builder;
+import com.google.common.collect.Iterables;
 import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.Uninterruptibles;
 
@@ -35,22 +47,26 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.io.FSError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
 /**
  * Encapsulate handling of paths to the data files.
  *
- * The directory layout is the following:
- *   /<path_to_data_dir>/ks/cf1/ks-cf1-hb-1-Data.db
- *                         /cf2/ks-cf2-hb-1-Data.db
+ * Since v2.1, the directory layout is the following:
+ *   /<path_to_data_dir>/ks/cf1-cfId/ks-cf1-ka-1-Data.db
+ *                         /cf2-cfId/ks-cf2-ka-1-Data.db
  *                         ...
  *
+ * cfId is a hex-encoded CFID.
+ *
+ * For backward compatibility, Directories uses the older directory layout if it exists.
+ *
  * In addition, more that one 'root' data directory can be specified so that
  * <path_to_data_dir> potentially represents multiple locations.
  * Note that in the case of multiple locations, the manifest for the leveled
@@ -71,23 +87,25 @@
     public static final String SNAPSHOT_SUBDIR = "snapshots";
     public static final String SECONDARY_INDEX_NAME_SEPARATOR = ".";
 
-    public static final DataDirectory[] dataFileLocations;
+    public static final DataDirectory[] dataDirectories;
     static
     {
         String[] locations = DatabaseDescriptor.getAllDataFileLocations();
-        dataFileLocations = new DataDirectory[locations.length];
+        dataDirectories = new DataDirectory[locations.length];
         for (int i = 0; i < locations.length; ++i)
-            dataFileLocations[i] = new DataDirectory(new File(locations[i]));
+            dataDirectories[i] = new DataDirectory(new File(locations[i]));
     }
 
+
     /**
-     * Checks whether Cassandra has RWX permissions to the specified directory.
+     * Checks whether Cassandra has RWX permissions to the specified directory.  Logs an error with
+     * the details if it does not.
      *
      * @param dir File object of the directory.
      * @param dataDir String representation of the directory's location
      * @return status representing Cassandra's RWX permissions to the supplied folder location.
      */
-    public static boolean hasFullPermissions(File dir, String dataDir)
+    public static boolean verifyFullPermissions(File dir, String dataDir)
     {
         if (!dir.isDirectory())
         {
@@ -152,42 +170,61 @@
         }
     }
 
-    private final String keyspacename;
-    private final String cfname;
-    private final File[] sstableDirectories;
+    private final CFMetaData metadata;
+    private final File[] dataPaths;
 
-    public static Directories create(String keyspacename, String cfname)
+    /**
+     * Create Directories of given ColumnFamily.
+     * SSTable directories are created under the data_directories defined in cassandra.yaml if they do not already exist.
+     *
+     * @param metadata metadata of ColumnFamily
+     */
+    public Directories(CFMetaData metadata)
     {
-        int idx = cfname.indexOf(SECONDARY_INDEX_NAME_SEPARATOR);
-        if (idx > 0)
-            // secondary index, goes in the same directory than the base cf
-            return new Directories(keyspacename, cfname, cfname.substring(0, idx));
-        else
-            return new Directories(keyspacename, cfname, cfname);
-    }
-
-    private Directories(String keyspacename, String cfname, String directoryName)
-    {
-        this.keyspacename = keyspacename;
-        this.cfname = cfname;
-        this.sstableDirectories = new File[dataFileLocations.length];
-        for (int i = 0; i < dataFileLocations.length; ++i)
-            sstableDirectories[i] = new File(dataFileLocations[i].location, join(keyspacename, directoryName));
-
-        if (!StorageService.instance.isClientMode())
+        this.metadata = metadata;
+        if (StorageService.instance.isClientMode())
         {
-            for (File dir : sstableDirectories)
+            dataPaths = null;
+            return;
+        }
+
+        String cfId = ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(metadata.cfId));
+        int idx = metadata.cfName.indexOf(SECONDARY_INDEX_NAME_SEPARATOR);
+        // secondary indices go in the same directory as the base cf
+        String directoryName = idx > 0 ? metadata.cfName.substring(0, idx) + "-" + cfId : metadata.cfName + "-" + cfId;
+
+        this.dataPaths = new File[dataDirectories.length];
+        // If upgraded from version less than 2.1, use existing directories
+        for (int i = 0; i < dataDirectories.length; ++i)
+        {
+            // check if old SSTable directory exists
+            dataPaths[i] = new File(dataDirectories[i].location, join(metadata.ksName, this.metadata.cfName));
+        }
+        boolean olderDirectoryExists = Iterables.any(Arrays.asList(dataPaths), new Predicate<File>()
+        {
+            public boolean apply(File file)
             {
-                try
-                {
-                    FileUtils.createDirectory(dir);
-                }
-                catch (FSError e)
-                {
-                    // don't just let the default exception handler do this, we need the create loop to continue
-                    logger.error("Failed to create {} directory", dir);
-                    FileUtils.handleFSError(e);
-                }
+                return file.exists();
+            }
+        });
+        if (!olderDirectoryExists)
+        {
+            // use 2.1-style path names
+            for (int i = 0; i < dataDirectories.length; ++i)
+                dataPaths[i] = new File(dataDirectories[i].location, join(metadata.ksName, directoryName));
+        }
+
+        for (File dir : dataPaths)
+        {
+            try
+            {
+                FileUtils.createDirectory(dir);
+            }
+            catch (FSError e)
+            {
+                // don't just let the default exception handler do this, we need the create loop to continue
+                logger.error("Failed to create {} directory", dir);
+                FileUtils.handleFSError(e);
             }
         }
     }
@@ -200,7 +237,7 @@
      */
     public File getLocationForDisk(DataDirectory dataDirectory)
     {
-        for (File dir : sstableDirectories)
+        for (File dir : dataPaths)
         {
             if (dir.getAbsolutePath().startsWith(dataDirectory.location.getAbsolutePath()))
                 return dir;
@@ -208,6 +245,16 @@
         return null;
     }
 
+    public Descriptor find(String filename)
+    {
+        for (File dir : dataPaths)
+        {
+            if (new File(dir, filename).exists())
+                return Descriptor.fromFilename(dir, filename).left;
+        }
+        return null;
+    }
+
     public File getDirectoryForNewSSTables()
     {
         File path = getWriteableLocationAsFile();
@@ -241,10 +288,10 @@
      */
     public DataDirectory getWriteableLocation()
     {
-        List<DataDirectory> candidates = new ArrayList<DataDirectory>();
+        List<DataDirectory> candidates = new ArrayList<>();
 
         // pick directories with enough space and so that resulting sstable dirs aren't blacklisted for writes.
-        for (DataDirectory dataDir : dataFileLocations)
+        for (DataDirectory dataDir : dataDirectories)
         {
             if (BlacklistedDirectories.isUnwritable(getLocationForDisk(dataDir)))
                 continue;
@@ -269,12 +316,16 @@
         return candidates.get(0);
     }
 
-
     public static File getSnapshotDirectory(Descriptor desc, String snapshotName)
     {
         return getOrCreate(desc.directory, SNAPSHOT_SUBDIR, snapshotName);
     }
 
+    public File getSnapshotManifestFile(String snapshotName)
+    {
+         return new File(getDirectoryForNewSSTables(), join(SNAPSHOT_SUBDIR, snapshotName, "manifest.json"));
+    }
+
     public static File getBackupsDirectory(Descriptor desc)
     {
         return getOrCreate(desc.directory, BACKUPS_SUBDIR);
@@ -319,7 +370,7 @@
         private boolean includeBackups;
         private boolean onlyBackups;
         private int nbFiles;
-        private final Map<Descriptor, Set<Component>> components = new HashMap<Descriptor, Set<Component>>();
+        private final Map<Descriptor, Set<Component>> components = new HashMap<>();
         private boolean filtered;
         private String snapshotName;
 
@@ -365,7 +416,7 @@
         public List<File> listFiles()
         {
             filter();
-            List<File> l = new ArrayList<File>(nbFiles);
+            List<File> l = new ArrayList<>(nbFiles);
             for (Map.Entry<Descriptor, Set<Component>> entry : components.entrySet())
             {
                 for (Component c : entry.getValue())
@@ -381,7 +432,7 @@
             if (filtered)
                 return;
 
-            for (File location : sstableDirectories)
+            for (File location : dataPaths)
             {
                 if (BlacklistedDirectories.isUnreadable(location))
                     continue;
@@ -404,7 +455,7 @@
         private FileFilter getFilter()
         {
             // Note: the prefix needs to include cfname + separator to distinguish between a cfs and it's secondary indexes
-            final String sstablePrefix = keyspacename + Component.separator + cfname + Component.separator;
+            final String sstablePrefix = getSSTablePrefix();
             return new FileFilter()
             {
                 // This function always return false since accepts adds to the components map
@@ -418,13 +469,13 @@
                     if (pair == null)
                         return false;
 
-                    if (skipTemporary && pair.left.temporary)
+                    if (skipTemporary && pair.left.type.isTemporary)
                         return false;
 
                     Set<Component> previous = components.get(pair.left);
                     if (previous == null)
                     {
-                        previous = new HashSet<Component>();
+                        previous = new HashSet<>();
                         components.put(pair.left, previous);
                     }
                     previous.add(pair.right);
@@ -435,37 +486,45 @@
         }
     }
 
-    @Deprecated
-    public File tryGetLeveledManifest()
+    /**
+     *
+     * @return a map of all snapshots to the space being used.
+     * The pair for a snapshot holds size on disk and true size.
+     */
+    public Map<String, Pair<Long, Long>> getSnapshotDetails()
     {
-        for (File dir : sstableDirectories)
+        final Map<String, Pair<Long, Long>> snapshotSpaceMap = new HashMap<>();
+        for (final File dir : dataPaths)
         {
-            File manifestFile = new File(dir, cfname + LeveledManifest.EXTENSION);
-            if (manifestFile.exists())
+            final File snapshotDir = new File(dir,SNAPSHOT_SUBDIR);
+            if (snapshotDir.exists() && snapshotDir.isDirectory())
             {
-                logger.debug("Found manifest at {}", manifestFile);
-                return manifestFile;
+                final File[] snapshots  = snapshotDir.listFiles();
+                if (snapshots != null)
+                {
+                    for (final File snapshot : snapshots)
+                    {
+                        if (snapshot.isDirectory())
+                        {
+                            final long sizeOnDisk = FileUtils.folderSize(snapshot);
+                            final long trueSize = getTrueAllocatedSizeIn(snapshot);
+                            Pair<Long,Long> spaceUsed = snapshotSpaceMap.get(snapshot.getName());
+                            if (spaceUsed == null)
+                                spaceUsed =  Pair.create(sizeOnDisk,trueSize);
+                            else
+                                spaceUsed = Pair.create(spaceUsed.left + sizeOnDisk, spaceUsed.right + trueSize);
+                            snapshotSpaceMap.put(snapshot.getName(), spaceUsed);
+                        }
+                    }
+                }
             }
         }
-        logger.debug("No level manifest found");
-        return null;
-    }
 
-    @Deprecated
-    public void snapshotLeveledManifest(String snapshotName)
-    {
-        File manifest = tryGetLeveledManifest();
-        if (manifest != null)
-        {
-            File snapshotDirectory = getOrCreate(manifest.getParentFile(), SNAPSHOT_SUBDIR, snapshotName);
-            File target = new File(snapshotDirectory, manifest.getName());
-            FileUtils.createHardLink(manifest, target);
-        }
+        return snapshotSpaceMap;
     }
-
     public boolean snapshotExists(String snapshotName)
     {
-        for (File dir : sstableDirectories)
+        for (File dir : dataPaths)
         {
             File snapshotDir = new File(dir, join(SNAPSHOT_SUBDIR, snapshotName));
             if (snapshotDir.exists())
@@ -484,7 +543,7 @@
             if (snapshotDir.exists())
             {
                 if (logger.isDebugEnabled())
-                    logger.debug("Removing snapshot directory " + snapshotDir);
+                    logger.debug("Removing snapshot directory {}", snapshotDir);
                 FileUtils.deleteRecursive(snapshotDir);
             }
         }
@@ -493,7 +552,7 @@
     // The snapshot must exist
     public long snapshotCreationTime(String snapshotName)
     {
-        for (File dir : sstableDirectories)
+        for (File dir : dataPaths)
         {
             File snapshotDir = new File(dir, join(SNAPSHOT_SUBDIR, snapshotName));
             if (snapshotDir.exists())
@@ -501,12 +560,43 @@
         }
         throw new RuntimeException("Snapshot " + snapshotName + " doesn't exist");
     }
+    
+    public long trueSnapshotsSize()
+    {
+        long result = 0L;
+        for (File dir : dataPaths)
+            result += getTrueAllocatedSizeIn(new File(dir, join(SNAPSHOT_SUBDIR)));
+        return result;
+    }
+
+    private String getSSTablePrefix()
+    {
+        return metadata.ksName + Component.separator + metadata.cfName + Component.separator;
+    }
+
+    public long getTrueAllocatedSizeIn(File input)
+    {
+        if (!input.isDirectory())
+            return 0;
+        
+        TrueFilesSizeVisitor visitor = new TrueFilesSizeVisitor();
+        try
+        {
+            Files.walkFileTree(input.toPath(), visitor);
+        }
+        catch (IOException e)
+        {
+            logger.error("Could not calculate the size of {}. {}", input, e);
+        }
+    
+        return visitor.getAllocatedSize();
+    }
 
     // Recursively finds all the sub directories in the KS directory.
     public static List<File> getKSChildDirectories(String ksName)
     {
-        List<File> result = new ArrayList<File>();
-        for (DataDirectory dataDirectory : dataFileLocations)
+        List<File> result = new ArrayList<>();
+        for (DataDirectory dataDirectory : dataDirectories)
         {
             File ksDir = new File(dataDirectory.location, ksName);
             File[] cfDirs = ksDir.listFiles();
@@ -523,8 +613,8 @@
 
     public List<File> getCFDirectories()
     {
-        List<File> result = new ArrayList<File>();
-        for (File dataDirectory : sstableDirectories)
+        List<File> result = new ArrayList<>();
+        for (File dataDirectory : dataPaths)
         {
             if (dataDirectory.isDirectory())
                 result.add(dataDirectory);
@@ -552,18 +642,65 @@
         return StringUtils.join(s, File.separator);
     }
 
-    // Hack for tests, don't use otherwise
+    @VisibleForTesting
     static void overrideDataDirectoriesForTest(String loc)
     {
-        for (int i = 0; i < dataFileLocations.length; ++i)
-            dataFileLocations[i] = new DataDirectory(new File(loc));
+        for (int i = 0; i < dataDirectories.length; ++i)
+            dataDirectories[i] = new DataDirectory(new File(loc));
     }
 
-    // Hack for tests, don't use otherwise
+    @VisibleForTesting
     static void resetDataDirectoriesAfterTest()
     {
         String[] locations = DatabaseDescriptor.getAllDataFileLocations();
         for (int i = 0; i < locations.length; ++i)
-            dataFileLocations[i] = new DataDirectory(new File(locations[i]));
+            dataDirectories[i] = new DataDirectory(new File(locations[i]));
+    }
+    
+    private class TrueFilesSizeVisitor extends SimpleFileVisitor<Path>
+    {
+        private final AtomicLong size = new AtomicLong(0);
+        private final Set<String> visited = newHashSet(); //count each file only once
+        private final Set<String> alive;
+        private final String prefix = getSSTablePrefix();
+
+        public TrueFilesSizeVisitor()
+        {
+            super();
+            Builder<String> builder = ImmutableSet.builder();
+            for (File file: sstableLister().listFiles())
+                builder.add(file.getName());
+            alive = builder.build();
+        }
+
+        private boolean isAcceptable(Path file)
+        {
+            String fileName = file.toFile().getName(); 
+            return fileName.startsWith(prefix)
+                    && !visited.contains(fileName)
+                    && !alive.contains(fileName);
+        }
+
+        @Override
+        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
+        {
+            if (isAcceptable(file))
+            {
+                size.addAndGet(attrs.size());
+                visited.add(file.toFile().getName());
+            }
+            return FileVisitResult.CONTINUE;
+        }
+
+        @Override
+        public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException 
+        {
+            return FileVisitResult.CONTINUE;
+        }
+        
+        public long getAllocatedSize()
+        {
+            return size.get();
+        }
     }
 }
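
A minimal, self-contained sketch of the 2.1-style directory naming that the Directories constructor above builds (<keyspace>/<table>-<hex cfId>), using only the JDK rather than Cassandra's ByteBufferUtil; the keyspace, table name, and UUID here are hypothetical, not taken from the codebase:

import java.io.File;
import java.nio.ByteBuffer;
import java.util.UUID;

// Sketch only: mirrors the <keyspace>/<table>-<hex cfId> layout described above.
// "ks1" and "users" are made-up names; the UUID is an arbitrary example value.
public class DirectoryNameSketch
{
    // Hex-encode the 16 raw bytes of the cfId UUID, analogous to
    // ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(metadata.cfId)) in the diff above.
    static String hexCfId(UUID cfId)
    {
        ByteBuffer b = ByteBuffer.allocate(16);
        b.putLong(cfId.getMostSignificantBits());
        b.putLong(cfId.getLeastSignificantBits());
        StringBuilder sb = new StringBuilder(32);
        for (byte x : b.array())
            sb.append(String.format("%02x", x & 0xff));
        return sb.toString();
    }

    public static void main(String[] args)
    {
        UUID cfId = UUID.fromString("5a1c395e-b41f-11e5-9f22-ba0be0483c18");
        String dir = "ks1" + File.separator + "users-" + hexCfId(cfId);
        System.out.println(dir); // ks1/users-5a1c395eb41f11e59f22ba0be0483c18
    }
}
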
diff --git a/src/java/org/apache/cassandra/db/EmptyColumns.java b/src/java/org/apache/cassandra/db/EmptyColumns.java
deleted file mode 100644
index 129ddc5..0000000
--- a/src/java/org/apache/cassandra/db/EmptyColumns.java
+++ /dev/null
@@ -1,120 +0,0 @@
-package org.apache.cassandra.db;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.Allocator;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterators;
-
-public class EmptyColumns extends AbstractThreadUnsafeSortedColumns
-{
-    public static final Factory<EmptyColumns> factory = new Factory<EmptyColumns>()
-    {
-        public EmptyColumns create(CFMetaData metadata, boolean insertReversed)
-        {
-            assert !insertReversed;
-            return new EmptyColumns(metadata, DeletionInfo.live());
-        }
-    };
-
-    public EmptyColumns(CFMetaData metadata, DeletionInfo info)
-    {
-        super(metadata, info);
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new EmptyColumns(metadata, deletionInfo);
-    }
-
-    public void clear()
-    {
-    }
-
-    public Factory<EmptyColumns> getFactory()
-    {
-        return factory;
-    }
-
-    public void addColumn(Column column, Allocator allocator)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean replace(Column oldColumn, Column newColumn)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Column getColumn(ByteBuffer name)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Iterable<ByteBuffer> getColumnNames()
-    {
-        return Collections.emptyList();
-    }
-
-    public Collection<Column> getSortedColumns()
-    {
-        return Collections.emptyList();
-    }
-
-    public Collection<Column> getReverseSortedColumns()
-    {
-        return Collections.emptyList();
-    }
-
-    public int getColumnCount()
-    {
-        return 0;
-    }
-
-    public Iterator<Column> iterator(ColumnSlice[] slices)
-    {
-        return Iterators.emptyIterator();
-    }
-
-    public Iterator<Column> reverseIterator(ColumnSlice[] slices)
-    {
-        return Iterators.emptyIterator();
-    }
-
-    public boolean isInsertReversed()
-    {
-        return false;
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/ExpiringCell.java b/src/java/org/apache/cassandra/db/ExpiringCell.java
new file mode 100644
index 0000000..5fc0f94
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ExpiringCell.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+/**
+ * Alternative to Cell that has an expiring time.
+ * ExpiringCell is immutable (as Cell is).
+ *
+ * Note that ExpiringCell does not override Cell.getMarkedForDeleteAt,
+ * which means that it's in the somewhat unintuitive position of being deleted (after its expiration)
+ * without having a time-at-which-it-became-deleted.  (Because ttl is a server-side measurement,
+ * we can't mix it with the timestamp field, which is client-supplied and whose resolution we
+ * can't assume anything about.)
+ */
+public interface ExpiringCell extends Cell
+{
+    public static final int MAX_TTL = 20 * 365 * 24 * 60 * 60; // 20 years in seconds
+
+    public int getTimeToLive();
+
+    ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
+
+    ExpiringCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
+}
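
A minimal sketch of the expiration rule the ExpiringCell javadoc above describes (and that the removed ExpiringColumn below implemented): a cell written with a TTL carries localExpirationTime = write time in seconds + ttl, and reads as deleted once the current time reaches it. The class and helper names here are hypothetical, not Cassandra API:

// Sketch only: not Cassandra code. Shows TTL bookkeeping in server-side seconds,
// kept separate from the client-supplied timestamp as the javadoc above notes.
public class ExpirationSketch
{
    static final int MAX_TTL = 20 * 365 * 24 * 60 * 60; // 20 years in seconds, as in ExpiringCell

    // Matches ExpiringColumn.isMarkedForDelete in the removed file below:
    // expired once "now" (converted to seconds) reaches the local expiration time.
    static boolean isExpired(int localExpirationTime, long nowMillis)
    {
        return (int) (nowMillis / 1000) >= localExpirationTime;
    }

    public static void main(String[] args)
    {
        int ttl = 3600; // one hour
        int localExpirationTime = (int) (System.currentTimeMillis() / 1000) + ttl;

        System.out.println(isExpired(localExpirationTime, System.currentTimeMillis()));                   // false
        System.out.println(isExpired(localExpirationTime, System.currentTimeMillis() + 2L * ttl * 1000)); // true
    }
}
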
diff --git a/src/java/org/apache/cassandra/db/ExpiringColumn.java b/src/java/org/apache/cassandra/db/ExpiringColumn.java
deleted file mode 100644
index e11567f..0000000
--- a/src/java/org/apache/cassandra/db/ExpiringColumn.java
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.Allocator;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
-
-/**
- * Alternative to Column that have an expiring time.
- * ExpiringColumn is immutable (as Column is).
- *
- * Note that ExpiringColumn does not override Column.getMarkedForDeleteAt,
- * which means that it's in the somewhat unintuitive position of being deleted (after its expiration)
- * without having a time-at-which-it-became-deleted.  (Because ttl is a server-side measurement,
- * we can't mix it with the timestamp field, which is client-supplied and whose resolution we
- * can't assume anything about.)
- */
-public class ExpiringColumn extends Column
-{
-    public static final int MAX_TTL = 20 * 365 * 24 * 60 * 60; // 20 years in seconds
-
-    private final int localExpirationTime;
-    private final int timeToLive;
-
-    public ExpiringColumn(ByteBuffer name, ByteBuffer value, long timestamp, int timeToLive)
-    {
-      this(name, value, timestamp, timeToLive, (int) (System.currentTimeMillis() / 1000) + timeToLive);
-    }
-
-    public ExpiringColumn(ByteBuffer name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime)
-    {
-        super(name, value, timestamp);
-        assert timeToLive > 0 : timeToLive;
-        assert localExpirationTime > 0 : localExpirationTime;
-        this.timeToLive = timeToLive;
-        this.localExpirationTime = localExpirationTime;
-    }
-
-    /** @return Either a DeletedColumn, or an ExpiringColumn. */
-    public static Column create(ByteBuffer name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime, int expireBefore, ColumnSerializer.Flag flag)
-    {
-        if (localExpirationTime >= expireBefore || flag == ColumnSerializer.Flag.PRESERVE_SIZE)
-            return new ExpiringColumn(name, value, timestamp, timeToLive, localExpirationTime);
-        // The column is now expired, we can safely return a simple tombstone. Note that
-        // as long as the expiring column and the tombstone put together live longer than GC grace seconds,
-        // we'll fulfil our responsibility to repair.  See discussion at
-        // http://cassandra-user-incubator-apache-org.3065146.n2.nabble.com/repair-compaction-and-tombstone-rows-td7583481.html
-        return new DeletedColumn(name, localExpirationTime - timeToLive, timestamp);
-    }
-
-    public int getTimeToLive()
-    {
-        return timeToLive;
-    }
-
-    @Override
-    public Column withUpdatedName(ByteBuffer newName)
-    {
-        return new ExpiringColumn(newName, value, timestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public Column withUpdatedTimestamp(long newTimestamp)
-    {
-        return new ExpiringColumn(name, value, newTimestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public int dataSize()
-    {
-        return super.dataSize() + TypeSizes.NATIVE.sizeof(localExpirationTime) + TypeSizes.NATIVE.sizeof(timeToLive);
-    }
-
-    @Override
-    public int serializedSize(TypeSizes typeSizes)
-    {
-        /*
-         * An expired column adds to a Column :
-         *    4 bytes for the localExpirationTime
-         *  + 4 bytes for the timeToLive
-        */
-        return super.serializedSize(typeSizes) + typeSizes.sizeof(localExpirationTime) + typeSizes.sizeof(timeToLive);
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name.duplicate());
-        digest.update(value.duplicate());
-
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        try
-        {
-            buffer.writeLong(timestamp);
-            buffer.writeByte(serializationFlags());
-            buffer.writeInt(timeToLive);
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        digest.update(buffer.getData(), 0, buffer.getLength());
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-        return localExpirationTime;
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs)
-    {
-        return new ExpiringColumn(cfs.internOrCopy(name, HeapAllocator.instance), ByteBufferUtil.clone(value), timestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public Column localCopy(ColumnFamilyStore cfs, Allocator allocator)
-    {
-        ByteBuffer clonedName = cfs.maybeIntern(name);
-        if (clonedName == null)
-            clonedName = allocator.clone(name);
-        return new ExpiringColumn(clonedName, allocator.clone(value), timestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public String getString(AbstractType<?> comparator)
-    {
-        StringBuilder sb = new StringBuilder();
-        sb.append(super.getString(comparator));
-        sb.append("!");
-        sb.append(timeToLive);
-        return sb.toString();
-    }
-
-    @Override
-    public boolean isMarkedForDelete(long now)
-    {
-        return (int) (now / 1000) >= getLocalDeletionTime();
-    }
-
-    @Override
-    public long getMarkedForDeleteAt()
-    {
-        return timestamp;
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.EXPIRATION_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        super.validateFields(metadata);
-        if (timeToLive <= 0)
-            throw new MarshalException("A column TTL should be > 0");
-        if (localExpirationTime < 0)
-            throw new MarshalException("The local expiration time should not be negative");
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        // super.equals() returns false if o is not a CounterColumn
-        return super.equals(o)
-            && localExpirationTime == ((ExpiringColumn)o).localExpirationTime
-            && timeToLive == ((ExpiringColumn)o).timeToLive;
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int result = super.hashCode();
-        result = 31 * result + localExpirationTime;
-        result = 31 * result + timeToLive;
-        return result;
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/HintedHandOffManager.java b/src/java/org/apache/cassandra/db/HintedHandOffManager.java
index a6b6d4c..2eb49c7 100644
--- a/src/java/org/apache/cassandra/db/HintedHandOffManager.java
+++ b/src/java/org/apache/cassandra/db/HintedHandOffManager.java
@@ -40,12 +40,14 @@
 
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.Composites;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.dht.IPartitioner;
@@ -101,7 +103,6 @@
 
     private volatile boolean hintedHandOffPaused = false;
 
-    static final CompositeType comparator = CompositeType.getInstance(Arrays.<AbstractType<?>>asList(UUIDType.instance, Int32Type.instance));
     static final int maxHintTTL = Integer.parseInt(System.getProperty("cassandra.maxHintTTL", String.valueOf(Integer.MAX_VALUE)));
 
     private final NonBlockingHashSet<InetAddress> queuedDeliveries = new NonBlockingHashSet<InetAddress>();
@@ -119,7 +120,7 @@
      * Returns a mutation representing a Hint to be sent to <code>targetId</code>
      * as soon as it becomes available again.
      */
-    public RowMutation hintFor(RowMutation mutation, long now, int ttl, UUID targetId)
+    public Mutation hintFor(Mutation mutation, long now, int ttl, UUID targetId)
     {
         assert ttl > 0;
 
@@ -132,19 +133,19 @@
 
         UUID hintId = UUIDGen.getTimeUUID();
         // serialize the hint with id and version as a composite column name
-        ByteBuffer name = comparator.decompose(hintId, MessagingService.current_version);
-        ByteBuffer value = ByteBuffer.wrap(FBUtilities.serialize(mutation, RowMutation.serializer, MessagingService.current_version));
+        CellName name = CFMetaData.HintsCf.comparator.makeCellName(hintId, MessagingService.current_version);
+        ByteBuffer value = ByteBuffer.wrap(FBUtilities.serialize(mutation, Mutation.serializer, MessagingService.current_version));
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Schema.instance.getCFMetaData(Keyspace.SYSTEM_KS, SystemKeyspace.HINTS_CF));
         cf.addColumn(name, value, now, ttl);
-        return new RowMutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(targetId), cf);
+        return new Mutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(targetId), cf);
     }
 
     /*
-     * determine the TTL for the hint RowMutation
+     * determine the TTL for the hint Mutation
      * this is set at the smallest GCGraceSeconds for any of the CFs in the RM
      * this ensures that deletes aren't "undone" by delivery of an old hint
      */
-    public static int calculateHintTTL(RowMutation mutation)
+    public static int calculateHintTTL(Mutation mutation)
     {
         int ttl = maxHintTTL;
         for (ColumnFamily cf : mutation.getColumnFamilies())
@@ -177,11 +178,11 @@
         StorageService.optionalTasks.scheduleWithFixedDelay(runnable, 10, 10, TimeUnit.MINUTES);
     }
 
-    private static void deleteHint(ByteBuffer tokenBytes, ByteBuffer columnName, long timestamp)
+    private static void deleteHint(ByteBuffer tokenBytes, CellName columnName, long timestamp)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, tokenBytes);
-        rm.delete(SystemKeyspace.HINTS_CF, columnName, timestamp);
-        rm.applyUnsafe(); // don't bother with commitlog since we're going to flush as soon as we're done with delivery
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, tokenBytes);
+        mutation.delete(SystemKeyspace.HINTS_CF, columnName, timestamp);
+        mutation.applyUnsafe(); // don't bother with commitlog since we're going to flush as soon as we're done with delivery
     }
 
     public void deleteHintsForEndpoint(final String ipOrHostname)
@@ -204,8 +205,8 @@
             return;
         UUID hostId = StorageService.instance.getTokenMetadata().getHostId(endpoint);
         ByteBuffer hostIdBytes = ByteBuffer.wrap(UUIDGen.decompose(hostId));
-        final RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, hostIdBytes);
-        rm.delete(SystemKeyspace.HINTS_CF, System.currentTimeMillis());
+        final Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, hostIdBytes);
+        mutation.delete(SystemKeyspace.HINTS_CF, System.currentTimeMillis());
 
         // execute asynchronously to avoid blocking caller (which may be processing gossip)
         Runnable runnable = new Runnable()
@@ -215,7 +216,7 @@
                 try
                 {
                     logger.info("Deleting any stored hints for {}", endpoint);
-                    rm.apply();
+                    mutation.apply();
                     hintStore.forceBlockingFlush();
                     compact();
                 }
@@ -267,11 +268,11 @@
         }
     }
 
-    private static boolean pagingFinished(ColumnFamily hintColumnFamily, ByteBuffer startColumn)
+    private static boolean pagingFinished(ColumnFamily hintColumnFamily, Composite startColumn)
     {
         // done if no hints found or the start column (same as last column processed in previous iteration) is the only one
         return hintColumnFamily == null
-               || (hintColumnFamily.getSortedColumns().size() == 1 && hintColumnFamily.getColumn(startColumn) != null);
+               || (!startColumn.isEmpty() && hintColumnFamily.getSortedColumns().size() == 1 && hintColumnFamily.getColumn((CellName)startColumn) != null);
     }
 
     private int waitForSchemaAgreement(InetAddress endpoint) throws TimeoutException
@@ -353,7 +354,7 @@
         DecoratedKey epkey =  StorageService.getPartitioner().decorateKey(hostIdBytes);
 
         final AtomicInteger rowsReplayed = new AtomicInteger(0);
-        ByteBuffer startColumn = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        Composite startColumn = Composites.EMPTY;
 
         int pageSize = calculatePageSize();
         logger.debug("Using pageSize of {}", pageSize);
@@ -371,7 +372,7 @@
             QueryFilter filter = QueryFilter.getSliceFilter(epkey,
                                                             SystemKeyspace.HINTS_CF,
                                                             startColumn,
-                                                            ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                                            Composites.EMPTY,
                                                             false,
                                                             pageSize,
                                                             now);
@@ -392,7 +393,7 @@
             }
 
             List<WriteResponseHandler> responseHandlers = Lists.newArrayList();
-            for (final Column hint : hintsPage)
+            for (final Cell hint : hintsPage)
             {
                 // check if hints delivery has been paused during the process
                 if (hintedHandOffPaused)
@@ -406,23 +407,22 @@
                 // in which the local deletion timestamp was generated on the last column in the old page, in which
                 // case the hint will have no columns (since it's deleted) but will still be included in the resultset
                 // since (even with gcgs=0) it's still a "relevant" tombstone.
-                if (!hint.isLive(System.currentTimeMillis()))
+                if (!hint.isLive())
                     continue;
 
                 startColumn = hint.name();
 
-                ByteBuffer[] components = comparator.split(hint.name());
-                int version = Int32Type.instance.compose(components[1]);
+                int version = Int32Type.instance.compose(hint.name().get(1));
                 DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(hint.value()));
-                RowMutation rm;
+                Mutation mutation;
                 try
                 {
-                    rm = RowMutation.serializer.deserialize(in, version);
+                    mutation = Mutation.serializer.deserialize(in, version);
                 }
                 catch (UnknownColumnFamilyException e)
                 {
                     logger.debug("Skipping delivery of hint for deleted columnfamily", e);
-                    deleteHint(hostIdBytes, hint.name(), hint.maxTimestamp());
+                    deleteHint(hostIdBytes, hint.name(), hint.timestamp());
                     continue;
                 }
                 catch (IOException e)
@@ -430,29 +430,29 @@
                     throw new AssertionError(e);
                 }
 
-                for (UUID cfId : rm.getColumnFamilyIds())
+                for (UUID cfId : mutation.getColumnFamilyIds())
                 {
-                    if (hint.maxTimestamp() <= SystemKeyspace.getTruncatedAt(cfId))
+                    if (hint.timestamp() <= SystemKeyspace.getTruncatedAt(cfId))
                     {
                         logger.debug("Skipping delivery of hint for truncated columnfamily {}", cfId);
-                        rm = rm.without(cfId);
+                        mutation = mutation.without(cfId);
                     }
                 }
 
-                if (rm.isEmpty())
+                if (mutation.isEmpty())
                 {
-                    deleteHint(hostIdBytes, hint.name(), hint.maxTimestamp());
+                    deleteHint(hostIdBytes, hint.name(), hint.timestamp());
                     continue;
                 }
 
-                MessageOut<RowMutation> message = rm.createMessage();
+                MessageOut<Mutation> message = mutation.createMessage();
                 rateLimiter.acquire(message.serializedSize(MessagingService.current_version));
                 Runnable callback = new Runnable()
                 {
                     public void run()
                     {
                         rowsReplayed.incrementAndGet();
-                        deleteHint(hostIdBytes, hint.name(), hint.maxTimestamp());
+                        deleteHint(hostIdBytes, hint.name(), hint.timestamp());
                     }
                 };
                 WriteResponseHandler responseHandler = new WriteResponseHandler(endpoint, WriteType.SIMPLE, callback);
@@ -490,7 +490,7 @@
             return PAGE_SIZE;
 
         // page size of 1 does not allow actual paging b/c of >= behavior on startColumn
-        return Math.max(2, Math.min(PAGE_SIZE, DatabaseDescriptor.getInMemoryCompactionLimit() / averageColumnSize));
+        return Math.max(2, Math.min(PAGE_SIZE, 4 * 1024 * 1024 / averageColumnSize));
     }
 
     /**
@@ -509,11 +509,11 @@
         IPartitioner p = StorageService.getPartitioner();
         RowPosition minPos = p.getMinimumToken().minKeyBound();
         Range<RowPosition> range = new Range<RowPosition>(minPos, minPos, p);
-        IDiskAtomFilter filter = new NamesQueryFilter(ImmutableSortedSet.<ByteBuffer>of());
+        IDiskAtomFilter filter = new NamesQueryFilter(ImmutableSortedSet.<CellName>of());
         List<Row> rows = hintStore.getRangeSlice(range, null, filter, Integer.MAX_VALUE, System.currentTimeMillis());
         for (Row row : rows)
         {
-            UUID hostId = UUIDGen.getUUID(row.key.key);
+            UUID hostId = UUIDGen.getUUID(row.key.getKey());
             InetAddress target = StorageService.instance.getTokenMetadata().getEndpointForHostId(hostId);
             // token may have since been removed (in which case we have just read back a tombstone)
             if (target != null)
@@ -576,7 +576,7 @@
         for (Row row : getHintsSlice(1))
         {
             if (row.cf != null) //ignore removed rows
-                result.addFirst(tokenFactory.toString(row.key.token));
+                result.addFirst(tokenFactory.toString(row.key.getToken()));
         }
         return result;
     }
@@ -584,8 +584,7 @@
     private List<Row> getHintsSlice(int columnCount)
     {
         // Get count # of columns...
-        SliceQueryFilter predicate = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                          ByteBufferUtil.EMPTY_BYTE_BUFFER,
+        SliceQueryFilter predicate = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY,
                                                           false,
                                                           columnCount);
 
diff --git a/src/java/org/apache/cassandra/db/IMutation.java b/src/java/org/apache/cassandra/db/IMutation.java
index 70bd79c..44df104 100644
--- a/src/java/org/apache/cassandra/db/IMutation.java
+++ b/src/java/org/apache/cassandra/db/IMutation.java
@@ -26,7 +26,7 @@
     public String getKeyspaceName();
     public Collection<UUID> getColumnFamilyIds();
     public ByteBuffer key();
-    public void apply();
+    public long getTimeout();
     public String toString(boolean shallow);
     public void addAll(IMutation m);
     public Collection<ColumnFamily> getColumnFamilies();
diff --git a/src/java/org/apache/cassandra/db/IndexExpression.java b/src/java/org/apache/cassandra/db/IndexExpression.java
new file mode 100644
index 0000000..b57890a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/IndexExpression.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class IndexExpression
+{
+    public final ByteBuffer column;
+    public final Operator operator;
+    public final ByteBuffer value;
+
+    public IndexExpression(ByteBuffer column, Operator operator, ByteBuffer value)
+    {
+        this.column = column;
+        this.operator = operator;
+        this.value = value;
+    }
+
+    public enum Operator
+    {
+        EQ, GTE, GT, LTE, LT, CONTAINS, CONTAINS_KEY;
+
+        public static Operator findByOrdinal(int ordinal)
+        {
+            switch (ordinal) {
+                case 0:
+                    return EQ;
+                case 1:
+                    return GTE;
+                case 2:
+                    return GT;
+                case 3:
+                    return LTE;
+                case 4:
+                    return LT;
+                case 5:
+                    return CONTAINS;
+                case 6:
+                    return CONTAINS_KEY;
+                default:
+                    throw new AssertionError();
+            }
+        }
+
+        public boolean allowsIndexQuery()
+        {
+            switch (this)
+            {
+                case EQ:
+                case CONTAINS:
+                case CONTAINS_KEY:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s %s %s", ByteBufferUtil.bytesToHex(column), operator, ByteBufferUtil.bytesToHex(value));
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof IndexExpression))
+            return false;
+
+        IndexExpression ie = (IndexExpression) o;
+
+        return Objects.equal(this.column, ie.column)
+            && Objects.equal(this.operator, ie.operator)
+            && Objects.equal(this.value, ie.value);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(column, operator, value);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index 28045f4..d27424e 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java
@@ -19,11 +19,16 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.Future;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
@@ -35,6 +40,7 @@
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
@@ -43,6 +49,7 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.metrics.KeyspaceMetrics;
 
 /**
@@ -55,14 +62,6 @@
 
     private static final Logger logger = LoggerFactory.getLogger(Keyspace.class);
 
-    /**
-     * accesses to CFS.memtable should acquire this for thread safety.
-     * CFS.maybeSwitchMemtable should aquire the writeLock; see that method for the full explanation.
-     * <p/>
-     * (Enabling fairness in the RRWL is observed to decrease throughput, so we leave it off.)
-     */
-    public static final ReentrantReadWriteLock switchLock = new ReentrantReadWriteLock();
-
     public final KeyspaceMetrics metric;
 
     // It is possible to call Keyspace.open without a running daemon, so it makes sense to ensure
@@ -74,6 +73,7 @@
     }
 
     public final KSMetaData metadata;
+    public final OpOrder writeOrder = new OpOrder();
 
     /* ColumnFamilyStore per column family */
     private final ConcurrentMap<UUID, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<UUID, ColumnFamilyStore>();
@@ -87,11 +87,19 @@
         }
     };
 
+    private static volatile boolean initialized = false;
+    public static void setInitialized()
+    {
+        initialized = true;
+    }
+
     public static Keyspace open(String keyspaceName)
     {
+        assert initialized || keyspaceName.equals(SYSTEM_KS);
         return open(keyspaceName, Schema.instance, true);
     }
 
+    // to only be used by org.apache.cassandra.tools.Standalone* classes
     public static Keyspace openWithoutSSTables(String keyspaceName)
     {
         return open(keyspaceName, Schema.instance, false);
@@ -290,6 +298,10 @@
         if (cfs == null)
             return;
 
+        // wait for any outstanding reads/writes that might affect the CFS
+        cfs.keyspace.writeOrder.awaitNewBarrier();
+        cfs.readOrdering.awaitNewBarrier();
+
         unloadCf(cfs);
     }
 
@@ -334,7 +346,7 @@
         return new Row(filter.key, columnFamily);
     }
 
-    public void apply(RowMutation mutation, boolean writeCommitLog)
+    public void apply(Mutation mutation, boolean writeCommitLog)
     {
         apply(mutation, writeCommitLog, true);
     }
@@ -347,17 +359,16 @@
      * @param writeCommitLog false to disable commitlog append entirely
      * @param updateIndexes  false to disable index updates (used by CollationController "defragmenting")
      */
-    public void apply(RowMutation mutation, boolean writeCommitLog, boolean updateIndexes)
+    public void apply(Mutation mutation, boolean writeCommitLog, boolean updateIndexes)
     {
-        // write the mutation to the commitlog and memtables
-        Tracing.trace("Acquiring switchLock read lock");
-        switchLock.readLock().lock();
-        try
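+        // an OpOrder group replaces the old global switchLock: each write registers with the keyspace's
+        // writeOrder so that memtable switches can wait for in-flight writes via a barrier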
+        try (OpOrder.Group opGroup = writeOrder.start())
         {
+            // write the mutation to the commitlog and memtables
+            ReplayPosition replayPosition = null;
             if (writeCommitLog)
             {
                 Tracing.trace("Appending to commitlog");
-                CommitLog.instance.add(mutation);
+                replayPosition = CommitLog.instance.add(mutation);
             }
 
             DecoratedKey key = StorageService.getPartitioner().decorateKey(mutation.key());
@@ -366,18 +377,17 @@
                 ColumnFamilyStore cfs = columnFamilyStores.get(cf.id());
                 if (cfs == null)
                 {
-                    logger.error("Attempting to mutate non-existant column family " + cf.id());
+                    logger.error("Attempting to mutate non-existent column family {}", cf.id());
                     continue;
                 }
 
                 Tracing.trace("Adding to {} memtable", cf.metadata().cfName);
-                cfs.apply(key, cf, updateIndexes ? cfs.indexManager.updaterFor(key, cf) : SecondaryIndexManager.nullUpdater);
+                SecondaryIndexManager.Updater updater = updateIndexes
+                                                      ? cfs.indexManager.updaterFor(key, cf, opGroup)
+                                                      : SecondaryIndexManager.nullUpdater;
+                cfs.apply(key, cf, updater, opGroup, replayPosition);
             }
         }
-        finally
-        {
-            switchLock.readLock().unlock();
-        }
     }
 
     public AbstractReplicationStrategy getReplicationStrategy()
@@ -393,30 +403,25 @@
     public static void indexRow(DecoratedKey key, ColumnFamilyStore cfs, Set<String> idxNames)
     {
         if (logger.isDebugEnabled())
-            logger.debug("Indexing row {} ", cfs.metadata.getKeyValidator().getString(key.key));
+            logger.debug("Indexing row {} ", cfs.metadata.getKeyValidator().getString(key.getKey()));
 
-        Set<SecondaryIndex> indexes = cfs.indexManager.getIndexesByNames(idxNames);
-
-        switchLock.readLock().lock();
-        try
+        try (OpOrder.Group opGroup = cfs.keyspace.writeOrder.start())
         {
-            Iterator<ColumnFamily> pager = QueryPagers.pageRowLocally(cfs, key.key, DEFAULT_PAGE_SIZE);
+            Set<SecondaryIndex> indexes = cfs.indexManager.getIndexesByNames(idxNames);
+
+            Iterator<ColumnFamily> pager = QueryPagers.pageRowLocally(cfs, key.getKey(), DEFAULT_PAGE_SIZE);
             while (pager.hasNext())
             {
                 ColumnFamily cf = pager.next();
                 ColumnFamily cf2 = cf.cloneMeShallow();
-                for (Column column : cf)
+                for (Cell cell : cf)
                 {
-                    if (cfs.indexManager.indexes(column.name(), indexes))
-                        cf2.addColumn(column);
+                    if (cfs.indexManager.indexes(cell.name(), indexes))
+                        cf2.addColumn(cell);
                 }
-                cfs.indexManager.indexRow(key.key, cf2);
+                cfs.indexManager.indexRow(key.getKey(), cf2, opGroup);
             }
         }
-        finally
-        {
-            switchLock.readLock().unlock();
-        }
     }
 
     public List<Future<?>> flush()
diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
index f9a6719..b0d2a11 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -18,136 +18,83 @@
 package org.apache.cassandra.db;
 
 import java.io.File;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.AbstractMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
 
-import com.google.common.base.Function;
 import com.google.common.base.Throwables;
-import org.cliffc.high_scale_lib.NonBlockingHashSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
-import org.apache.cassandra.concurrent.NamedThreadFactory;
-import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
+import org.apache.cassandra.dht.LongToken;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.DiskAwareRunnable;
-import org.apache.cassandra.utils.Allocator;
-import org.github.jamm.MemoryMeter;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.*;
 
 public class Memtable
 {
     private static final Logger logger = LoggerFactory.getLogger(Memtable.class);
 
-    // size in memory can never be less than serialized size.
-    private static final double MIN_SANE_LIVE_RATIO = 1.0;
-    // max liveratio seen w/ 1-byte columns on a 64-bit jvm was 19. If it gets higher than 64 something is probably broken.
-    private static final double MAX_SANE_LIVE_RATIO = 64.0;
-    // reasonable initial live ratio used until we compute one.
-    private static final double INITIAL_LIVE_RATIO = 10.0;
+    static final MemtablePool MEMORY_POOL = DatabaseDescriptor.getMemtableAllocatorPool();
+    private static final int ROW_OVERHEAD_HEAP_SIZE = estimateRowOverhead(Integer.valueOf(System.getProperty("cassandra.memtable_row_overhead_computation_step", "100000")));
 
-    // ratio of in-memory memtable size, to serialized size
-    private volatile double liveRatio = INITIAL_LIVE_RATIO;
-    // ops count last time we computed liveRatio
-    private final AtomicLong liveRatioComputedAt = new AtomicLong(32);
-
-    /*
-     * switchMemtable puts Memtable.getSortedContents on the writer executor.  When the write is complete,
-     * we turn the writer into an SSTableReader and add it to ssTables where it is available for reads.
-     *
-     * There are two other things that switchMemtable does.
-     * First, it puts the Memtable into memtablesPendingFlush, where it stays until the flush is complete
-     * and it's been added as an SSTableReader to ssTables_.  Second, it adds an entry to commitLogUpdater
-     * that waits for the flush to complete, then calls onMemtableFlush.  This allows multiple flushes
-     * to happen simultaneously on multicore systems, while still calling onMF in the correct order,
-     * which is necessary for replay in case of a restart since CommitLog assumes that when onMF is
-     * called, all data up to the given context has been persisted to SSTables.
-     */
-    private static final ExecutorService flushWriter
-            = new JMXEnabledThreadPoolExecutor(DatabaseDescriptor.getFlushWriters(),
-                                               StageManager.KEEPALIVE,
-                                               TimeUnit.SECONDS,
-                                               new LinkedBlockingQueue<Runnable>(DatabaseDescriptor.getFlushQueueSize()),
-                                               new NamedThreadFactory("FlushWriter"),
-                                               "internal");
-
-    // We need to take steps to avoid retaining inactive membtables in memory, because counting is slow (can be
-    // minutes, for a large memtable and a busy server).  A strictly FIFO Memtable queue could keep memtables
-    // alive waiting for metering after they're flushed and would otherwise be GC'd.  Instead, the approach we take
-    // is to enqueue the CFS instead of the memtable, and to meter whatever the active memtable is when the executor
-    // starts to work on it.  We use a Set to make sure we don't enqueue redundant tasks for the same CFS.
-    private static final Set<ColumnFamilyStore> meteringInProgress = new NonBlockingHashSet<ColumnFamilyStore>();
-    private static final ExecutorService meterExecutor = new JMXEnabledThreadPoolExecutor(1,
-                                                                                          Integer.MAX_VALUE,
-                                                                                          TimeUnit.MILLISECONDS,
-                                                                                          new LinkedBlockingQueue<Runnable>(),
-                                                                                          new NamedThreadFactory("MemoryMeter"),
-                                                                                          "internal");
-    private final MemoryMeter meter;
-
-    volatile static ColumnFamilyStore activelyMeasuring;
-
-    private final AtomicLong currentSize = new AtomicLong(0);
+    private final MemtableAllocator allocator;
+    private final AtomicLong liveDataSize = new AtomicLong(0);
     private final AtomicLong currentOperations = new AtomicLong(0);
 
+    // the write barrier for directing writes to this memtable during a switch
+    private volatile OpOrder.Barrier writeBarrier;
+    // the last ReplayPosition owned by this Memtable; all ReplayPositions lower are owned by this or an earlier Memtable
+    private final AtomicReference<ReplayPosition> lastReplayPosition = new AtomicReference<>();
+    // the "first" ReplayPosition owned by this Memtable; this is inaccurate, and only used as a convenience to prevent CLSM flushing wantonly
+    private final ReplayPosition minReplayPosition = CommitLog.instance.getContext();
+
     // We index the memtable by RowPosition only for the purpose of being able
     // to select key range using Token.KeyBound. However put() ensures that we
     // actually only store DecoratedKey.
-    private final ConcurrentNavigableMap<RowPosition, AtomicSortedColumns> rows = new ConcurrentSkipListMap<RowPosition, AtomicSortedColumns>();
+    private final ConcurrentNavigableMap<RowPosition, AtomicBTreeColumns> rows = new ConcurrentSkipListMap<>();
     public final ColumnFamilyStore cfs;
     private final long creationTime = System.currentTimeMillis();
     private final long creationNano = System.nanoTime();
 
-    private final Allocator allocator = DatabaseDescriptor.getMemtableAllocator();
-    // We really only need one column by allocator but one by memtable is not a big waste and avoids needing allocators to know about CFS
-    private final Function<Column, Column> localCopyFunction = new Function<Column, Column>()
-    {
-        public Column apply(Column c)
-        {
-            return c.localCopy(cfs, allocator);
-        }
-    };
-
     // Record the comparator of the CFS at the creation of the memtable. This
     // is only used when a user update the CF comparator, to know if the
     // memtable was created with the new or old comparator.
-    public final AbstractType initialComparator;
+    public final CellNameType initialComparator;
 
     public Memtable(ColumnFamilyStore cfs)
     {
         this.cfs = cfs;
+        this.allocator = MEMORY_POOL.newAllocator();
         this.initialComparator = cfs.metadata.comparator;
         this.cfs.scheduleFlush();
-
-        Callable<Set<Object>> provider = new Callable<Set<Object>>()
-        {
-            public Set<Object> call() throws Exception
-            {
-                // avoid counting this once for each row
-                Set<Object> set = Collections.newSetFromMap(new IdentityHashMap<Object, Boolean>());
-                set.add(Memtable.this.cfs.metadata);
-                return set;
-            }
-        };
-        meter = new MemoryMeter().omitSharedBufferOverhead().withTrackerProvider(provider);
     }
 
-    public long getLiveSize()
+    public MemtableAllocator getAllocator()
     {
-        long estimatedSize = (long) (currentSize.get() * liveRatio);
+        return allocator;
+    }
 
-        // liveRatio is just an estimate; we can get a lower bound directly from the allocator
-        if (estimatedSize < allocator.getMinimumSize())
-            return allocator.getMinimumSize();
-
-        return estimatedSize;
+    public long getLiveDataSize()
+    {
+        return liveDataSize.get();
     }
 
     public long getOperations()
@@ -155,131 +102,28 @@
         return currentOperations.get();
     }
 
-    /**
-     * Should only be called by ColumnFamilyStore.apply.  NOT a public API.
-     * (CFS handles locking to avoid submitting an op
-     *  to a flushing memtable.  Any other way is unsafe.)
-    */
-    void put(DecoratedKey key, ColumnFamily columnFamily, SecondaryIndexManager.Updater indexer)
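+    // called when this memtable is switched out: records the barrier that fences off new writes and the
+    // replay position floor, and marks the allocator as discarding so its memory can be reclaimed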
+    void setDiscarding(OpOrder.Barrier writeBarrier, ReplayPosition minLastReplayPosition)
     {
-        resolve(key, columnFamily, indexer);
+        assert this.writeBarrier == null;
+        this.lastReplayPosition.set(minLastReplayPosition);
+        this.writeBarrier = writeBarrier;
+        allocator.setDiscarding();
     }
 
-    public void maybeUpdateLiveRatio()
+    void setDiscarded()
     {
-        // recompute liveRatio, if we have increased the number of ops 10x since last calculated
-        while (true)
-        {
-            long last = liveRatioComputedAt.get();
-            long operations = currentOperations.get();
-            if (operations <= 10L * last)
-                break;
-            if (liveRatioComputedAt.compareAndSet(last, operations))
-            {
-                logger.debug("computing liveRatio of {} at {} ops", this, operations);
-                updateLiveRatio();
-            }
-        }
+        allocator.setDiscarded();
     }
 
-    public void updateLiveRatio() throws RuntimeException
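+    // returns true if writes in the given op group may still be applied to this memtable, i.e. either no
+    // write barrier has been issued yet or the group started before the barrier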
+    public boolean accepts(OpOrder.Group opGroup)
     {
-        if (!MemoryMeter.isInitialized())
-        {
-            // hack for openjdk.  we log a warning about this in the startup script too.
-            logger.error("MemoryMeter uninitialized (jamm not specified as java agent); assuming liveRatio of {}.  "
-                         + " Usually this means cassandra-env.sh disabled jamm because you are using a buggy JRE; "
-                         + " upgrade to the Sun JRE instead", liveRatio);
-            return;
-        }
-
-        if (!meteringInProgress.add(cfs))
-        {
-            logger.debug("Metering already pending or active for {}; skipping liveRatio update", cfs);
-            return;
-        }
-
-        meterExecutor.submit(new MeteringRunnable(cfs));
+        OpOrder.Barrier barrier = this.writeBarrier;
+        return barrier == null || barrier.isAfter(opGroup);
     }
 
-    private void resolve(DecoratedKey key, ColumnFamily cf, SecondaryIndexManager.Updater indexer)
+    public boolean isLive()
     {
-        AtomicSortedColumns previous = rows.get(key);
-
-        if (previous == null)
-        {
-            AtomicSortedColumns empty = cf.cloneMeShallow(AtomicSortedColumns.factory, false);
-            // We'll add the columns later. This avoids wasting works if we get beaten in the putIfAbsent
-            previous = rows.putIfAbsent(new DecoratedKey(key.token, allocator.clone(key.key)), empty);
-            if (previous == null)
-                previous = empty;
-        }
-
-        long sizeDelta = previous.addAllWithSizeDelta(cf, allocator, localCopyFunction, indexer);
-        currentSize.addAndGet(sizeDelta);
-        currentOperations.addAndGet(cf.getColumnCount() + (cf.isMarkedForDelete() ? 1 : 0) + cf.deletionInfo().rangeCount());
-    }
-
-    // for debugging
-    public String contents()
-    {
-        StringBuilder builder = new StringBuilder();
-        builder.append("{");
-        for (Map.Entry<RowPosition, AtomicSortedColumns> entry : rows.entrySet())
-        {
-            builder.append(entry.getKey()).append(": ").append(entry.getValue()).append(", ");
-        }
-        builder.append("}");
-        return builder.toString();
-    }
-
-    public void flushAndSignal(final CountDownLatch latch, final Future<ReplayPosition> context)
-    {
-        flushWriter.execute(new FlushRunnable(latch, context));
-    }
-
-    public String toString()
-    {
-        return String.format("Memtable-%s@%s(%s/%s serialized/live bytes, %s ops)",
-                             cfs.name, hashCode(), currentSize, getLiveSize(), currentOperations);
-    }
-
-    /**
-     * @param startWith Include data in the result from and including this key and to the end of the memtable
-     * @return An iterator of entries with the data from the start key
-     */
-    public Iterator<Map.Entry<DecoratedKey, AtomicSortedColumns>> getEntryIterator(final RowPosition startWith, final RowPosition stopAt)
-    {
-        return new Iterator<Map.Entry<DecoratedKey, AtomicSortedColumns>>()
-        {
-            private Iterator<Map.Entry<RowPosition, AtomicSortedColumns>> iter = stopAt.isMinimum(cfs.partitioner)
-                                                                               ? rows.tailMap(startWith).entrySet().iterator()
-                                                                               : rows.subMap(startWith, true, stopAt, true).entrySet().iterator();
-            private Map.Entry<RowPosition, AtomicSortedColumns> currentEntry;
-
-            public boolean hasNext()
-            {
-                return iter.hasNext();
-            }
-
-            public Map.Entry<DecoratedKey, AtomicSortedColumns> next()
-            {
-                Map.Entry<RowPosition, AtomicSortedColumns> entry = iter.next();
-                // Store the reference to the current entry so that remove() can update the current size.
-                currentEntry = entry;
-                // Actual stored key should be true DecoratedKey
-                assert entry.getKey() instanceof DecoratedKey;
-                // Object cast is required since otherwise we can't turn RowPosition into DecoratedKey
-                return (Map.Entry<DecoratedKey, AtomicSortedColumns>) (Object)entry;
-            }
-
-            public void remove()
-            {
-                iter.remove();
-                currentSize.addAndGet(-currentEntry.getValue().dataSize());
-                currentEntry = null;
-            }
-        };
+        return allocator.isLive();
     }
 
     public boolean isClean()
@@ -287,6 +131,11 @@
         return rows.isEmpty();
     }
 
+    public boolean isCleanAfter(ReplayPosition position)
+    {
+        return isClean() || (position != null && minReplayPosition.compareTo(position) >= 0);
+    }
+
     /**
      * @return true if this memtable is expired. Expiration time is determined by CF's memtable_flush_period_in_ms.
      */
@@ -296,6 +145,124 @@
         return period > 0 && (System.nanoTime() - creationNano >= TimeUnit.MILLISECONDS.toNanos(period));
     }
 
+    /**
+     * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate
+     * OpOrdering.
+     *
+     * replayPosition should only be null if this is a secondary index, whose writes are not recorded in the commitlog and so carry no replay position
+     */
+    void put(DecoratedKey key, ColumnFamily cf, SecondaryIndexManager.Updater indexer, OpOrder.Group opGroup, ReplayPosition replayPosition)
+    {
+        if (replayPosition != null && writeBarrier != null)
+        {
+            // if the writeBarrier is set, we want to maintain lastReplayPosition; this is an optimisation to avoid
+            // CASing it for every write, but still ensure it is correct when writeBarrier.await() completes.
+            while (true)
+            {
+                ReplayPosition last = lastReplayPosition.get();
+                if (last.compareTo(replayPosition) >= 0)
+                    break;
+                if (lastReplayPosition.compareAndSet(last, replayPosition))
+                    break;
+            }
+        }
+
+        AtomicBTreeColumns previous = rows.get(key);
+
+        if (previous == null)
+        {
+            AtomicBTreeColumns empty = cf.cloneMeShallow(AtomicBTreeColumns.factory, false);
+            final DecoratedKey cloneKey = allocator.clone(key, opGroup);
+            // We'll add the columns later. This avoids wasting work if we get beaten in the putIfAbsent
+            previous = rows.putIfAbsent(cloneKey, empty);
+            if (previous == null)
+            {
+                previous = empty;
+                // allocate the row overhead after the fact; this saves allocating up front and having to free it if we lose the race, but
+                // means we can overshoot our declared limit.
+                int overhead = (int) (cfs.partitioner.getHeapSizeOf(key.getToken()) + ROW_OVERHEAD_HEAP_SIZE);
+                allocator.onHeap().allocate(overhead, opGroup);
+            }
+            else
+            {
+                allocator.reclaimer().reclaimImmediately(cloneKey);
+            }
+        }
+
+        liveDataSize.addAndGet(previous.addAllWithSizeDelta(cf, allocator, opGroup, indexer));
+        currentOperations.addAndGet(cf.getColumnCount() + (cf.isMarkedForDelete() ? 1 : 0) + cf.deletionInfo().rangeCount());
+    }
+
+    // for debugging
+    public String contents()
+    {
+        StringBuilder builder = new StringBuilder();
+        builder.append("{");
+        for (Map.Entry<RowPosition, AtomicBTreeColumns> entry : rows.entrySet())
+        {
+            builder.append(entry.getKey()).append(": ").append(entry.getValue()).append(", ");
+        }
+        builder.append("}");
+        return builder.toString();
+    }
+
+    public FlushRunnable flushRunnable()
+    {
+        return new FlushRunnable(lastReplayPosition.get());
+    }
+
+    public String toString()
+    {
+        return String.format("Memtable-%s@%s(%s serialized bytes, %s ops, %.0f%%/%.0f%% of on/off-heap limit)",
+                             cfs.name, hashCode(), liveDataSize, currentOperations, 100 * allocator.onHeap().ownershipRatio(), 100 * allocator.offHeap().ownershipRatio());
+    }
+
+    /**
+     * @param startWith Include data in the result from and including this key and to the end of the memtable
+     * @return An iterator of entries with the data from the start key
+     */
+    public Iterator<Map.Entry<DecoratedKey, ColumnFamily>> getEntryIterator(final RowPosition startWith, final RowPosition stopAt)
+    {
+        return new Iterator<Map.Entry<DecoratedKey, ColumnFamily>>()
+        {
+            private Iterator<? extends Map.Entry<? extends RowPosition, AtomicBTreeColumns>> iter = stopAt.isMinimum(cfs.partitioner)
+                    ? rows.tailMap(startWith).entrySet().iterator()
+                    : rows.subMap(startWith, true, stopAt, true).entrySet().iterator();
+
+            private Map.Entry<? extends RowPosition, ? extends ColumnFamily> currentEntry;
+
+            public boolean hasNext()
+            {
+                return iter.hasNext();
+            }
+
+            public Map.Entry<DecoratedKey, ColumnFamily> next()
+            {
+                Map.Entry<? extends RowPosition, ? extends ColumnFamily> entry = iter.next();
+                // Actual stored key should be true DecoratedKey
+                assert entry.getKey() instanceof DecoratedKey;
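+                // when the pool keeps memtable data off-heap, copy the key and cells on-heap so callers
+                // keep a valid reference after the memtable's buffers are recycled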
+                if (MEMORY_POOL.needToCopyOnHeap())
+                {
+                    DecoratedKey key = (DecoratedKey) entry.getKey();
+                    key = new BufferDecoratedKey(key.getToken(), HeapAllocator.instance.clone(key.getKey()));
+                    ColumnFamily cells = ArrayBackedSortedColumns.localCopy(entry.getValue(), HeapAllocator.instance);
+                    entry = new AbstractMap.SimpleImmutableEntry<>(key, cells);
+                }
+                // Store the reference to the current entry so that remove() can update the current size.
+                currentEntry = entry;
+                // Object cast is required since otherwise we can't turn RowPosition into DecoratedKey
+                return (Map.Entry<DecoratedKey, ColumnFamily>) entry;
+            }
+
+            public void remove()
+            {
+                iter.remove();
+                liveDataSize.addAndGet(-currentEntry.getValue().dataSize());
+                currentEntry = null;
+            }
+        };
+    }
+
     public ColumnFamily getColumnFamily(DecoratedKey key)
     {
         return rows.get(key);
@@ -306,15 +273,18 @@
         return creationTime;
     }
 
+    public ReplayPosition getLastReplayPosition()
+    {
+        return lastReplayPosition.get();
+    }
+
     class FlushRunnable extends DiskAwareRunnable
     {
-        private final CountDownLatch latch;
-        private final Future<ReplayPosition> context;
+        private final ReplayPosition context;
         private final long estimatedSize;
 
-        FlushRunnable(CountDownLatch latch, Future<ReplayPosition> context)
+        FlushRunnable(ReplayPosition context)
         {
-            this.latch = latch;
             this.context = context;
 
             long keySize = 0;
@@ -322,11 +292,11 @@
             {
                 //  make sure we don't write non-sensical keys
                 assert key instanceof DecoratedKey;
-                keySize += ((DecoratedKey)key).key.remaining();
+                keySize += ((DecoratedKey)key).getKey().remaining();
             }
             estimatedSize = (long) ((keySize // index entries
                                     + keySize // keys in data file
-                                    + currentSize.get()) // data
+                                    + liveDataSize.get()) // data
                                     * 1.2); // bloom filter and row index overhead
         }
 
@@ -341,7 +311,6 @@
 
             SSTableReader sstable = writeSortedContents(context, sstableDirectory);
             cfs.replaceFlushed(Memtable.this, sstable);
-            latch.countDown();
         }
 
         protected Directories getDirectories()
@@ -349,59 +318,62 @@
             return cfs.directories;
         }
 
-        private SSTableReader writeSortedContents(Future<ReplayPosition> context, File sstableDirectory)
+        private SSTableReader writeSortedContents(ReplayPosition context, File sstableDirectory)
         throws ExecutionException, InterruptedException
         {
-            logger.info("Writing " + Memtable.this.toString());
+            logger.info("Writing {}", Memtable.this.toString());
 
             SSTableReader ssTable;
             // errors when creating the writer that may leave empty temp files.
             SSTableWriter writer = createFlushWriter(cfs.getTempSSTablePath(sstableDirectory));
             try
             {
+                boolean trackContention = logger.isDebugEnabled();
+                int heavilyContendedRowCount = 0;
                 // (we can't clear out the map as-we-go to free up memory,
                 //  since the memtable is being used for queries in the "pending flush" category)
-                for (Map.Entry<RowPosition, AtomicSortedColumns> entry : rows.entrySet())
+                for (Map.Entry<RowPosition, AtomicBTreeColumns> entry : rows.entrySet())
                 {
-                    ColumnFamily cf = entry.getValue();
-                    if (cf.isMarkedForDelete())
+                    AtomicBTreeColumns cf = entry.getValue();
+
+                    if (cf.isMarkedForDelete() && cf.hasColumns())
                     {
                         // When every node is up, there's no reason to write batchlog data out to sstables
                         // (which in turn incurs cost like compaction) since the BL write + delete cancel each other out,
                         // and BL data is strictly local, so we don't need to preserve tombstones for repair.
                         // If we have a data row + row level tombstone, then writing it is effectively an expensive no-op so we skip it.
                         // See CASSANDRA-4667.
-                        if (cfs.name.equals(SystemKeyspace.BATCHLOG_CF) && cfs.keyspace.getName().equals(Keyspace.SYSTEM_KS) && !(cf.getColumnCount() == 0))
+                        if (cfs.name.equals(SystemKeyspace.BATCHLOG_CF) && cfs.keyspace.getName().equals(Keyspace.SYSTEM_KS))
                             continue;
-
-                        // Pedantically, you could purge column level tombstones that are past GcGRace when writing to the SSTable.
-                        // But it can result in unexpected behaviour where deletes never make it to disk,
-                        // as they are lost and so cannot override existing column values. So we only remove deleted columns if there
-                        // is a CF level tombstone to ensure the delete makes it into an SSTable.
-                        // We also shouldn't be dropping any columns obsoleted by partition and/or range tombstones in case
-                        // the table has secondary indexes, or else the stale entries wouldn't be cleaned up during compaction,
-                        // and will only be dropped during 2i query read-repair, if at all.
-                        if (!cfs.indexManager.hasIndexes())
-                            currentSize.addAndGet(-ColumnFamilyStore.removeDeletedColumnsOnly(cf, Integer.MIN_VALUE));
                     }
 
-                    if (cf.getColumnCount() > 0 || cf.isMarkedForDelete())
+                    if (trackContention && cf.usePessimisticLocking())
+                        heavilyContendedRowCount++;
+
+                    if (!cf.isEmpty())
                         writer.append((DecoratedKey)entry.getKey(), cf);
                 }
 
                 if (writer.getFilePointer() > 0)
                 {
+                    writer.isolateReferences();
+
+                    // temp sstables should contain non-repaired data.
                     ssTable = writer.closeAndOpenReader();
                     logger.info(String.format("Completed flushing %s (%d bytes) for commitlog position %s",
-                                              ssTable.getFilename(), new File(ssTable.getFilename()).length(), context.get()));
+                                              ssTable.getFilename(), new File(ssTable.getFilename()).length(), context));
                 }
                 else
                 {
                     writer.abort();
                     ssTable = null;
                     logger.info("Completed flushing; nothing needed to be retained.  Commitlog position was {}",
-                                context.get());
+                                context);
                 }
+
+                if (heavilyContendedRowCount > 0)
+                    logger.debug(String.format("High update contention in %d/%d partitions of %s ", heavilyContendedRowCount, rows.size(), Memtable.this.toString()));
+
                 return ssTable;
             }
             catch (Throwable e)
@@ -413,71 +385,32 @@
 
         public SSTableWriter createFlushWriter(String filename) throws ExecutionException, InterruptedException
         {
-            SSTableMetadata.Collector sstableMetadataCollector = SSTableMetadata.createCollector(cfs.metadata.comparator).replayPosition(context.get());
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.metadata.comparator).replayPosition(context);
             return new SSTableWriter(filename,
                                      rows.size(),
+                                     ActiveRepairService.UNREPAIRED_SSTABLE,
                                      cfs.metadata,
                                      cfs.partitioner,
                                      sstableMetadataCollector);
         }
     }
 
-    private static class MeteringRunnable implements Runnable
+    private static int estimateRowOverhead(final int count)
     {
-        // we might need to wait in the meter queue for a while.  measure whichever memtable is active at that point,
-        // rather than keeping the original memtable referenced (and thus un-freeable) until this runs.
-        private final ColumnFamilyStore cfs;
-
-        public MeteringRunnable(ColumnFamilyStore cfs)
-        {
-            this.cfs = cfs;
-        }
-
-        public void run()
-        {
-            try
-            {
-                activelyMeasuring = cfs;
-                Memtable memtable = cfs.getMemtableThreadSafe();
-
-                long start = System.nanoTime();
-                // ConcurrentSkipListMap has cycles, so measureDeep will have to track a reference to EACH object it visits.
-                // So to reduce the memory overhead of doing a measurement, we break it up to row-at-a-time.
-                long deepSize = memtable.meter.measure(memtable.rows);
-                int objects = 0;
-                for (Map.Entry<RowPosition, AtomicSortedColumns> entry : memtable.rows.entrySet())
-                {
-                    deepSize += memtable.meter.measureDeep(entry.getKey()) + memtable.meter.measureDeep(entry.getValue());
-                    objects += entry.getValue().getColumnCount();
-                }
-                double newRatio = (double) deepSize / memtable.currentSize.get();
-
-                if (newRatio < MIN_SANE_LIVE_RATIO)
-                {
-                    logger.debug("setting live ratio to minimum of {} instead of {}", MIN_SANE_LIVE_RATIO, newRatio);
-                    newRatio = MIN_SANE_LIVE_RATIO;
-                }
-                if (newRatio > MAX_SANE_LIVE_RATIO)
-                {
-                    logger.debug("setting live ratio to maximum of {} instead of {}", MAX_SANE_LIVE_RATIO, newRatio);
-                    newRatio = MAX_SANE_LIVE_RATIO;
-                }
-
-                // we want to be very conservative about our estimate, since the penalty for guessing low is OOM
-                // death. thus, higher estimates are believed immediately; lower ones are averaged w/ the old
-                if (newRatio > memtable.liveRatio)
-                    memtable.liveRatio = newRatio;
-                else
-                    memtable.liveRatio = (memtable.liveRatio + newRatio) / 2.0;
-
-                logger.debug("{} liveRatio is {} (just-counted was {}).  calculation took {}ms for {} cells",
-                             cfs, memtable.liveRatio, newRatio, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start), objects);
-            }
-            finally
-            {
-                activelyMeasuring = null;
-                meteringInProgress.remove(cfs);
-            }
-        }
+        // calculate row overhead
+        final OpOrder.Group group = new OpOrder().start();
+        int rowOverhead;
+        MemtableAllocator allocator = MEMORY_POOL.newAllocator();
+        ConcurrentNavigableMap<RowPosition, Object> rows = new ConcurrentSkipListMap<>();
+        final Object val = new Object();
+        for (int i = 0 ; i < count ; i++)
+            rows.put(allocator.clone(new BufferDecoratedKey(new LongToken((long) i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);
+        double avgSize = ObjectSizes.measureDeep(rows) / (double) count;
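+        // round the measured average up unless the fractional part is negligible, then subtract the token
+        // (accounted separately when rows are inserted) and add the size of an empty AtomicBTreeColumns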
+        rowOverhead = (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
+        rowOverhead -= ObjectSizes.measureDeep(new LongToken((long) 0));
+        rowOverhead += AtomicBTreeColumns.EMPTY_SIZE;
+        allocator.setDiscarding();
+        allocator.setDiscarded();
+        return rowOverhead;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/MeteredFlusher.java b/src/java/org/apache/cassandra/db/MeteredFlusher.java
deleted file mode 100644
index 4f06bc6..0000000
--- a/src/java/org/apache/cassandra/db/MeteredFlusher.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-
-public class MeteredFlusher implements Runnable
-{
-    private static final Logger logger = LoggerFactory.getLogger(MeteredFlusher.class);
-
-    public void run()
-    {
-        long allowedSize = calculateAllowedSize();
-
-        // find how much memory non-active memtables are using
-        long flushingSize = calculateFlushingSize();
-        if (flushingSize > 0)
-            logger.debug("Currently flushing {} bytes of {} max", flushingSize, allowedSize);
-
-        List<ColumnFamilyStore> affectedCFs = affectedColumnFamilies();
-        long liveSize = 0;
-
-        // flush CFs using more than 1 / (maximum number of memtables it could have in the pipeline)
-        // of the total size allotted. Then, flush other CFs in order of size if necessary.
-        for (ColumnFamilyStore cfs : affectedCFs)
-        {
-            int maxInFlight = (int) Math.ceil((double) (1 // live memtable
-                                                        + 1 // potentially a flushed memtable being counted by jamm
-                                                        + DatabaseDescriptor.getFlushWriters()
-                                                        + DatabaseDescriptor.getFlushQueueSize())
-                                              / (1 + cfs.indexManager.getIndexesBackedByCfs().size()));
-            long size = cfs.getTotalMemtableLiveSize();
-            if (allowedSize > flushingSize && size > (allowedSize - flushingSize) / maxInFlight)
-            {
-                logger.info("flushing high-traffic column family {} (estimated {} bytes)", cfs, size);
-                cfs.forceFlush();
-            }
-            else
-            {
-                liveSize += size;
-            }
-        }
-
-        if (liveSize + flushingSize <= allowedSize)
-            return;
-        logger.info("estimated {} live and {} flushing bytes used by all memtables", liveSize, flushingSize);
-
-        Collections.sort(affectedCFs, new Comparator<ColumnFamilyStore>()
-        {
-            public int compare(ColumnFamilyStore lhs, ColumnFamilyStore rhs)
-            {
-                return Long.compare(lhs.getTotalMemtableLiveSize(), rhs.getTotalMemtableLiveSize());
-            }
-        });
-
-        // flush largest first until we get below our threshold.
-        // although it looks like liveSize + flushingSize will stay a constant, it will not if flushes finish
-        // while we loop, which is especially likely to happen if the flush queue fills up (so further forceFlush calls block)
-        while (!affectedCFs.isEmpty())
-        {
-            flushingSize = calculateFlushingSize();
-            if (liveSize + flushingSize <= allowedSize)
-                break;
-
-            ColumnFamilyStore cfs = affectedCFs.remove(affectedCFs.size() - 1);
-            long size = cfs.getTotalMemtableLiveSize();
-            if (size > 0)
-            {
-                logger.info("flushing {} to free up {} bytes", cfs, size);
-                liveSize -= size;
-                cfs.forceFlush();
-            }
-        }
-
-        logger.trace("memtable memory usage is {} bytes with {} live", liveSize + flushingSize, liveSize);
-    }
-
-    private static List<ColumnFamilyStore> affectedColumnFamilies()
-    {
-        List<ColumnFamilyStore> affected = new ArrayList<>();
-        // filter out column families that aren't affected by MeteredFlusher
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-            if (cfs.getCompactionStrategy().isAffectedByMeteredFlusher())
-                affected.add(cfs);
-        return affected;
-    }
-
-    private static long calculateAllowedSize()
-    {
-        long allowed = DatabaseDescriptor.getTotalMemtableSpaceInMB() * 1048576L;
-        // deduct the combined memory limit of the tables unaffected by the metered flusher (we don't flush them, we
-        // should not count their limits to the total limit either).
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-            if (!cfs.getCompactionStrategy().isAffectedByMeteredFlusher())
-                allowed -= cfs.getCompactionStrategy().getMemtableReservedSize();
-        return allowed;
-    }
-
-    private static long calculateFlushingSize()
-    {
-        ColumnFamilyStore measuredCFS = Memtable.activelyMeasuring;
-        long flushing = measuredCFS != null && measuredCFS.getCompactionStrategy().isAffectedByMeteredFlusher()
-                      ? measuredCFS.getMemtableThreadSafe().getLiveSize()
-                      : 0;
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-            if (cfs.getCompactionStrategy().isAffectedByMeteredFlusher())
-                for (Memtable memtable : cfs.getMemtablesPendingFlush())
-                    flushing += memtable.getLiveSize();
-        return flushing;
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java b/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java
index 31a64a9..d4503ba 100644
--- a/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java
@@ -29,7 +29,7 @@
 import org.apache.cassandra.service.MigrationManager;
 
 /**
- * Sends it's current schema state in form of row mutations in reply to the remote node's request.
+ * Sends its current schema state in the form of mutations in reply to the remote node's request.
  * Such a request is made when one of the nodes, by means of Gossip, detects schema disagreement in the ring.
  */
 public class MigrationRequestVerbHandler implements IVerbHandler
@@ -39,9 +39,9 @@
     public void doVerb(MessageIn message, int id)
     {
         logger.debug("Received migration request from {}.", message.from);
-        MessageOut<Collection<RowMutation>> response = new MessageOut<>(MessagingService.Verb.INTERNAL_RESPONSE,
-                                                                        SystemKeyspace.serializeSchema(),
-                                                                        MigrationManager.MigrationsSerializer.instance);
+        MessageOut<Collection<Mutation>> response = new MessageOut<>(MessagingService.Verb.INTERNAL_RESPONSE,
+                                                                     SystemKeyspace.serializeSchema(),
+                                                                     MigrationManager.MigrationsSerializer.instance);
         MessagingService.instance().sendReply(response, id, message.from);
     }
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/RowMutation.java b/src/java/org/apache/cassandra/db/Mutation.java
similarity index 69%
rename from src/java/org/apache/cassandra/db/RowMutation.java
rename to src/java/org/apache/cassandra/db/Mutation.java
index 223225e..a6d23cb 100644
--- a/src/java/org/apache/cassandra/db/RowMutation.java
+++ b/src/java/org/apache/cassandra/db/Mutation.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -26,58 +25,67 @@
 import org.apache.commons.lang3.StringUtils;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-// TODO convert this to a Builder pattern instead of encouraging RM.add directly,
+// TODO convert this to a Builder pattern instead of encouraging M.add directly,
 // which is less-efficient since we have to keep a mutable HashMap around
-public class RowMutation implements IMutation
+public class Mutation implements IMutation
 {
-    public static final RowMutationSerializer serializer = new RowMutationSerializer();
+    public static final MutationSerializer serializer = new MutationSerializer();
+    private static final Logger logger = LoggerFactory.getLogger(Mutation.class);
+
     public static final String FORWARD_TO = "FWD_TO";
     public static final String FORWARD_FROM = "FWD_FRM";
 
     // todo this is redundant
-    // when we remove it, also restore SerializationsTest.testRowMutationRead to not regenerate new RowMutations each test
+    // when we remove it, also restore SerializationsTest.testMutationRead to not regenerate new Mutations each test
     private final String keyspaceName;
 
     private final ByteBuffer key;
     // map of column family id to mutations for that column family.
     private final Map<UUID, ColumnFamily> modifications;
 
-    public RowMutation(String keyspaceName, ByteBuffer key)
+    public Mutation(String keyspaceName, ByteBuffer key)
     {
         this(keyspaceName, key, new HashMap<UUID, ColumnFamily>());
     }
 
-    public RowMutation(String keyspaceName, ByteBuffer key, ColumnFamily cf)
+    public Mutation(String keyspaceName, ByteBuffer key, ColumnFamily cf)
     {
         this(keyspaceName, key, Collections.singletonMap(cf.id(), cf));
     }
 
-    public RowMutation(String keyspaceName, Row row)
+    public Mutation(String keyspaceName, Row row)
     {
-        this(keyspaceName, row.key.key, row.cf);
+        this(keyspaceName, row.key.getKey(), row.cf);
     }
 
-    protected RowMutation(String keyspaceName, ByteBuffer key, Map<UUID, ColumnFamily> modifications)
+    protected Mutation(String keyspaceName, ByteBuffer key, Map<UUID, ColumnFamily> modifications)
     {
         this.keyspaceName = keyspaceName;
         this.key = key;
         this.modifications = modifications;
     }
 
-    public RowMutation(ByteBuffer key, ColumnFamily cf)
+    public Mutation(ByteBuffer key, ColumnFamily cf)
     {
         this(cf.metadata().ksName, key, cf);
     }
 
-    public RowMutation copy()
+    public Mutation copy()
     {
-        return new RowMutation(keyspaceName, key, new HashMap<>(modifications));
+        Mutation copy = new Mutation(keyspaceName, key, new HashMap<>(modifications));
+        return copy;
     }
 
     public String getKeyspaceName()
@@ -121,7 +129,7 @@
     }
 
     /**
-     * @return the ColumnFamily in this RowMutation corresponding to @param cfName, creating an empty one if necessary.
+     * @return the ColumnFamily in this Mutation corresponding to @param cfName, creating an empty one if necessary.
      */
     public ColumnFamily addOrGet(String cfName)
     {
@@ -130,15 +138,10 @@
 
     public ColumnFamily addOrGet(CFMetaData cfm)
     {
-        return addOrGet(cfm, TreeMapBackedSortedColumns.factory);
-    }
-
-    public ColumnFamily addOrGet(CFMetaData cfm, ColumnFamily.Factory factory)
-    {
         ColumnFamily cf = modifications.get(cfm.cfId);
         if (cf == null)
         {
-            cf = factory.create(cfm);
+            cf = ArrayBackedSortedColumns.factory.create(cfm);
             modifications.put(cfm.cfId, cf);
         }
         return cf;
@@ -149,17 +152,17 @@
         return modifications.isEmpty();
     }
 
-    public void add(String cfName, ByteBuffer name, ByteBuffer value, long timestamp, int timeToLive)
+    public void add(String cfName, CellName name, ByteBuffer value, long timestamp, int timeToLive)
     {
         addOrGet(cfName).addColumn(name, value, timestamp, timeToLive);
     }
 
-    public void addCounter(String cfName, ByteBuffer name, long value)
+    public void addCounter(String cfName, CellName name, long value)
     {
         addOrGet(cfName).addCounter(name, value);
     }
 
-    public void add(String cfName, ByteBuffer name, ByteBuffer value, long timestamp)
+    public void add(String cfName, CellName name, ByteBuffer value, long timestamp)
     {
         add(cfName, name, value, timestamp, 0);
     }
@@ -170,13 +173,13 @@
         addOrGet(cfName).delete(new DeletionInfo(timestamp, localDeleteTime));
     }
 
-    public void delete(String cfName, ByteBuffer name, long timestamp)
+    public void delete(String cfName, CellName name, long timestamp)
     {
         int localDeleteTime = (int) (System.currentTimeMillis() / 1000);
         addOrGet(cfName).addTombstone(name, localDeleteTime, timestamp);
     }
 
-    public void deleteRange(String cfName, ByteBuffer start, ByteBuffer end, long timestamp)
+    public void deleteRange(String cfName, Composite start, Composite end, long timestamp)
     {
         int localDeleteTime = (int) (System.currentTimeMillis() / 1000);
         addOrGet(cfName).addAtom(new RangeTombstone(start, end, timestamp, localDeleteTime));
@@ -184,20 +187,20 @@
 
     public void addAll(IMutation m)
     {
-        if (!(m instanceof RowMutation))
+        if (!(m instanceof Mutation))
             throw new IllegalArgumentException();
 
-        RowMutation rm = (RowMutation)m;
-        if (!keyspaceName.equals(rm.keyspaceName) || !key.equals(rm.key))
+        Mutation mutation = (Mutation)m;
+        if (!keyspaceName.equals(mutation.keyspaceName) || !key.equals(mutation.key))
             throw new IllegalArgumentException();
 
-        for (Map.Entry<UUID, ColumnFamily> entry : rm.modifications.entrySet())
+        for (Map.Entry<UUID, ColumnFamily> entry : mutation.modifications.entrySet())
         {
             // It's slighty faster to assume the key wasn't present and fix if
             // not in the case where it wasn't there indeed.
             ColumnFamily cf = modifications.put(entry.getKey(), entry.getValue());
             if (cf != null)
-                entry.getValue().resolve(cf);
+                entry.getValue().addAll(cf);
         }
     }
 
@@ -216,14 +219,19 @@
         Keyspace.open(keyspaceName).apply(this, false);
     }
 
-    public MessageOut<RowMutation> createMessage()
+    public MessageOut<Mutation> createMessage()
     {
         return createMessage(MessagingService.Verb.MUTATION);
     }
 
-    public MessageOut<RowMutation> createMessage(MessagingService.Verb verb)
+    public MessageOut<Mutation> createMessage(MessagingService.Verb verb)
     {
-        return new MessageOut<RowMutation>(verb, this, serializer);
+        return new MessageOut<>(verb, this, serializer);
+    }
+
+    public long getTimeout()
+    {
+        return DatabaseDescriptor.getWriteRpcTimeout();
     }
 
     public String toString()
@@ -233,7 +241,7 @@
 
     public String toString(boolean shallow)
     {
-        StringBuilder buff = new StringBuilder("RowMutation(");
+        StringBuilder buff = new StringBuilder("Mutation(");
         buff.append("keyspace='").append(keyspaceName).append('\'');
         buff.append(", key='").append(ByteBufferUtil.bytesToHex(key)).append('\'');
         buff.append(", modifications=[");
@@ -252,33 +260,33 @@
         return buff.append("])").toString();
     }
 
-    public RowMutation without(UUID cfId)
+    public Mutation without(UUID cfId)
     {
-        RowMutation rm = new RowMutation(keyspaceName, key);
+        Mutation mutation = new Mutation(keyspaceName, key);
         for (Map.Entry<UUID, ColumnFamily> entry : modifications.entrySet())
             if (!entry.getKey().equals(cfId))
-                rm.add(entry.getValue());
-        return rm;
+                mutation.add(entry.getValue());
+        return mutation;
     }
 
-    public static class RowMutationSerializer implements IVersionedSerializer<RowMutation>
+    public static class MutationSerializer implements IVersionedSerializer<Mutation>
     {
-        public void serialize(RowMutation rm, DataOutput out, int version) throws IOException
+        public void serialize(Mutation mutation, DataOutputPlus out, int version) throws IOException
         {
             if (version < MessagingService.VERSION_20)
-                out.writeUTF(rm.getKeyspaceName());
+                out.writeUTF(mutation.getKeyspaceName());
 
-            ByteBufferUtil.writeWithShortLength(rm.key(), out);
+            ByteBufferUtil.writeWithShortLength(mutation.key(), out);
 
             /* serialize the modifications in the mutation */
-            int size = rm.modifications.size();
+            int size = mutation.modifications.size();
             out.writeInt(size);
             assert size > 0;
-            for (Map.Entry<UUID, ColumnFamily> entry : rm.modifications.entrySet())
+            for (Map.Entry<UUID, ColumnFamily> entry : mutation.modifications.entrySet())
                 ColumnFamily.serializer.serialize(entry.getValue(), out, version);
         }
 
-        public RowMutation deserialize(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
+        public Mutation deserialize(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
         {
             String keyspaceName = null; // will always be set from cf.metadata but javac isn't smart enough to see that
             if (version < MessagingService.VERSION_20)
@@ -306,35 +314,35 @@
                 }
             }
 
-            return new RowMutation(keyspaceName, key, modifications);
+            return new Mutation(keyspaceName, key, modifications);
         }
 
         private ColumnFamily deserializeOneCf(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
         {
-            ColumnFamily cf = ColumnFamily.serializer.deserialize(in, UnsortedColumns.factory, flag, version);
-            // We don't allow RowMutation with null column family, so we should never get null back.
+            ColumnFamily cf = ColumnFamily.serializer.deserialize(in, ArrayBackedSortedColumns.factory, flag, version);
+            // We don't allow a Mutation with a null column family, so we should never get null back.
             assert cf != null;
             return cf;
         }
 
-        public RowMutation deserialize(DataInput in, int version) throws IOException
+        public Mutation deserialize(DataInput in, int version) throws IOException
         {
             return deserialize(in, version, ColumnSerializer.Flag.FROM_REMOTE);
         }
 
-        public long serializedSize(RowMutation rm, int version)
+        public long serializedSize(Mutation mutation, int version)
         {
             TypeSizes sizes = TypeSizes.NATIVE;
             int size = 0;
 
             if (version < MessagingService.VERSION_20)
-                size += sizes.sizeof(rm.getKeyspaceName());
+                size += sizes.sizeof(mutation.getKeyspaceName());
 
-            int keySize = rm.key().remaining();
+            int keySize = mutation.key().remaining();
             size += sizes.sizeof((short) keySize) + keySize;
 
-            size += sizes.sizeof(rm.modifications.size());
-            for (Map.Entry<UUID,ColumnFamily> entry : rm.modifications.entrySet())
+            size += sizes.sizeof(mutation.modifications.size());
+            for (Map.Entry<UUID,ColumnFamily> entry : mutation.modifications.entrySet())
                 size += ColumnFamily.serializer.serializedSize(entry.getValue(), TypeSizes.NATIVE, version);
 
             return size;
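
Note: the MutationSerializer hunk above keeps the pre-2.0 compatibility quirk — the keyspace name only goes on the wire for old peers, followed by the short-length-prefixed partition key and the column-family count. Below is a minimal, self-contained sketch of that framing, using plain java.io streams and placeholder byte[] payloads instead of Cassandra's ColumnFamily serializer; VERSION_20 here is an illustrative constant, not the real MessagingService value.

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;

    final class MutationFramingSketch
    {
        static final int VERSION_20 = 7; // illustrative placeholder, not the real MessagingService.VERSION_20

        static byte[] frame(String keyspace, byte[] key, List<byte[]> serializedCfs, int version) throws IOException
        {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(bytes);

            if (version < VERSION_20)
                out.writeUTF(keyspace);         // only pre-2.0 peers expect the keyspace name on the wire

            out.writeShort(key.length);         // partition key is written with a short length prefix
            out.write(key);

            out.writeInt(serializedCfs.size()); // one serialized column family per modification
            for (byte[] cf : serializedCfs)
                out.write(cf);

            out.flush();
            return bytes.toByteArray();
        }

        public static void main(String[] args) throws IOException
        {
            byte[] framed = frame("ks1", new byte[]{ 'p', 'k' }, Arrays.asList(new byte[]{ 1, 2, 3 }), VERSION_20);
            System.out.println(framed.length + " bytes framed");
        }
    }
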
diff --git a/src/java/org/apache/cassandra/db/RowMutationVerbHandler.java b/src/java/org/apache/cassandra/db/MutationVerbHandler.java
similarity index 69%
rename from src/java/org/apache/cassandra/db/RowMutationVerbHandler.java
rename to src/java/org/apache/cassandra/db/MutationVerbHandler.java
index da7fa6a..43ffeae 100644
--- a/src/java/org/apache/cassandra/db/RowMutationVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/MutationVerbHandler.java
@@ -28,39 +28,37 @@
 import org.apache.cassandra.net.*;
 import org.apache.cassandra.tracing.Tracing;
 
-public class RowMutationVerbHandler implements IVerbHandler<RowMutation>
+public class MutationVerbHandler implements IVerbHandler<Mutation>
 {
-    private static final Logger logger = LoggerFactory.getLogger(RowMutationVerbHandler.class);
+    private static final Logger logger = LoggerFactory.getLogger(MutationVerbHandler.class);
 
-    public void doVerb(MessageIn<RowMutation> message, int id)
+    public void doVerb(MessageIn<Mutation> message, int id)
     {
         try
         {
-            RowMutation rm = message.payload;
-
             // Check if there were any forwarding headers in this message
-            byte[] from = message.parameters.get(RowMutation.FORWARD_FROM);
+            byte[] from = message.parameters.get(Mutation.FORWARD_FROM);
             InetAddress replyTo;
             if (from == null)
             {
                 replyTo = message.from;
-                byte[] forwardBytes = message.parameters.get(RowMutation.FORWARD_TO);
+                byte[] forwardBytes = message.parameters.get(Mutation.FORWARD_TO);
                 if (forwardBytes != null)
-                    forwardToLocalNodes(rm, message.verb, forwardBytes, message.from, message.version);
+                    forwardToLocalNodes(message.payload, message.verb, forwardBytes, message.from);
             }
             else
             {
                 replyTo = InetAddress.getByAddress(from);
             }
 
-            rm.apply();
+            message.payload.apply();
             WriteResponse response = new WriteResponse();
             Tracing.trace("Enqueuing response to {}", replyTo);
             MessagingService.instance().sendReply(response.createMessage(), id, replyTo);
         }
         catch (IOException e)
         {
-            logger.error("Error in row mutation", e);
+            logger.error("Error in mutation", e);
         }
     }
 
@@ -68,27 +66,18 @@
      * Older versions (< 1.0) will not send this message at all, hence we don't
      * need to check the version of the data.
      */
-    private void forwardToLocalNodes(RowMutation rm, MessagingService.Verb verb, byte[] forwardBytes, InetAddress from, int version) throws IOException
+    private void forwardToLocalNodes(Mutation mutation, MessagingService.Verb verb, byte[] forwardBytes, InetAddress from) throws IOException
     {
         DataInputStream in = new DataInputStream(new FastByteArrayInputStream(forwardBytes));
         int size = in.readInt();
 
         // tell the recipients who to send their ack to
-        MessageOut<RowMutation> message = new MessageOut<RowMutation>(verb, rm, RowMutation.serializer).withParameter(RowMutation.FORWARD_FROM, from.getAddress());
+        MessageOut<Mutation> message = new MessageOut<>(verb, mutation, Mutation.serializer).withParameter(Mutation.FORWARD_FROM, from.getAddress());
         // Send a message to each of the addresses on our Forward List
         for (int i = 0; i < size; i++)
         {
             InetAddress address = CompactEndpointSerializationHelper.deserialize(in);
-            int id;
-            if (version < MessagingService.VERSION_20)
-            {
-                String s = in.readUTF();
-                id = Integer.parseInt(s);
-            }
-            else
-            {
-                id = in.readInt();
-            }
+            int id = in.readInt();
             Tracing.trace("Enqueuing forwarded write to {}", address);
             MessagingService.instance().sendOneWay(message, id, address);
         }
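
Note: the rewritten forwardToLocalNodes above now assumes every forwarded message id is a plain int (the pre-2.0 stringified-id branch is gone). A rough sketch of the FORWARD_TO parameter layout it reads follows; the one-byte-length address encoding stands in for CompactEndpointSerializationHelper and is an assumption of this sketch, not that class's documented format.

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.net.InetAddress;
    import java.util.LinkedHashMap;
    import java.util.Map;

    final class ForwardListSketch
    {
        static byte[] encode(Map<InetAddress, Integer> targets) throws IOException
        {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(bytes);
            out.writeInt(targets.size());                   // recipient count
            for (Map.Entry<InetAddress, Integer> e : targets.entrySet())
            {
                byte[] addr = e.getKey().getAddress();
                out.writeByte(addr.length);                 // compact address: length byte + raw bytes
                out.write(addr);
                out.writeInt(e.getValue());                 // the message id each recipient acks with
            }
            out.flush();
            return bytes.toByteArray();
        }

        static Map<InetAddress, Integer> decode(byte[] forwardBytes) throws IOException
        {
            DataInputStream in = new DataInputStream(new ByteArrayInputStream(forwardBytes));
            int size = in.readInt();
            Map<InetAddress, Integer> targets = new LinkedHashMap<>();
            for (int i = 0; i < size; i++)
            {
                byte[] addr = new byte[in.readUnsignedByte()];
                in.readFully(addr);
                targets.put(InetAddress.getByAddress(addr), in.readInt());
            }
            return targets;
        }

        public static void main(String[] args) throws IOException
        {
            Map<InetAddress, Integer> targets = new LinkedHashMap<>();
            targets.put(InetAddress.getByAddress(new byte[]{ 127, 0, 0, 1 }), 42);
            System.out.println(decode(encode(targets)));    // {/127.0.0.1=42}
        }
    }
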
diff --git a/src/java/org/apache/cassandra/db/NativeCell.java b/src/java/org/apache/cassandra/db/NativeCell.java
new file mode 100644
index 0000000..dac5674
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/NativeCell.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+
+public class NativeCell extends AbstractNativeCell
+{
+    private static final long SIZE = ObjectSizes.measure(new NativeCell());
+
+    NativeCell()
+    {}
+
+    public NativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf)
+    {
+        super(allocator, writeOp, copyOf);
+    }
+
+    @Override
+    public CellName name()
+    {
+        return this;
+    }
+
+    @Override
+    public long timestamp()
+    {
+        return getLong(TIMESTAMP_OFFSET);
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferCell(copy(metadata, allocator), allocator.clone(value()), timestamp());
+    }
+
+    @Override
+    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        updateWithName(digest);  // name
+        updateWithValue(digest); // value
+
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithByte(digest, serializationFlags());
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return SIZE;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return SIZE;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/NativeCounterCell.java b/src/java/org/apache/cassandra/db/NativeCounterCell.java
new file mode 100644
index 0000000..3fe73ce
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/NativeCounterCell.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+
+public class NativeCounterCell extends NativeCell implements CounterCell
+{
+    private static final long SIZE = ObjectSizes.measure(new NativeCounterCell());
+
+    private NativeCounterCell()
+    {}
+
+    public NativeCounterCell(NativeAllocator allocator, OpOrder.Group writeOp, CounterCell copyOf)
+    {
+        super(allocator, writeOp, copyOf);
+    }
+
+    @Override
+    protected void construct(Cell from)
+    {
+        super.construct(from);
+        setLong(internalSize() - 8, ((CounterCell) from).timestampOfLastDelete());
+    }
+
+    @Override
+    protected int postfixSize()
+    {
+        return 8;
+    }
+
+    @Override
+    protected int sizeOf(Cell cell)
+    {
+        return 8 + super.sizeOf(cell);
+    }
+
+    @Override
+    public long timestampOfLastDelete()
+    {
+        return getLong(internalSize() - 8);
+    }
+
+    @Override
+    public long total()
+    {
+        return contextManager.total(value());
+    }
+
+    @Override
+    public boolean hasLegacyShards()
+    {
+        return contextManager.hasLegacyShards(value());
+    }
+
+    @Override
+    public Cell markLocalToBeCleared()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Cell diff(Cell cell)
+    {
+        return diffCounter(cell);
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        return reconcileCounter(cell);
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.COUNTER_MASK;
+    }
+
+    @Override
+    public int cellDataSize()
+    {
+        // A counter cell adds 8 bytes for timestampOfLastDelete to Cell.
+        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(timestampOfLastDelete());
+    }
+
+    @Override
+    public int serializedSize(CellNameType type, TypeSizes typeSizes)
+    {
+        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(timestampOfLastDelete());
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        validateName(metadata);
+        // We cannot use the value validator as we do for other columns because CounterColumnType validates a long,
+        // which is not the internal representation of counters
+        contextManager.validateContext(value());
+    }
+
+    /*
+     * We have to special-case digest creation for counter cells because
+     * we don't want to include the information about which shard of the
+     * context is a delta or not, since this information differs from node to
+     * node.
+     */
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        updateWithName(digest);
+
+        // We don't take the deltas into account in a digest
+        contextManager.updateDigest(digest, value());
+
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithByte(digest, serializationFlags());
+        FBUtilities.updateWithLong(digest, timestampOfLastDelete());
+    }
+
+    @Override
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s(%s:false:%s@%d!%d)",
+                             getClass().getSimpleName(),
+                             comparator.getString(name()),
+                             contextManager.toString(value()),
+                             timestamp(),
+                             timestampOfLastDelete());
+    }
+
+    @Override
+    public CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferCounterCell(copy(metadata, allocator), allocator.clone(value()), timestamp(), timestampOfLastDelete());
+    }
+
+    @Override
+    public CounterCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return SIZE;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return SIZE;
+    }
+
+    public boolean equals(Cell cell)
+    {
+        return cell instanceof CounterCell && equals((CounterCell) cell);
+    }
+
+    public boolean equals(CounterCell cell)
+    {
+        return super.equals(cell) && timestampOfLastDelete() == cell.timestampOfLastDelete();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
new file mode 100644
index 0000000..79dc53f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.MemoryUtil;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+
+public class NativeDecoratedKey extends DecoratedKey
+{
+    final long peer;
+
+    public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group writeOp, ByteBuffer key)
+    {
+        super(token);
+        assert key != null;
+        int size = key.remaining();
+        this.peer = allocator.allocate(4 + size, writeOp);
+        MemoryUtil.setInt(peer, size);
+        MemoryUtil.setBytes(peer + 4, key);
+    }
+
+    public ByteBuffer getKey()
+    {
+        return MemoryUtil.getByteBuffer(peer + 4, MemoryUtil.getInt(peer));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/NativeDeletedCell.java b/src/java/org/apache/cassandra/db/NativeDeletedCell.java
new file mode 100644
index 0000000..e900635
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/NativeDeletedCell.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemoryUtil;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+
+public class NativeDeletedCell extends NativeCell implements DeletedCell
+{
+    private static final long SIZE = ObjectSizes.measure(new NativeDeletedCell());
+
+    private NativeDeletedCell()
+    {}
+
+    public NativeDeletedCell(NativeAllocator allocator, OpOrder.Group writeOp, DeletedCell copyOf)
+    {
+        super(allocator, writeOp, copyOf);
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        if (cell instanceof DeletedCell)
+            return super.reconcile(cell);
+        return cell.reconcile(this);
+    }
+
+    @Override
+    public boolean isLive()
+    {
+        return false;
+    }
+
+    @Override
+    public boolean isLive(long now)
+    {
+        return false;
+    }
+
+    @Override
+    public int getLocalDeletionTime()
+    {
+        int v = getInt(valueStartOffset());
+        return MemoryUtil.INVERTED_ORDER ? Integer.reverseBytes(v) : v;
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.DELETION_MASK;
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        validateName(metadata);
+
+        if ((int) (internalSize() - valueStartOffset()) != 4)
+            throw new MarshalException("A tombstone value should be 4 bytes long");
+        if (getLocalDeletionTime() < 0)
+            throw new MarshalException("The local deletion time should not be negative");
+    }
+
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        updateWithName(digest);
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithByte(digest, serializationFlags());
+    }
+
+    @Override
+    public DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferDeletedCell(copy(metadata, allocator), allocator.clone(value()), timestamp());
+    }
+
+    @Override
+    public DeletedCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public boolean equals(Cell cell)
+    {
+        return timestamp() == cell.timestamp() && getLocalDeletionTime() == cell.getLocalDeletionTime() && name().equals(cell.name());
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return SIZE;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return SIZE;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/NativeExpiringCell.java b/src/java/org/apache/cassandra/db/NativeExpiringCell.java
new file mode 100644
index 0000000..d97e080
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/NativeExpiringCell.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+
+public class NativeExpiringCell extends NativeCell implements ExpiringCell
+{
+    private static final long SIZE = ObjectSizes.measure(new NativeExpiringCell());
+
+    private NativeExpiringCell()
+    {}
+
+    public NativeExpiringCell(NativeAllocator allocator, OpOrder.Group writeOp, ExpiringCell copyOf)
+    {
+        super(allocator, writeOp, copyOf);
+    }
+
+    @Override
+    protected int sizeOf(Cell cell)
+    {
+        return super.sizeOf(cell) + 8;
+    }
+
+    @Override
+    protected void construct(Cell from)
+    {
+        ExpiringCell expiring = (ExpiringCell) from;
+
+        setInt(internalSize() - 4, expiring.getTimeToLive());
+        setInt(internalSize() - 8, expiring.getLocalDeletionTime());
+        super.construct(from);
+    }
+
+    @Override
+    protected int postfixSize()
+    {
+        return 8;
+    }
+
+    @Override
+    public int getTimeToLive()
+    {
+        return getInt(internalSize() - 4);
+    }
+
+    @Override
+    public int getLocalDeletionTime()
+    {
+        return getInt(internalSize() - 8);
+    }
+
+    @Override
+    public boolean isLive()
+    {
+        return isLive(System.currentTimeMillis());
+    }
+
+    @Override
+    public boolean isLive(long now)
+    {
+        return (int) (now / 1000) < getLocalDeletionTime();
+    }
+
+    @Override
+    public int serializationFlags()
+    {
+        return ColumnSerializer.EXPIRATION_MASK;
+    }
+
+    @Override
+    public int cellDataSize()
+    {
+        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(getLocalDeletionTime()) + TypeSizes.NATIVE.sizeof(getTimeToLive());
+    }
+
+    @Override
+    public int serializedSize(CellNameType type, TypeSizes typeSizes)
+    {
+        /*
+         * An expiring cell adds to a Cell:
+         *    4 bytes for the localExpirationTime
+         *  + 4 bytes for the timeToLive
+         */
+        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(getLocalDeletionTime()) + typeSizes.sizeof(getTimeToLive());
+    }
+
+    @Override
+    public void validateFields(CFMetaData metadata) throws MarshalException
+    {
+        super.validateFields(metadata);
+
+        if (getTimeToLive() <= 0)
+            throw new MarshalException("A column TTL should be > 0");
+        if (getLocalDeletionTime() < 0)
+            throw new MarshalException("The local expiration time should not be negative");
+    }
+
+    @Override
+    public void updateDigest(MessageDigest digest)
+    {
+        super.updateDigest(digest);
+        FBUtilities.updateWithInt(digest, getTimeToLive());
+    }
+
+    @Override
+    public Cell reconcile(Cell cell)
+    {
+        long ts1 = timestamp(), ts2 = cell.timestamp();
+        if (ts1 != ts2)
+            return ts1 < ts2 ? cell : this;
+        // we should prefer tombstones
+        if (cell instanceof DeletedCell)
+            return cell;
+        int c = value().compareTo(cell.value());
+        if (c != 0)
+            return c < 0 ? cell : this;
+        // If we have the same timestamp and value, prefer the longest TTL
+        if (cell instanceof ExpiringCell)
+        {
+            int let1 = getLocalDeletionTime(), let2 = cell.getLocalDeletionTime();
+            if (let1 < let2)
+                return cell;
+        }
+        return this;
+    }
+
+    public boolean equals(Cell cell)
+    {
+        return cell instanceof ExpiringCell && equals((ExpiringCell) cell);
+    }
+
+    protected boolean equals(ExpiringCell cell)
+    {
+        // equals(Cell) above has already checked that cell is an ExpiringCell
+        return super.equals(cell)
+                && getLocalDeletionTime() == cell.getLocalDeletionTime()
+                && getTimeToLive() == cell.getTimeToLive();
+    }
+
+    @Override
+    public String getString(CellNameType comparator)
+    {
+        return String.format("%s(%s!%d)", getClass().getSimpleName(), super.getString(comparator), getTimeToLive());
+    }
+
+    @Override
+    public ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
+    {
+        return new BufferExpiringCell(name().copy(metadata, allocator), allocator.clone(value()), timestamp(), getTimeToLive(), getLocalDeletionTime());
+    }
+
+    @Override
+    public ExpiringCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
+    {
+        return allocator.clone(this, metadata, opGroup);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return SIZE;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return SIZE;
+    }
+}
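
Note: the three Native* cell variants above share one off-heap layout trick — the common cell body is followed by a small fixed postfix (postfixSize()), and the subclass fields live at fixed offsets from internalSize(): the expiring cell keeps localDeletionTime at internalSize() - 8 and the TTL at internalSize() - 4, while the counter cell keeps timestampOfLastDelete in its final 8 bytes. A minimal sketch of that trailing-postfix idea, using a heap ByteBuffer in place of the NativeAllocator-managed region:

    import java.nio.ByteBuffer;

    final class NativePostfixSketch
    {
        // Appends the two expiring-cell ints after an opaque cell body, mirroring the
        // internalSize() - 8 / internalSize() - 4 offsets used above.
        static ByteBuffer expiringCell(byte[] body, int localDeletionTime, int timeToLive)
        {
            ByteBuffer buf = ByteBuffer.allocate(body.length + 8);   // body + 8-byte postfix
            buf.put(body);
            buf.putInt(buf.capacity() - 8, localDeletionTime);       // internalSize() - 8
            buf.putInt(buf.capacity() - 4, timeToLive);              // internalSize() - 4
            return buf;
        }

        public static void main(String[] args)
        {
            ByteBuffer cell = expiringCell(new byte[]{ 1, 2, 3 }, 1_700_000_000, 3600);
            System.out.println("ttl=" + cell.getInt(cell.capacity() - 4)
                             + " localDeletionTime=" + cell.getInt(cell.capacity() - 8));
        }
    }
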
diff --git a/src/java/org/apache/cassandra/db/OnDiskAtom.java b/src/java/org/apache/cassandra/db/OnDiskAtom.java
index 2956d6b..b53e43b 100644
--- a/src/java/org/apache/cassandra/db/OnDiskAtom.java
+++ b/src/java/org/apache/cassandra/db/OnDiskAtom.java
@@ -18,49 +18,50 @@
 package org.apache.cassandra.db;
 
 import java.io.*;
-import java.nio.ByteBuffer;
 import java.security.MessageDigest;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.ISSTableSerializer;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public interface OnDiskAtom
 {
-    public ByteBuffer name();
+    public Composite name();
 
     /**
      * For a standard column, this is the same as timestamp().
      * For a super column, this is the min/max column timestamp of the sub columns.
      */
-    public long minTimestamp();
-    public long maxTimestamp();
+    public long timestamp();
     public int getLocalDeletionTime(); // for tombstone GC, so int is sufficient granularity
 
-    public int serializedSize(TypeSizes typeSizes);
-    public long serializedSizeForSSTable();
-
     public void validateFields(CFMetaData metadata) throws MarshalException;
     public void updateDigest(MessageDigest digest);
 
     public static class Serializer implements ISSTableSerializer<OnDiskAtom>
     {
-        public static Serializer instance = new Serializer();
+        private final CellNameType type;
 
-        private Serializer() {}
-
-        public void serializeForSSTable(OnDiskAtom atom, DataOutput out) throws IOException
+        public Serializer(CellNameType type)
         {
-            if (atom instanceof Column)
+            this.type = type;
+        }
+
+        public void serializeForSSTable(OnDiskAtom atom, DataOutputPlus out) throws IOException
+        {
+            if (atom instanceof Cell)
             {
-                Column.serializer.serialize((Column) atom, out);
+                type.columnSerializer().serialize((Cell)atom, out);
             }
             else
             {
                 assert atom instanceof RangeTombstone;
-                RangeTombstone.serializer.serializeForSSTable((RangeTombstone)atom, out);
+                type.rangeTombstoneSerializer().serializeForSSTable((RangeTombstone)atom, out);
             }
         }
 
@@ -71,8 +72,8 @@
 
         public OnDiskAtom deserializeFromSSTable(DataInput in, ColumnSerializer.Flag flag, int expireBefore, Descriptor.Version version) throws IOException
         {
-            ByteBuffer name = ByteBufferUtil.readWithShortLength(in);
-            if (name.remaining() <= 0)
+            Composite name = type.serializer().deserialize(in);
+            if (name.isEmpty())
             {
                 // SSTableWriter.END_OF_ROW
                 return null;
@@ -80,9 +81,22 @@
 
             int b = in.readUnsignedByte();
             if ((b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0)
-                return RangeTombstone.serializer.deserializeBody(in, name, version);
+                return type.rangeTombstoneSerializer().deserializeBody(in, name, version);
             else
-                return Column.serializer.deserializeColumnBody(in, name, b, flag, expireBefore);
+                return type.columnSerializer().deserializeColumnBody(in, (CellName)name, b, flag, expireBefore);
+        }
+
+        public long serializedSizeForSSTable(OnDiskAtom atom)
+        {
+            if (atom instanceof Cell)
+            {
+                return type.columnSerializer().serializedSize((Cell)atom, TypeSizes.NATIVE);
+            }
+            else
+            {
+                assert atom instanceof RangeTombstone;
+                return type.rangeTombstoneSerializer().serializedSizeForSSTable((RangeTombstone)atom);
+            }
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/PagedRangeCommand.java b/src/java/org/apache/cassandra/db/PagedRangeCommand.java
index d6f3ca1..f2d81b9 100644
--- a/src/java/org/apache/cassandra/db/PagedRangeCommand.java
+++ b/src/java/org/apache/cassandra/db/PagedRangeCommand.java
@@ -18,54 +18,57 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class PagedRangeCommand extends AbstractRangeCommand
 {
     public static final IVersionedSerializer<PagedRangeCommand> serializer = new Serializer();
 
-    public final ByteBuffer start;
-    public final ByteBuffer stop;
+    public final Composite start;
+    public final Composite stop;
     public final int limit;
+    private final boolean countCQL3Rows;
 
     public PagedRangeCommand(String keyspace,
                              String columnFamily,
                              long timestamp,
                              AbstractBounds<RowPosition> keyRange,
                              SliceQueryFilter predicate,
-                             ByteBuffer start,
-                             ByteBuffer stop,
+                             Composite start,
+                             Composite stop,
                              List<IndexExpression> rowFilter,
-                             int limit)
+                             int limit,
+                             boolean countCQL3Rows)
     {
         super(keyspace, columnFamily, timestamp, keyRange, predicate, rowFilter);
         this.start = start;
         this.stop = stop;
         this.limit = limit;
+        this.countCQL3Rows = countCQL3Rows;
     }
 
     public MessageOut<PagedRangeCommand> createMessage()
     {
-        return new MessageOut<PagedRangeCommand>(MessagingService.Verb.PAGED_RANGE, this, serializer);
+        return new MessageOut<>(MessagingService.Verb.PAGED_RANGE, this, serializer);
     }
 
     public AbstractRangeCommand forSubRange(AbstractBounds<RowPosition> subRange)
     {
-        ByteBuffer newStart = subRange.left.equals(keyRange.left) ? start : ((SliceQueryFilter)predicate).start();
-        ByteBuffer newStop = subRange.right.equals(keyRange.right) ? stop : ((SliceQueryFilter)predicate).finish();
+        Composite newStart = subRange.left.equals(keyRange.left) ? start : ((SliceQueryFilter)predicate).start();
+        Composite newStop = subRange.right.equals(keyRange.right) ? stop : ((SliceQueryFilter)predicate).finish();
         return new PagedRangeCommand(keyspace,
                                      columnFamily,
                                      timestamp,
@@ -74,7 +77,8 @@
                                      newStart,
                                      newStop,
                                      rowFilter,
-                                     limit);
+                                     limit,
+                                     countCQL3Rows);
     }
 
     public AbstractRangeCommand withUpdatedLimit(int newLimit)
@@ -87,7 +91,8 @@
                                      start,
                                      stop,
                                      rowFilter,
-                                     newLimit);
+                                     newLimit,
+                                     countCQL3Rows);
     }
 
     public int limit()
@@ -97,15 +102,7 @@
 
     public boolean countCQL3Rows()
     {
-        // We only use PagedRangeCommand for CQL3. However, for SELECT DISTINCT, we want to return false here, because
-        // we just want to pick the first cell of each partition and returning true here would throw off the logic in
-        // ColumnFamilyStore.filter().
-        // What we do know is that for a SELECT DISTINCT the underlying SliceQueryFilter will have a compositesToGroup==-1
-        // and a count==1. And while it would be possible for a normal SELECT on a COMPACT table to also have such
-        // parameters, it's fine returning false since if we do count one cell for each partition, then each partition
-        // will coincide with exactly one CQL3 row.
-        SliceQueryFilter filter = (SliceQueryFilter)predicate;
-        return filter.compositesToGroup >= 0 || filter.count != 1;
+        return countCQL3Rows;
     }
 
     public List<Row> executeLocally()
@@ -127,7 +124,7 @@
 
     private static class Serializer implements IVersionedSerializer<PagedRangeCommand>
     {
-        public void serialize(PagedRangeCommand cmd, DataOutput out, int version) throws IOException
+        public void serialize(PagedRangeCommand cmd, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(cmd.keyspace);
             out.writeUTF(cmd.columnFamily);
@@ -135,23 +132,27 @@
 
             AbstractBounds.serializer.serialize(cmd.keyRange, out, version);
 
+            CFMetaData metadata = Schema.instance.getCFMetaData(cmd.keyspace, cmd.columnFamily);
+
             // SliceQueryFilter (the count is not used)
             SliceQueryFilter filter = (SliceQueryFilter)cmd.predicate;
-            SliceQueryFilter.serializer.serialize(filter, out, version);
+            metadata.comparator.sliceQueryFilterSerializer().serialize(filter, out, version);
 
             // The start and stop of the page
-            ByteBufferUtil.writeWithShortLength(cmd.start, out);
-            ByteBufferUtil.writeWithShortLength(cmd.stop, out);
+            metadata.comparator.serializer().serialize(cmd.start, out);
+            metadata.comparator.serializer().serialize(cmd.stop, out);
 
             out.writeInt(cmd.rowFilter.size());
             for (IndexExpression expr : cmd.rowFilter)
             {
-                ByteBufferUtil.writeWithShortLength(expr.column_name, out);
-                out.writeInt(expr.op.getValue());
+                ByteBufferUtil.writeWithShortLength(expr.column, out);
+                out.writeInt(expr.operator.ordinal());
                 ByteBufferUtil.writeWithShortLength(expr.value, out);
             }
 
             out.writeInt(cmd.limit);
+            if (version >= MessagingService.VERSION_21)
+                out.writeBoolean(cmd.countCQL3Rows);
         }
 
         public PagedRangeCommand deserialize(DataInput in, int version) throws IOException
@@ -162,23 +163,28 @@
 
             AbstractBounds<RowPosition> keyRange = AbstractBounds.serializer.deserialize(in, version).toRowBounds();
 
-            SliceQueryFilter predicate = SliceQueryFilter.serializer.deserialize(in, version);
+            CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
 
-            ByteBuffer start = ByteBufferUtil.readWithShortLength(in);
-            ByteBuffer stop = ByteBufferUtil.readWithShortLength(in);
+            SliceQueryFilter predicate = metadata.comparator.sliceQueryFilterSerializer().deserialize(in, version);
+
+            Composite start = metadata.comparator.serializer().deserialize(in);
+            Composite stop =  metadata.comparator.serializer().deserialize(in);
 
             int filterCount = in.readInt();
             List<IndexExpression> rowFilter = new ArrayList<IndexExpression>(filterCount);
             for (int i = 0; i < filterCount; i++)
             {
                 IndexExpression expr = new IndexExpression(ByteBufferUtil.readWithShortLength(in),
-                                                           IndexOperator.findByValue(in.readInt()),
+                                                           IndexExpression.Operator.findByOrdinal(in.readInt()),
                                                            ByteBufferUtil.readWithShortLength(in));
                 rowFilter.add(expr);
             }
 
             int limit = in.readInt();
-            return new PagedRangeCommand(keyspace, columnFamily, timestamp, keyRange, predicate, start, stop, rowFilter, limit);
+            boolean countCQL3Rows = version >= MessagingService.VERSION_21
+                                  ? in.readBoolean()
+                                  : predicate.compositesToGroup >= 0 || predicate.count != 1; // See #6857
+            return new PagedRangeCommand(keyspace, columnFamily, timestamp, keyRange, predicate, start, stop, rowFilter, limit, countCQL3Rows);
         }
 
         public long serializedSize(PagedRangeCommand cmd, int version)
@@ -191,20 +197,24 @@
 
             size += AbstractBounds.serializer.serializedSize(cmd.keyRange, version);
 
-            size += SliceQueryFilter.serializer.serializedSize((SliceQueryFilter)cmd.predicate, version);
+            CFMetaData metadata = Schema.instance.getCFMetaData(cmd.keyspace, cmd.columnFamily);
 
-            size += TypeSizes.NATIVE.sizeofWithShortLength(cmd.start);
-            size += TypeSizes.NATIVE.sizeofWithShortLength(cmd.stop);
+            size += metadata.comparator.sliceQueryFilterSerializer().serializedSize((SliceQueryFilter)cmd.predicate, version);
+
+            size += metadata.comparator.serializer().serializedSize(cmd.start, TypeSizes.NATIVE);
+            size += metadata.comparator.serializer().serializedSize(cmd.stop, TypeSizes.NATIVE);
 
             size += TypeSizes.NATIVE.sizeof(cmd.rowFilter.size());
             for (IndexExpression expr : cmd.rowFilter)
             {
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column_name);
-                size += TypeSizes.NATIVE.sizeof(expr.op.getValue());
+                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column);
+                size += TypeSizes.NATIVE.sizeof(expr.operator.ordinal());
                 size += TypeSizes.NATIVE.sizeofWithShortLength(expr.value);
             }
 
             size += TypeSizes.NATIVE.sizeof(cmd.limit);
+            if (version >= MessagingService.VERSION_21)
+                size += TypeSizes.NATIVE.sizeof(cmd.countCQL3Rows);
             return size;
         }
     }
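
Note: the PagedRangeCommand serializer above only puts countCQL3Rows on the wire for 2.1+ peers and falls back to inferring it from the slice filter for older ones (the #6857 note). A small sketch of that version-gated flag, with a placeholder version constant rather than the real MessagingService value:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInput;
    import java.io.DataInputStream;
    import java.io.DataOutput;
    import java.io.DataOutputStream;
    import java.io.IOException;

    final class CountCql3RowsSketch
    {
        static final int VERSION_21 = 8; // illustrative placeholder, not the real MessagingService.VERSION_21

        static void write(DataOutput out, boolean countCQL3Rows, int version) throws IOException
        {
            if (version >= VERSION_21)
                out.writeBoolean(countCQL3Rows);   // only 2.1+ peers get the explicit flag
        }

        static boolean read(DataInput in, int version, int compositesToGroup, int count) throws IOException
        {
            return version >= VERSION_21
                 ? in.readBoolean()
                 : compositesToGroup >= 0 || count != 1; // pre-2.1 inference, as in the removed comment
        }

        public static void main(String[] args) throws IOException
        {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            write(new DataOutputStream(bytes), true, VERSION_21);
            DataInput in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
            System.out.println(read(in, VERSION_21, 0, 100)); // prints true
        }
    }
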
diff --git a/src/java/org/apache/cassandra/db/RangeSliceCommand.java b/src/java/org/apache/cassandra/db/RangeSliceCommand.java
index 4aa1595..82e892c 100644
--- a/src/java/org/apache/cassandra/db/RangeSliceCommand.java
+++ b/src/java/org/apache/cassandra/db/RangeSliceCommand.java
@@ -18,26 +18,22 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
+import com.google.common.base.Objects;
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.filter.ExtendedFilter;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.pager.Pageable;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class RangeSliceCommand extends AbstractRangeCommand implements Pageable
@@ -49,37 +45,37 @@
     public final boolean isPaging;
 
     public RangeSliceCommand(String keyspace,
-                             String column_family,
+                             String columnFamily,
                              long timestamp,
                              IDiskAtomFilter predicate,
                              AbstractBounds<RowPosition> range,
                              int maxResults)
     {
-        this(keyspace, column_family, timestamp, predicate, range, null, maxResults, false, false);
+        this(keyspace, columnFamily, timestamp, predicate, range, null, maxResults, false, false);
     }
 
     public RangeSliceCommand(String keyspace,
-                             String column_family,
+                             String columnFamily,
                              long timestamp,
                              IDiskAtomFilter predicate,
                              AbstractBounds<RowPosition> range,
                              List<IndexExpression> row_filter,
                              int maxResults)
     {
-        this(keyspace, column_family, timestamp, predicate, range, row_filter, maxResults, false, false);
+        this(keyspace, columnFamily, timestamp, predicate, range, row_filter, maxResults, false, false);
     }
 
     public RangeSliceCommand(String keyspace,
-                             String column_family,
+                             String columnFamily,
                              long timestamp,
                              IDiskAtomFilter predicate,
                              AbstractBounds<RowPosition> range,
-                             List<IndexExpression> row_filter,
+                             List<IndexExpression> rowFilter,
                              int maxResults,
                              boolean countCQL3Rows,
                              boolean isPaging)
     {
-        super(keyspace, column_family, timestamp, range, predicate, row_filter);
+        super(keyspace, columnFamily, timestamp, range, predicate, rowFilter);
         this.maxResults = maxResults;
         this.countCQL3Rows = countCQL3Rows;
         this.isPaging = isPaging;
@@ -87,7 +83,7 @@
 
     public MessageOut<RangeSliceCommand> createMessage()
     {
-        return new MessageOut<RangeSliceCommand>(MessagingService.Verb.RANGE_SLICE, this, serializer);
+        return new MessageOut<>(MessagingService.Verb.RANGE_SLICE, this, serializer);
     }
 
     public AbstractRangeCommand forSubRange(AbstractBounds<RowPosition> subRange)
@@ -140,50 +136,30 @@
     @Override
     public String toString()
     {
-        return "RangeSliceCommand{" +
-               "keyspace='" + keyspace + '\'' +
-               ", columnFamily='" + columnFamily + '\'' +
-               ", timestamp=" + timestamp +
-               ", predicate=" + predicate +
-               ", range=" + keyRange +
-               ", rowFilter =" + rowFilter +
-               ", maxResults=" + maxResults +
-               ", countCQL3Rows=" + countCQL3Rows +
-               "}";
+        return Objects.toStringHelper(this)
+                      .add("keyspace", keyspace)
+                      .add("columnFamily", columnFamily)
+                      .add("predicate", predicate)
+                      .add("keyRange", keyRange)
+                      .add("rowFilter", rowFilter)
+                      .add("maxResults", maxResults)
+                      .add("countCQL3Rows", countCQL3Rows)
+                      .add("timestamp", timestamp)
+                      .toString();
     }
 }
 
 class RangeSliceCommandSerializer implements IVersionedSerializer<RangeSliceCommand>
 {
-    public void serialize(RangeSliceCommand sliceCommand, DataOutput out, int version) throws IOException
+    public void serialize(RangeSliceCommand sliceCommand, DataOutputPlus out, int version) throws IOException
     {
         out.writeUTF(sliceCommand.keyspace);
         out.writeUTF(sliceCommand.columnFamily);
+        out.writeLong(sliceCommand.timestamp);
 
-        if (version >= MessagingService.VERSION_20)
-            out.writeLong(sliceCommand.timestamp);
+        CFMetaData metadata = Schema.instance.getCFMetaData(sliceCommand.keyspace, sliceCommand.columnFamily);
 
-        IDiskAtomFilter filter = sliceCommand.predicate;
-        if (version < MessagingService.VERSION_20)
-        {
-            // Pre-2.0, we need to know if it's a super column. If it is, we
-            // must extract the super column name from the predicate (and
-            // modify the predicate accordingly)
-            ByteBuffer sc = null;
-            CFMetaData metadata = Schema.instance.getCFMetaData(sliceCommand.getKeyspace(), sliceCommand.columnFamily);
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                SuperColumns.SCFilter scFilter = SuperColumns.filterToSC((CompositeType)metadata.comparator, filter);
-                sc = scFilter.scName;
-                filter = scFilter.updatedFilter;
-            }
-
-            out.writeInt(sc == null ? 0 : sc.remaining());
-            if (sc != null)
-                ByteBufferUtil.write(sc, out);
-        }
-
-        IDiskAtomFilter.Serializer.instance.serialize(filter, out, version);
+        metadata.comparator.diskAtomFilterSerializer().serialize(sliceCommand.predicate, out, version);
 
         if (sliceCommand.rowFilter == null)
         {
@@ -194,8 +170,8 @@
             out.writeInt(sliceCommand.rowFilter.size());
             for (IndexExpression expr : sliceCommand.rowFilter)
             {
-                ByteBufferUtil.writeWithShortLength(expr.column_name, out);
-                out.writeInt(expr.op.getValue());
+                ByteBufferUtil.writeWithShortLength(expr.column, out);
+                out.writeInt(expr.operator.ordinal());
                 ByteBufferUtil.writeWithShortLength(expr.value, out);
             }
         }
@@ -209,52 +185,20 @@
     {
         String keyspace = in.readUTF();
         String columnFamily = in.readUTF();
-
-        long timestamp = version < MessagingService.VERSION_20 ? System.currentTimeMillis() : in.readLong();
+        long timestamp = in.readLong();
 
         CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
 
-        IDiskAtomFilter predicate;
-        if (version < MessagingService.VERSION_20)
-        {
-            int scLength = in.readInt();
-            ByteBuffer superColumn = null;
-            if (scLength > 0)
-            {
-                byte[] buf = new byte[scLength];
-                in.readFully(buf);
-                superColumn = ByteBuffer.wrap(buf);
-            }
-
-            AbstractType<?> comparator;
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                CompositeType type = (CompositeType)metadata.comparator;
-                comparator = superColumn == null ? type.types.get(0) : type.types.get(1);
-            }
-            else
-            {
-                comparator = metadata.comparator;
-            }
-
-            predicate = IDiskAtomFilter.Serializer.instance.deserialize(in, version, comparator);
-
-            if (metadata.cfType == ColumnFamilyType.Super)
-                predicate = SuperColumns.fromSCFilter((CompositeType)metadata.comparator, superColumn, predicate);
-        }
-        else
-        {
-            predicate = IDiskAtomFilter.Serializer.instance.deserialize(in, version, metadata.comparator);
-        }
+        IDiskAtomFilter predicate = metadata.comparator.diskAtomFilterSerializer().deserialize(in, version);
 
         List<IndexExpression> rowFilter;
         int filterCount = in.readInt();
-        rowFilter = new ArrayList<IndexExpression>(filterCount);
+        rowFilter = new ArrayList<>(filterCount);
         for (int i = 0; i < filterCount; i++)
         {
             IndexExpression expr;
             expr = new IndexExpression(ByteBufferUtil.readWithShortLength(in),
-                                       IndexOperator.findByValue(in.readInt()),
+                                       IndexExpression.Operator.findByOrdinal(in.readInt()),
                                        ByteBufferUtil.readWithShortLength(in));
             rowFilter.add(expr);
         }
@@ -270,34 +214,13 @@
     {
         long size = TypeSizes.NATIVE.sizeof(rsc.keyspace);
         size += TypeSizes.NATIVE.sizeof(rsc.columnFamily);
+        size += TypeSizes.NATIVE.sizeof(rsc.timestamp);
 
-        if (version >= MessagingService.VERSION_20)
-            size += TypeSizes.NATIVE.sizeof(rsc.timestamp);
+        CFMetaData metadata = Schema.instance.getCFMetaData(rsc.keyspace, rsc.columnFamily);
 
         IDiskAtomFilter filter = rsc.predicate;
-        if (version < MessagingService.VERSION_20)
-        {
-            ByteBuffer sc = null;
-            CFMetaData metadata = Schema.instance.getCFMetaData(rsc.keyspace, rsc.columnFamily);
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                SuperColumns.SCFilter scFilter = SuperColumns.filterToSC((CompositeType)metadata.comparator, filter);
-                sc = scFilter.scName;
-                filter = scFilter.updatedFilter;
-            }
 
-            if (sc != null)
-            {
-                size += TypeSizes.NATIVE.sizeof(sc.remaining());
-                size += sc.remaining();
-            }
-            else
-            {
-                size += TypeSizes.NATIVE.sizeof(0);
-            }
-        }
-
-        size += IDiskAtomFilter.Serializer.instance.serializedSize(filter, version);
+        size += metadata.comparator.diskAtomFilterSerializer().serializedSize(filter, version);
 
         if (rsc.rowFilter == null)
         {
@@ -308,8 +231,8 @@
             size += TypeSizes.NATIVE.sizeof(rsc.rowFilter.size());
             for (IndexExpression expr : rsc.rowFilter)
             {
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column_name);
-                size += TypeSizes.NATIVE.sizeof(expr.op.getValue());
+                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column);
+                size += TypeSizes.NATIVE.sizeof(expr.operator.ordinal());
                 size += TypeSizes.NATIVE.sizeofWithShortLength(expr.value);
             }
         }
diff --git a/src/java/org/apache/cassandra/db/RangeSliceReply.java b/src/java/org/apache/cassandra/db/RangeSliceReply.java
index 10667a0..5964ea8 100644
--- a/src/java/org/apache/cassandra/db/RangeSliceReply.java
+++ b/src/java/org/apache/cassandra/db/RangeSliceReply.java
@@ -19,7 +19,6 @@
 
 import java.io.DataInput;
 import java.io.DataInputStream;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -27,6 +26,7 @@
 import org.apache.commons.lang3.StringUtils;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FastByteArrayInputStream;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -62,7 +62,7 @@
 
     private static class RangeSliceReplySerializer implements IVersionedSerializer<RangeSliceReply>
     {
-        public void serialize(RangeSliceReply rsr, DataOutput out, int version) throws IOException
+        public void serialize(RangeSliceReply rsr, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(rsr.rows.size());
             for (Row row : rsr.rows)
diff --git a/src/java/org/apache/cassandra/db/RangeTombstone.java b/src/java/org/apache/cassandra/db/RangeTombstone.java
index 16fc27a..3f3d6754 100644
--- a/src/java/org/apache/cassandra/db/RangeTombstone.java
+++ b/src/java/org/apache/cassandra/db/RangeTombstone.java
@@ -20,33 +20,32 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.security.MessageDigest;
 import java.util.*;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.ISSTableSerializer;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Interval;
 
-public class RangeTombstone extends Interval<ByteBuffer, DeletionTime> implements OnDiskAtom
+public class RangeTombstone extends Interval<Composite, DeletionTime> implements OnDiskAtom
 {
-    public static final Serializer serializer = new Serializer();
-
-    public RangeTombstone(ByteBuffer start, ByteBuffer stop, long markedForDeleteAt, int localDeletionTime)
+    public RangeTombstone(Composite start, Composite stop, long markedForDeleteAt, int localDeletionTime)
     {
         this(start, stop, new DeletionTime(markedForDeleteAt, localDeletionTime));
     }
 
-    public RangeTombstone(ByteBuffer start, ByteBuffer stop, DeletionTime delTime)
+    public RangeTombstone(Composite start, Composite stop, DeletionTime delTime)
     {
         super(start, stop, delTime);
     }
 
-    public ByteBuffer name()
+    public Composite name()
     {
         return min;
     }
@@ -56,30 +55,11 @@
         return data.localDeletionTime;
     }
 
-    public long minTimestamp()
+    public long timestamp()
     {
         return data.markedForDeleteAt;
     }
 
-    public long maxTimestamp()
-    {
-        return data.markedForDeleteAt;
-    }
-
-    public int serializedSize(TypeSizes typeSizes)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public long serializedSizeForSSTable()
-    {
-        TypeSizes typeSizes = TypeSizes.NATIVE;
-        return typeSizes.sizeof((short)min.remaining()) + min.remaining()
-             + 1 // serialization flag
-             + typeSizes.sizeof((short)max.remaining()) + max.remaining()
-             + DeletionTime.serializer.serializedSize(data, typeSizes);
-    }
-
     public void validateFields(CFMetaData metadata) throws MarshalException
     {
         metadata.comparator.validate(min);
@@ -88,8 +68,8 @@
 
     public void updateDigest(MessageDigest digest)
     {
-        digest.update(min.duplicate());
-        digest.update(max.duplicate());
+        digest.update(min.toByteBuffer().duplicate());
+        digest.update(max.toByteBuffer().duplicate());
         DataOutputBuffer buffer = new DataOutputBuffer();
         try
         {
@@ -106,7 +86,7 @@
      * This tombstone supersedes another one if it is more recent and covers a
      * bigger range than rt.
      */
-    public boolean supersedes(RangeTombstone rt, Comparator<ByteBuffer> comparator)
+    public boolean supersedes(RangeTombstone rt, Comparator<Composite> comparator)
     {
         if (rt.data.markedForDeleteAt > data.markedForDeleteAt)
             return false;
@@ -114,9 +94,14 @@
         return comparator.compare(min, rt.min) <= 0 && comparator.compare(max, rt.max) >= 0;
     }
 
+    public boolean includes(Comparator<Composite> comparator, Composite name)
+    {
+        return comparator.compare(name, min) >= 0 && comparator.compare(name, max) <= 0;
+    }
+
     public static class Tracker
     {
-        private final Comparator<ByteBuffer> comparator;
+        private final Comparator<Composite> comparator;
         private final Deque<RangeTombstone> ranges = new ArrayDeque<RangeTombstone>();
         private final SortedSet<RangeTombstone> maxOrderingSet = new TreeSet<RangeTombstone>(new Comparator<RangeTombstone>()
         {
@@ -128,7 +113,7 @@
         public final Set<RangeTombstone> expired = new HashSet<RangeTombstone>();
         private int atomCount;
 
-        public Tracker(Comparator<ByteBuffer> comparator)
+        public Tracker(Comparator<Composite> comparator)
         {
             this.comparator = comparator;
         }
@@ -139,7 +124,7 @@
          * Returns the total serialized size of said tombstones and writes them
          * to {@code out} if it isn't null.
          */
-        public long writeOpenedMarker(OnDiskAtom firstColumn, DataOutput out, OnDiskAtom.Serializer atomSerializer) throws IOException
+        public long writeOpenedMarker(OnDiskAtom firstColumn, DataOutputPlus out, OnDiskAtom.Serializer atomSerializer) throws IOException
         {
             long size = 0;
             if (ranges.isEmpty())
@@ -178,7 +163,7 @@
 
             for (RangeTombstone tombstone : toWrite)
             {
-                size += tombstone.serializedSizeForSSTable();
+                size += atomSerializer.serializedSizeForSSTable(tombstone);
                 atomCount++;
                 if (out != null)
                     atomSerializer.serializeForSSTable(tombstone, out);
@@ -193,7 +178,7 @@
 
         /**
          * Update this tracker given an {@code atom}.
-         * If column is a Column, check if any tracked range is useless and
+         * If the atom is a Cell, check if any tracked range is useless and
          * can be removed. If it is a RangeTombstone, add it to this tracker.
          */
         public void update(OnDiskAtom atom, boolean isExpired)
@@ -222,7 +207,7 @@
             }
             else
             {
-                assert atom instanceof Column;
+                assert atom instanceof Cell;
                 Iterator<RangeTombstone> iter = maxOrderingSet.iterator();
                 while (iter.hasNext())
                 {
@@ -243,13 +228,13 @@
             }
         }
 
-        public boolean isDeleted(Column column)
+        public boolean isDeleted(Cell cell)
         {
             for (RangeTombstone tombstone : ranges)
             {
-                if (comparator.compare(column.name(), tombstone.min) >= 0
-                    && comparator.compare(column.name(), tombstone.max) <= 0
-                    && tombstone.maxTimestamp() >= column.timestamp())
+                if (comparator.compare(cell.name(), tombstone.min) >= 0
+                    && comparator.compare(cell.name(), tombstone.max) <= 0
+                    && tombstone.timestamp() >= cell.timestamp())
                 {
                     return true;
                 }
@@ -260,33 +245,50 @@
 
     public static class Serializer implements ISSTableSerializer<RangeTombstone>
     {
-        public void serializeForSSTable(RangeTombstone t, DataOutput out) throws IOException
+        private final CType type;
+
+        public Serializer(CType type)
         {
-            ByteBufferUtil.writeWithShortLength(t.min, out);
+            this.type = type;
+        }
+
+        public void serializeForSSTable(RangeTombstone t, DataOutputPlus out) throws IOException
+        {
+            type.serializer().serialize(t.min, out);
             out.writeByte(ColumnSerializer.RANGE_TOMBSTONE_MASK);
-            ByteBufferUtil.writeWithShortLength(t.max, out);
+            type.serializer().serialize(t.max, out);
             DeletionTime.serializer.serialize(t.data, out);
         }
 
         public RangeTombstone deserializeFromSSTable(DataInput in, Descriptor.Version version) throws IOException
         {
-            ByteBuffer min = ByteBufferUtil.readWithShortLength(in);
-            if (min.remaining() <= 0)
-                throw ColumnSerializer.CorruptColumnException.create(in, min);
+            Composite min = type.serializer().deserialize(in);
 
             int b = in.readUnsignedByte();
             assert (b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0;
             return deserializeBody(in, min, version);
         }
 
-        public RangeTombstone deserializeBody(DataInput in, ByteBuffer min, Descriptor.Version version) throws IOException
+        public RangeTombstone deserializeBody(DataInput in, Composite min, Descriptor.Version version) throws IOException
         {
-            ByteBuffer max = ByteBufferUtil.readWithShortLength(in);
-            if (max.remaining() <= 0)
-                throw ColumnSerializer.CorruptColumnException.create(in, max);
-
+            Composite max = type.serializer().deserialize(in);
             DeletionTime dt = DeletionTime.serializer.deserialize(in);
             return new RangeTombstone(min, max, dt);
         }
+
+        public void skipBody(DataInput in, Descriptor.Version version) throws IOException
+        {
+            type.serializer().skip(in);
+            DeletionTime.serializer.skip(in);
+        }
+
+        public long serializedSizeForSSTable(RangeTombstone t)
+        {
+            TypeSizes typeSizes = TypeSizes.NATIVE;
+            return type.serializer().serializedSize(t.min, typeSizes)
+                 + 1 // serialization flag
+                 + type.serializer().serializedSize(t.max, typeSizes)
+                 + DeletionTime.serializer.serializedSize(t.data, typeSizes);
+        }
     }
 }
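
The relocated serializeForSSTable()/serializedSizeForSSTable() pair above handles an on-disk record of: start composite, one flag byte carrying RANGE_TOMBSTONE_MASK, end composite, then the DeletionTime. The standalone sketch below shows that shape only; it assumes the composite serializer writes a short length prefix (as the pre-patch ByteBufferUtil.writeWithShortLength did) and that DeletionTime serializes as just an int local deletion time and a long markedForDeleteAt (field order here is an assumption), and the mask value is a stand-in.

    import java.io.DataOutput;
    import java.io.IOException;

    // Illustrative only: not the Cassandra serializer, just the record shape it writes.
    final class RangeTombstoneRecordSketch
    {
        static final int RANGE_TOMBSTONE_MASK = 0x10; // stand-in for ColumnSerializer.RANGE_TOMBSTONE_MASK

        static void write(DataOutput out, byte[] min, byte[] max,
                          int localDeletionTime, long markedForDeleteAt) throws IOException
        {
            writeWithShortLength(out, min);      // start composite (short length prefix assumed)
            out.writeByte(RANGE_TOMBSTONE_MASK); // flag byte marking the atom as a range tombstone
            writeWithShortLength(out, max);      // end composite
            out.writeInt(localDeletionTime);     // DeletionTime fields; order is an assumption here
            out.writeLong(markedForDeleteAt);
        }

        static long serializedSize(byte[] min, byte[] max)
        {
            return (2 + min.length) + 1 + (2 + max.length) + (4 + 8); // mirrors serializedSizeForSSTable()
        }

        private static void writeWithShortLength(DataOutput out, byte[] b) throws IOException
        {
            out.writeShort(b.length);
            out.write(b);
        }
    }
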
diff --git a/src/java/org/apache/cassandra/db/RangeTombstoneList.java b/src/java/org/apache/cassandra/db/RangeTombstoneList.java
index dadcc20..c0ab42b 100644
--- a/src/java/org/apache/cassandra/db/RangeTombstoneList.java
+++ b/src/java/org/apache/cassandra/db/RangeTombstoneList.java
@@ -18,22 +18,29 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.security.MessageDigest;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.Iterator;
 
 import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
+import com.google.common.collect.Iterators;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.HeapPool;
+
 /**
  * Data structure holding the range tombstones of a ColumnFamily.
  * <p>
@@ -50,24 +57,25 @@
  * The only use of the local deletion time is to know when a given tombstone can
  * be purged, which will be done by the purge() method.
  */
-public class RangeTombstoneList implements Iterable<RangeTombstone>
+public class RangeTombstoneList implements Iterable<RangeTombstone>, IMeasurableMemory
 {
     private static final Logger logger = LoggerFactory.getLogger(RangeTombstoneList.class);
 
-    public static final Serializer serializer = new Serializer();
+    private static long EMPTY_SIZE = ObjectSizes.measure(new RangeTombstoneList(null, 0));
 
-    private final Comparator<ByteBuffer> comparator;
+    private final Comparator<Composite> comparator;
 
     // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could
     // use a List for starts and ends, but having arrays everywhere is almost simpler.
-    private ByteBuffer[] starts;
-    private ByteBuffer[] ends;
+    private Composite[] starts;
+    private Composite[] ends;
     private long[] markedAts;
     private int[] delTimes;
 
+    private long boundaryHeapSize;
     private int size;
 
-    private RangeTombstoneList(Comparator<ByteBuffer> comparator, ByteBuffer[] starts, ByteBuffer[] ends, long[] markedAts, int[] delTimes, int size)
+    private RangeTombstoneList(Comparator<Composite> comparator, Composite[] starts, Composite[] ends, long[] markedAts, int[] delTimes, long boundaryHeapSize, int size)
     {
         assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length;
         this.comparator = comparator;
@@ -76,11 +84,12 @@
         this.markedAts = markedAts;
         this.delTimes = delTimes;
         this.size = size;
+        this.boundaryHeapSize = boundaryHeapSize;
     }
 
-    public RangeTombstoneList(Comparator<ByteBuffer> comparator, int capacity)
+    public RangeTombstoneList(Comparator<Composite> comparator, int capacity)
     {
-        this(comparator, new ByteBuffer[capacity], new ByteBuffer[capacity], new long[capacity], new int[capacity], 0);
+        this(comparator, new Composite[capacity], new Composite[capacity], new long[capacity], new int[capacity], 0, 0);
     }
 
     public boolean isEmpty()
@@ -93,7 +102,7 @@
         return size;
     }
 
-    public Comparator<ByteBuffer> comparator()
+    public Comparator<Composite> comparator()
     {
         return comparator;
     }
@@ -105,7 +114,28 @@
                                       Arrays.copyOf(ends, size),
                                       Arrays.copyOf(markedAts, size),
                                       Arrays.copyOf(delTimes, size),
-                                      size);
+                                      boundaryHeapSize, size);
+    }
+
+    public RangeTombstoneList copy(AbstractAllocator allocator)
+    {
+        RangeTombstoneList copy =  new RangeTombstoneList(comparator,
+                                      new Composite[size],
+                                      new Composite[size],
+                                      Arrays.copyOf(markedAts, size),
+                                      Arrays.copyOf(delTimes, size),
+                                      boundaryHeapSize, size);
+
+
+        for (int i = 0; i < size; i++)
+        {
+            assert !(starts[i] instanceof AbstractNativeCell || ends[i] instanceof AbstractNativeCell); //this should never happen
+
+            copy.starts[i] = starts[i].copy(null, allocator);
+            copy.ends[i] = ends[i].copy(null, allocator);
+        }
+
+        return copy;
     }
 
     public void add(RangeTombstone tombstone)
@@ -119,7 +149,7 @@
      * This method will be faster if the new tombstone sorts after all the currently existing ones (this is a common use case),
      * but it doesn't assume it.
      */
-    public void add(ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
+    public void add(Composite start, Composite end, long markedAt, int delTime)
     {
         if (isEmpty())
         {
@@ -140,6 +170,7 @@
             int pos = Arrays.binarySearch(ends, 0, size, start, comparator);
             insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime);
         }
+        boundaryHeapSize += start.unsharedHeapSize() + end.unsharedHeapSize();
     }
 
     /**
@@ -205,10 +236,11 @@
      * Returns whether the given name/timestamp pair is deleted by one of the tombstones
      * of this RangeTombstoneList.
      */
-    public boolean isDeleted(ByteBuffer name, long timestamp)
+    public boolean isDeleted(Cell cell)
     {
-        int idx = searchInternal(name);
-        return idx >= 0 && markedAts[idx] >= timestamp;
+        int idx = searchInternal(cell.name(), 0);
+        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+        return idx >= 0 && (cell instanceof CounterCell || markedAts[idx] >= cell.timestamp());
     }
 
     /**
@@ -223,17 +255,28 @@
      * Returns the DeletionTime for the tombstone overlapping {@code name} (there can't be more than one),
      * or null if {@code name} is not covered by any tombstone.
      */
-    public DeletionTime search(ByteBuffer name) {
-        int idx = searchInternal(name);
+    public DeletionTime searchDeletionTime(Composite name)
+    {
+        int idx = searchInternal(name, 0);
         return idx < 0 ? null : new DeletionTime(markedAts[idx], delTimes[idx]);
     }
 
-    private int searchInternal(ByteBuffer name)
+    public RangeTombstone search(Composite name)
+    {
+        int idx = searchInternal(name, 0);
+        return idx < 0 ? null : rangeTombstone(idx);
+    }
+
+    /*
+     * Returns the index of the range covering name if name is covered. If the returned idx is negative,
+     * no range covers name and -idx-1 is the index of the first range whose start is greater than name.
+     */
+    private int searchInternal(Composite name, int startIdx)
     {
         if (isEmpty())
             return -1;
 
-        int pos = Arrays.binarySearch(starts, 0, size, name, comparator);
+        int pos = Arrays.binarySearch(starts, startIdx, size, name, comparator);
         if (pos >= 0)
         {
             // We're exactly on an interval start. The one subtlety is that we need to check if
@@ -250,7 +293,7 @@
             if (idx < 0)
                 return -1;
 
-            return comparator.compare(name, ends[idx]) <= 0 ? idx : -1;
+            return comparator.compare(name, ends[idx]) <= 0 ? idx : -idx-2;
         }
     }
 
@@ -259,7 +302,7 @@
         int dataSize = TypeSizes.NATIVE.sizeof(size);
         for (int i = 0; i < size; i++)
         {
-            dataSize += starts[i].remaining() + ends[i].remaining();
+            dataSize += starts[i].dataSize() + ends[i].dataSize();
             dataSize += TypeSizes.NATIVE.sizeof(markedAts[i]);
             dataSize += TypeSizes.NATIVE.sizeof(delTimes[i]);
         }
@@ -315,6 +358,11 @@
         return false;
     }
 
+    private RangeTombstone rangeTombstone(int idx)
+    {
+        return new RangeTombstone(starts[idx], ends[idx], markedAts[idx], delTimes[idx]);
+    }
+
     public Iterator<RangeTombstone> iterator()
     {
         return new AbstractIterator<RangeTombstone>()
@@ -326,13 +374,109 @@
                 if (idx >= size)
                     return endOfData();
 
-                RangeTombstone t = new RangeTombstone(starts[idx], ends[idx], markedAts[idx], delTimes[idx]);
-                idx++;
-                return t;
+                return rangeTombstone(idx++);
             }
         };
     }
 
+    public Iterator<RangeTombstone> iterator(Composite from, Composite till)
+    {
+        int startIdx = from.isEmpty() ? 0 : searchInternal(from, 0);
+        final int start = startIdx < 0 ? -startIdx-1 : startIdx;
+
+        if (start >= size)
+            return Iterators.<RangeTombstone>emptyIterator();
+
+        int finishIdx = till.isEmpty() ? size : searchInternal(till, start);
+        // if finishIdx is the first range after 'till', we only care up to the previous range
+        final int finish = finishIdx < 0 ? -finishIdx-2 : finishIdx;
+
+        // Note: the following is true because we know 'from' is before 'till' in sorted order.
+        if (start > finish)
+            return Iterators.<RangeTombstone>emptyIterator();
+        else if (start == finish)
+            return Iterators.<RangeTombstone>singletonIterator(rangeTombstone(start));
+
+        return new AbstractIterator<RangeTombstone>()
+        {
+            private int idx = start;
+
+            protected RangeTombstone computeNext()
+            {
+                if (idx >= size || idx > finish)
+                    return endOfData();
+
+                return rangeTombstone(idx++);
+            }
+        };
+    }
+
+    /**
+     * Evaluates a diff between superset (known to be all merged tombstones) and this list for read repair
+     *
+     * @return null if there is no difference
+     */
+    public RangeTombstoneList diff(RangeTombstoneList superset)
+    {
+        if (isEmpty())
+            return superset;
+
+        RangeTombstoneList diff = null;
+
+        int j = 0; // index to iterate through our own list
+        for (int i = 0; i < superset.size; i++)
+        {
+            // we can assume that this list is a subset of the superset list
+            while (j < size && comparator.compare(starts[j], superset.starts[i]) < 0)
+                j++;
+
+            if (j >= size)
+            {
+                // we're at the end of our own list, add the remainder of the superset to the diff
+                if (i < superset.size)
+                {
+                    if (diff == null)
+                        diff = new RangeTombstoneList(comparator, superset.size - i);
+
+                    for(int k = i; k < superset.size; k++)
+                        diff.add(superset.starts[k], superset.ends[k], superset.markedAts[k], superset.delTimes[k]);
+                }
+                return diff;
+            }
+
+            // we don't care about local deletion time here, because it doesn't matter for read repair
+            if (!starts[j].equals(superset.starts[i])
+                || !ends[j].equals(superset.ends[i])
+                || markedAts[j] != superset.markedAts[i])
+            {
+                if (diff == null)
+                    diff = new RangeTombstoneList(comparator, Math.min(8, superset.size - i));
+                diff.add(superset.starts[i], superset.ends[i], superset.markedAts[i], superset.delTimes[i]);
+            }
+        }
+
+        return diff;
+    }
+    
+    /**
+     * Calculates digest for triggering read repair on mismatch
+     */
+    public void updateDigest(MessageDigest digest)
+    {
+        ByteBuffer longBuffer = ByteBuffer.allocate(8);
+        for (int i = 0; i < size; i++)
+        {
+            for (int j = 0; j < starts[i].size(); j++)
+                digest.update(starts[i].get(j).duplicate());
+            for (int j = 0; j < ends[i].size(); j++)
+                digest.update(ends[i].get(j).duplicate());
+
+            longBuffer.putLong(0, markedAts[i]);
+            digest.update(longBuffer.array(), 0, 8);
+        }
+    }
+
+
     @Override
     public boolean equals(Object o)
     {
@@ -341,7 +485,7 @@
         RangeTombstoneList that = (RangeTombstoneList)o;
         if (size != that.size)
             return false;
-
+        
         for (int i = 0; i < size; i++)
         {
             if (!starts[i].equals(that.starts[i]))
@@ -377,6 +521,7 @@
         System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size);
         System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size);
         dst.size = src.size;
+        dst.boundaryHeapSize = src.boundaryHeapSize;
     }
 
     /*
@@ -393,7 +538,7 @@
      * conditions).
      *
      */
-    private void insertFrom(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
+    private void insertFrom(int i, Composite start, Composite end, long markedAt, int delTime)
     {
         while (i < size)
         {
@@ -533,7 +678,7 @@
     /*
      * Adds the new tombstone at index i, growing and/or moving elements to make room for it.
      */
-    private void addInternal(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
+    private void addInternal(int i, Composite start, Composite end, long markedAt, int delTime)
     {
         assert i >= 0;
 
@@ -586,12 +731,12 @@
         delTimes = grow(delTimes, size, newLength, i);
     }
 
-    private static ByteBuffer[] grow(ByteBuffer[] a, int size, int newLength, int i)
+    private static Composite[] grow(Composite[] a, int size, int newLength, int i)
     {
         if (i < 0 || i >= size)
             return Arrays.copyOf(a, newLength);
 
-        ByteBuffer[] newA = new ByteBuffer[newLength];
+        Composite[] newA = new Composite[newLength];
         System.arraycopy(a, 0, newA, 0, i);
         System.arraycopy(a, i, newA, i+1, size - i);
         return newA;
@@ -631,21 +776,43 @@
         System.arraycopy(ends, i, ends, i+1, size - i);
         System.arraycopy(markedAts, i, markedAts, i+1, size - i);
         System.arraycopy(delTimes, i, delTimes, i+1, size - i);
+        // we set starts[i] to null to indicate the position is now empty, so that we update boundaryHeapSize
+        // when we set it
+        starts[i] = null;
     }
 
-    private void setInternal(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
+    private void setInternal(int i, Composite start, Composite end, long markedAt, int delTime)
     {
+        if (starts[i] != null)
+            boundaryHeapSize -= starts[i].unsharedHeapSize() + ends[i].unsharedHeapSize();
         starts[i] = start;
         ends[i] = end;
         markedAts[i] = markedAt;
         delTimes[i] = delTime;
+        boundaryHeapSize += start.unsharedHeapSize() + end.unsharedHeapSize();
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE
+                + boundaryHeapSize
+                + ObjectSizes.sizeOfArray(starts)
+                + ObjectSizes.sizeOfArray(ends)
+                + ObjectSizes.sizeOfArray(markedAts)
+                + ObjectSizes.sizeOfArray(delTimes);
     }
 
     public static class Serializer implements IVersionedSerializer<RangeTombstoneList>
     {
-        private Serializer() {}
+        private final CType type;
 
-        public void serialize(RangeTombstoneList tombstones, DataOutput out, int version) throws IOException
+        public Serializer(CType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(RangeTombstoneList tombstones, DataOutputPlus out, int version) throws IOException
         {
             if (tombstones == null)
             {
@@ -656,34 +823,25 @@
             out.writeInt(tombstones.size);
             for (int i = 0; i < tombstones.size; i++)
             {
-                ByteBufferUtil.writeWithShortLength(tombstones.starts[i], out);
-                ByteBufferUtil.writeWithShortLength(tombstones.ends[i], out);
+                type.serializer().serialize(tombstones.starts[i], out);
+                type.serializer().serialize(tombstones.ends[i], out);
                 out.writeInt(tombstones.delTimes[i]);
                 out.writeLong(tombstones.markedAts[i]);
             }
         }
 
-        /*
-         * RangeTombstoneList depends on the column family comparator, but it is not serialized.
-         * Thus deserialize(DataInput, int, Comparator<ByteBuffer>) should be used instead of this method.
-         */
         public RangeTombstoneList deserialize(DataInput in, int version) throws IOException
         {
-            throw new UnsupportedOperationException();
-        }
-
-        public RangeTombstoneList deserialize(DataInput in, int version, Comparator<ByteBuffer> comparator) throws IOException
-        {
             int size = in.readInt();
             if (size == 0)
                 return null;
 
-            RangeTombstoneList tombstones = new RangeTombstoneList(comparator, size);
+            RangeTombstoneList tombstones = new RangeTombstoneList(type, size);
 
             for (int i = 0; i < size; i++)
             {
-                ByteBuffer start = ByteBufferUtil.readWithShortLength(in);
-                ByteBuffer end = ByteBufferUtil.readWithShortLength(in);
+                Composite start = type.serializer().deserialize(in);
+                Composite end = type.serializer().deserialize(in);
                 int delTime =  in.readInt();
                 long markedAt = in.readLong();
 
@@ -715,10 +873,8 @@
             long size = typeSizes.sizeof(tombstones.size);
             for (int i = 0; i < tombstones.size; i++)
             {
-                int startSize = tombstones.starts[i].remaining();
-                size += typeSizes.sizeof((short)startSize) + startSize;
-                int endSize = tombstones.ends[i].remaining();
-                size += typeSizes.sizeof((short)endSize) + endSize;
+                size += type.serializer().serializedSize(tombstones.starts[i], typeSizes);
+                size += type.serializer().serializedSize(tombstones.ends[i], typeSizes);
                 size += typeSizes.sizeof(tombstones.delTimes[i]);
                 size += typeSizes.sizeof(tombstones.markedAts[i]);
             }
@@ -744,32 +900,42 @@
     {
         private int idx;
 
-        public boolean isDeleted(ByteBuffer name, long timestamp)
+        public boolean isDeleted(Cell cell)
         {
+            CellName name = cell.name();
+            long timestamp = cell.timestamp();
+
             while (idx < size)
             {
                 int cmp = comparator.compare(name, starts[idx]);
-                if (cmp == 0)
+
+                if (cmp < 0)
                 {
+                    return false;
+                }
+                else if (cmp == 0)
+                {
+                    // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+                    if (cell instanceof CounterCell)
+                        return true;
+
                     // As for searchInternal, we need to check the previous end
                     if (idx > 0 && comparator.compare(name, ends[idx-1]) == 0 && markedAts[idx-1] > markedAts[idx])
                         return markedAts[idx-1] >= timestamp;
                     else
                         return markedAts[idx] >= timestamp;
                 }
-                else if (cmp < 0)
-                {
-                    return false;
-                }
                 else
                 {
                     if (comparator.compare(name, ends[idx]) <= 0)
-                        return markedAts[idx] >= timestamp;
+                        return markedAts[idx] >= timestamp || cell instanceof CounterCell;
                     else
                         idx++;
                 }
             }
+
             return false;
         }
     }
+
 }
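
The negative-return convention added to searchInternal() above, and relied on by search(), searchDeletionTime(), the new iterator(from, till) and diff(), can be summarized as: a non-negative result is the index of the range covering the name; a negative result r means nothing covers it and -r-1 is the index of the first range whose start is greater than the name. The standalone sketch below replays that contract over plain long bounds; the real method also has an extra check when the name equals both a range end and the following range's start, which is omitted here, and the class and method names are hypothetical.

    import java.util.Arrays;

    // Illustrative only: the searchInternal() return contract over long bounds instead of Composites.
    final class RangeSearchSketch
    {
        static int search(long[] starts, long[] ends, int size, long name)
        {
            int pos = Arrays.binarySearch(starts, 0, size, name);
            if (pos >= 0)
                return pos;                                   // exactly on a range start
            int idx = -pos - 2;                               // last range starting before name
            if (idx < 0)
                return -1;                                    // name sorts before every range
            return name <= ends[idx] ? idx : -(idx + 1) - 1;  // covered, or first greater start is idx+1
        }

        public static void main(String[] args)
        {
            long[] starts = { 0, 10, 20 };
            long[] ends   = { 5, 15, 25 };
            System.out.println(search(starts, ends, 3, 12)); // 1  -> covered by [10, 15]
            System.out.println(search(starts, ends, 3, 7));  // -2 -> uncovered, next range starts at index 1
            System.out.println(search(starts, ends, 3, -3)); // -1 -> uncovered, next range starts at index 0
        }
    }
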
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index cadcd7d..299693e 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -18,18 +18,15 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.IReadCommand;
@@ -60,7 +57,7 @@
 
     public MessageOut<ReadCommand> createMessage()
     {
-        return new MessageOut<ReadCommand>(MessagingService.Verb.READ, this, serializer);
+        return new MessageOut<>(MessagingService.Verb.READ, this, serializer);
     }
 
     public final String ksName;
@@ -133,33 +130,16 @@
 
 class ReadCommandSerializer implements IVersionedSerializer<ReadCommand>
 {
-    public void serialize(ReadCommand command, DataOutput out, int version) throws IOException
+    public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
     {
-        // For super columns, when talking to an older node, we need to translate the filter used.
-        // That translation can change the filter type (names -> slice), and so change the command type.
-        // Hence we need to detect that early on, before we've written the command type.
-        ReadCommand newCommand = command;
-        ByteBuffer superColumn = null;
-        if (version < MessagingService.VERSION_20)
-        {
-            CFMetaData metadata = Schema.instance.getCFMetaData(command.ksName, command.cfName);
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                SuperColumns.SCFilter scFilter = SuperColumns.filterToSC((CompositeType)metadata.comparator, command.filter());
-                newCommand = ReadCommand.create(command.ksName, command.key, command.cfName, command.timestamp, scFilter.updatedFilter);
-                newCommand.setDigestQuery(command.isDigestQuery());
-                superColumn = scFilter.scName;
-            }
-        }
-
-        out.writeByte(newCommand.commandType.serializedValue);
+        out.writeByte(command.commandType.serializedValue);
         switch (command.commandType)
         {
             case GET_BY_NAMES:
-                SliceByNamesReadCommand.serializer.serialize(newCommand, superColumn, out, version);
+                SliceByNamesReadCommand.serializer.serialize(command, out, version);
                 break;
             case GET_SLICES:
-                SliceFromReadCommand.serializer.serialize(newCommand, superColumn, out, version);
+                SliceFromReadCommand.serializer.serialize(command, out, version);
                 break;
             default:
                 throw new AssertionError();
@@ -182,26 +162,12 @@
 
     public long serializedSize(ReadCommand command, int version)
     {
-        ReadCommand newCommand = command;
-        ByteBuffer superColumn = null;
-        if (version < MessagingService.VERSION_20)
-        {
-            CFMetaData metadata = Schema.instance.getCFMetaData(command.ksName, command.cfName);
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                SuperColumns.SCFilter scFilter = SuperColumns.filterToSC((CompositeType)metadata.comparator, command.filter());
-                newCommand = ReadCommand.create(command.ksName, command.key, command.cfName, command.timestamp, scFilter.updatedFilter);
-                newCommand.setDigestQuery(command.isDigestQuery());
-                superColumn = scFilter.scName;
-            }
-        }
-
         switch (command.commandType)
         {
             case GET_BY_NAMES:
-                return 1 + SliceByNamesReadCommand.serializer.serializedSize(newCommand, superColumn, version);
+                return 1 + SliceByNamesReadCommand.serializer.serializedSize(command, version);
             case GET_SLICES:
-                return 1 + SliceFromReadCommand.serializer.serializedSize(newCommand, superColumn, version);
+                return 1 + SliceFromReadCommand.serializer.serializedSize(command, version);
             default:
                 throw new AssertionError();
         }
diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java
index fca4938..849ac70 100644
--- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java
@@ -21,12 +21,11 @@
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessagingService;
 
-public class ReadRepairVerbHandler implements IVerbHandler<RowMutation>
+public class ReadRepairVerbHandler implements IVerbHandler<Mutation>
 {
-    public void doVerb(MessageIn<RowMutation> message, int id)
+    public void doVerb(MessageIn<Mutation> message, int id)
     {
-        RowMutation rm = message.payload;
-        rm.apply();
+        message.payload.apply();
         WriteResponse response = new WriteResponse();
         MessagingService.instance().sendReply(response.createMessage(), id, message.from);
     }
diff --git a/src/java/org/apache/cassandra/db/ReadResponse.java b/src/java/org/apache/cassandra/db/ReadResponse.java
index 3fe6ec4..39022a4 100644
--- a/src/java/org/apache/cassandra/db/ReadResponse.java
+++ b/src/java/org/apache/cassandra/db/ReadResponse.java
@@ -21,6 +21,7 @@
 import java.nio.ByteBuffer;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /*
@@ -67,11 +68,11 @@
 
 class ReadResponseSerializer implements IVersionedSerializer<ReadResponse>
 {
-    public void serialize(ReadResponse response, DataOutput out, int version) throws IOException
+    public void serialize(ReadResponse response, DataOutputPlus out, int version) throws IOException
     {
         out.writeInt(response.isDigestQuery() ? response.digest().remaining() : 0);
         ByteBuffer buffer = response.isDigestQuery() ? response.digest() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        ByteBufferUtil.write(buffer, out);
+        out.write(buffer);
         out.writeBoolean(response.isDigestQuery());
         if (!response.isDigestQuery())
             Row.serializer.serialize(response.row(), out, version);
diff --git a/src/java/org/apache/cassandra/db/Row.java b/src/java/org/apache/cassandra/db/Row.java
index 13e6f67..a826894 100644
--- a/src/java/org/apache/cassandra/db/Row.java
+++ b/src/java/org/apache/cassandra/db/Row.java
@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -61,9 +62,9 @@
 
     public static class RowSerializer implements IVersionedSerializer<Row>
     {
-        public void serialize(Row row, DataOutput out, int version) throws IOException
+        public void serialize(Row row, DataOutputPlus out, int version) throws IOException
         {
-            ByteBufferUtil.writeWithShortLength(row.key.key, out);
+            ByteBufferUtil.writeWithShortLength(row.key.getKey(), out);
             ColumnFamily.serializer.serialize(row.cf, out, version);
         }
 
@@ -80,7 +81,7 @@
 
         public long serializedSize(Row row, int version)
         {
-            int keySize = row.key.key.remaining();
+            int keySize = row.key.getKey().remaining();
             return TypeSizes.NATIVE.sizeof((short) keySize) + keySize + ColumnFamily.serializer.serializedSize(row.cf, TypeSizes.NATIVE, version);
         }
     }
diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/db/RowIndexEntry.java
index cdafd5d6..01035c4 100644
--- a/src/java/org/apache/cassandra/db/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/db/RowIndexEntry.java
@@ -18,23 +18,26 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
 import com.google.common.primitives.Ints;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.IndexHelper;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ObjectSizes;
 
 public class RowIndexEntry implements IMeasurableMemory
 {
-    public static final Serializer serializer = new Serializer();
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new RowIndexEntry(0));
 
     public final long position;
 
@@ -43,24 +46,7 @@
         this.position = position;
     }
 
-    public int serializedSize()
-    {
-        int size = TypeSizes.NATIVE.sizeof(position) + TypeSizes.NATIVE.sizeof(promotedSize());
-
-        if (isIndexed())
-        {
-            List<IndexHelper.IndexInfo> index = columnsIndex();
-
-            size += DeletionTime.serializer.serializedSize(deletionTime(), TypeSizes.NATIVE);
-            size += TypeSizes.NATIVE.sizeof(index.size());
-            for (IndexHelper.IndexInfo info : index)
-                size += info.serializedSize(TypeSizes.NATIVE);
-        }
-
-        return size;
-    }
-
-    protected int promotedSize()
+    protected int promotedSize(CType type)
     {
         return 0;
     }
@@ -98,24 +84,32 @@
         return Collections.emptyList();
     }
 
-    public long memorySize()
+    public long unsharedHeapSize()
     {
-        return ObjectSizes.getFieldSize(TypeSizes.NATIVE.sizeof(position));
+        return EMPTY_SIZE;
     }
 
     public static class Serializer
     {
-        public void serialize(RowIndexEntry rie, DataOutput out) throws IOException
+        private final CType type;
+
+        public Serializer(CType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(RowIndexEntry rie, DataOutputPlus out) throws IOException
         {
             out.writeLong(rie.position);
-            out.writeInt(rie.promotedSize());
+            out.writeInt(rie.promotedSize(type));
 
             if (rie.isIndexed())
             {
                 DeletionTime.serializer.serialize(rie.deletionTime(), out);
                 out.writeInt(rie.columnsIndex().size());
+                ISerializer<IndexHelper.IndexInfo> idxSerializer = type.indexSerializer();
                 for (IndexHelper.IndexInfo info : rie.columnsIndex())
-                    info.serialize(out);
+                    idxSerializer.serialize(info, out);
             }
         }
 
@@ -129,9 +123,10 @@
                 DeletionTime deletionTime = DeletionTime.serializer.deserialize(in);
 
                 int entries = in.readInt();
+                ISerializer<IndexHelper.IndexInfo> idxSerializer = type.indexSerializer();
                 List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<IndexHelper.IndexInfo>(entries);
                 for (int i = 0; i < entries; i++)
-                    columnsIndex.add(IndexHelper.IndexInfo.deserialize(in));
+                    columnsIndex.add(idxSerializer.deserialize(in));
 
                 return new IndexedEntry(position, deletionTime, columnsIndex);
             }
@@ -141,13 +136,13 @@
             }
         }
 
-        public void skip(DataInput in) throws IOException
+        public static void skip(DataInput in) throws IOException
         {
             in.readLong();
             skipPromotedIndex(in);
         }
 
-        public void skipPromotedIndex(DataInput in) throws IOException
+        public static void skipPromotedIndex(DataInput in) throws IOException
         {
             int size = in.readInt();
             if (size <= 0)
@@ -155,6 +150,25 @@
 
             FileUtils.skipBytesFully(in, size);
         }
+
+        public int serializedSize(RowIndexEntry rie)
+        {
+            int size = TypeSizes.NATIVE.sizeof(rie.position) + TypeSizes.NATIVE.sizeof(rie.promotedSize(type));
+
+            if (rie.isIndexed())
+            {
+                List<IndexHelper.IndexInfo> index = rie.columnsIndex();
+
+                size += DeletionTime.serializer.serializedSize(rie.deletionTime(), TypeSizes.NATIVE);
+                size += TypeSizes.NATIVE.sizeof(index.size());
+
+                ISerializer<IndexHelper.IndexInfo> idxSerializer = type.indexSerializer();
+                for (IndexHelper.IndexInfo info : index)
+                    size += idxSerializer.serializedSize(info, TypeSizes.NATIVE);
+            }
+
+            return size;
+        }
     }
 
     /**
@@ -164,6 +178,9 @@
     {
         private final DeletionTime deletionTime;
         private final List<IndexHelper.IndexInfo> columnsIndex;
+        private static final long BASE_SIZE =
+                ObjectSizes.measure(new IndexedEntry(0, DeletionTime.LIVE, Arrays.<IndexHelper.IndexInfo>asList(null, null)))
+              + ObjectSizes.measure(new ArrayList<>(1));
 
         private IndexedEntry(long position, DeletionTime deletionTime, List<IndexHelper.IndexInfo> columnsIndex)
         {
@@ -187,31 +204,29 @@
         }
 
         @Override
-        public int promotedSize()
+        public int promotedSize(CType type)
         {
             TypeSizes typeSizes = TypeSizes.NATIVE;
             long size = DeletionTime.serializer.serializedSize(deletionTime, typeSizes);
             size += typeSizes.sizeof(columnsIndex.size()); // number of entries
+            ISerializer<IndexHelper.IndexInfo> idxSerializer = type.indexSerializer();
             for (IndexHelper.IndexInfo info : columnsIndex)
-                size += info.serializedSize(typeSizes);
+                size += idxSerializer.serializedSize(info, typeSizes);
 
             return Ints.checkedCast(size);
         }
 
         @Override
-        public long memorySize()
+        public long unsharedHeapSize()
         {
             long entrySize = 0;
             for (IndexHelper.IndexInfo idx : columnsIndex)
-                entrySize += idx.memorySize();
+                entrySize += idx.unsharedHeapSize();
 
-            return ObjectSizes.getSuperClassFieldSize(TypeSizes.NATIVE.sizeof(position))
-                   + ObjectSizes.getFieldSize(// deletionTime
-                                              ObjectSizes.getReferenceSize() +
-                                              // columnsIndex
-                                              ObjectSizes.getReferenceSize())
-                   + deletionTime.memorySize()
-                   + ObjectSizes.getArraySize(columnsIndex.size(), ObjectSizes.getReferenceSize()) + entrySize + 4;
+            return BASE_SIZE
+                   + entrySize
+                   + deletionTime.unsharedHeapSize()
+                   + ObjectSizes.sizeOfReferenceArray(columnsIndex.size());
         }
     }
 }
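
The now-static skip()/skipPromotedIndex() above rely on the index entry layout being a long position, then an int promoted-index size, then that many bytes of promoted index (deletion time plus IndexInfo blocks). A minimal standalone sketch of that skipping logic follows; FileUtils.skipBytesFully() is replaced by a plain loop and the class name is hypothetical.

    import java.io.DataInput;
    import java.io.IOException;

    // Illustrative only: mirrors the skip path above without depending on Cassandra classes.
    final class IndexEntrySkipSketch
    {
        static void skip(DataInput in) throws IOException
        {
            in.readLong();               // position in the data file
            skipPromotedIndex(in);
        }

        static void skipPromotedIndex(DataInput in) throws IOException
        {
            int size = in.readInt();     // promoted index size in bytes
            if (size <= 0)
                return;                  // entry carries no promoted index
            while (size > 0)
            {
                int skipped = in.skipBytes(size);
                if (skipped == 0)
                    throw new IOException("unexpected end of input while skipping promoted index");
                size -= skipped;
            }
        }
    }
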
diff --git a/src/java/org/apache/cassandra/db/RowIteratorFactory.java b/src/java/org/apache/cassandra/db/RowIteratorFactory.java
index c02da1d..5bd2d9b 100644
--- a/src/java/org/apache/cassandra/db/RowIteratorFactory.java
+++ b/src/java/org/apache/cassandra/db/RowIteratorFactory.java
@@ -19,12 +19,14 @@
 
 import java.util.*;
 
+import com.google.common.collect.Iterables;
+
 import org.apache.cassandra.db.columniterator.IColumnIteratorFactory;
 import org.apache.cassandra.db.columniterator.LazyColumnIterator;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.io.sstable.SSTableScanner;
 import org.apache.cassandra.utils.CloseableIterator;
 import org.apache.cassandra.utils.MergeIterator;
 
@@ -56,32 +58,26 @@
                                                      final long now)
     {
         // fetch data from current memtable, historical memtables, and SSTables in the correct order.
-        final List<CloseableIterator<OnDiskAtomIterator>> iterators = new ArrayList<CloseableIterator<OnDiskAtomIterator>>();
+        final List<CloseableIterator<OnDiskAtomIterator>> iterators = new ArrayList<>(Iterables.size(memtables) + sstables.size());
 
-        // memtables
         for (Memtable memtable : memtables)
-        {
-            iterators.add(new ConvertToColumnIterator<AtomicSortedColumns>(range, memtable.getEntryIterator(range.startKey(), range.stopKey())));
-        }
+            iterators.add(new ConvertToColumnIterator(range, memtable.getEntryIterator(range.startKey(), range.stopKey())));
 
         for (SSTableReader sstable : sstables)
-        {
-            final SSTableScanner scanner = sstable.getScanner(range);
-            iterators.add(scanner);
-        }
+            iterators.add(sstable.getScanner(range));
 
         // reduce rows from all sources into a single row
         return MergeIterator.get(iterators, COMPARE_BY_KEY, new MergeIterator.Reducer<OnDiskAtomIterator, Row>()
         {
             private final int gcBefore = cfs.gcBefore(now);
-            private final List<OnDiskAtomIterator> colIters = new ArrayList<OnDiskAtomIterator>();
+            private final List<OnDiskAtomIterator> colIters = new ArrayList<>();
             private DecoratedKey key;
             private ColumnFamily returnCF;
 
             @Override
             protected void onKeyChange()
             {
-                this.returnCF = TreeMapBackedSortedColumns.factory.create(cfs.metadata);
+                this.returnCF = ArrayBackedSortedColumns.factory.create(cfs.metadata, range.columnFilter.isReversed());
             }
 
             public void reduce(OnDiskAtomIterator current)
@@ -93,16 +89,18 @@
 
             protected Row getReduced()
             {
-                // First check if this row is in the rowCache. If it is we can skip the rest
+                // First check if this row is in the rowCache. If it is and it covers our filter, we can skip the rest
                 ColumnFamily cached = cfs.getRawCachedRow(key);
-                if (cached == null)
+                IDiskAtomFilter filter = range.columnFilter(key.getKey());
+
+                if (cached == null || !cfs.isFilterFullyCoveredBy(filter, cached, now))
                 {
                     // not cached: collate
-                    QueryFilter.collateOnDiskAtom(returnCF, colIters, range.columnFilter(key.key), gcBefore, now);
+                    QueryFilter.collateOnDiskAtom(returnCF, colIters, filter, gcBefore, now);
                 }
                 else
                 {
-                    QueryFilter keyFilter = new QueryFilter(key, cfs.name, range.columnFilter(key.key), now);
+                    QueryFilter keyFilter = new QueryFilter(key, cfs.name, filter, now);
                     returnCF = cfs.filterColumnFamily(cached, keyFilter);
                 }
 
@@ -117,12 +115,12 @@
     /**
      * Get a ColumnIterator for a specific key in the memtable.
      */
-    private static class ConvertToColumnIterator<T extends ColumnFamily> implements CloseableIterator<OnDiskAtomIterator>
+    private static class ConvertToColumnIterator implements CloseableIterator<OnDiskAtomIterator>
     {
         private final DataRange range;
-        private final Iterator<Map.Entry<DecoratedKey, T>> iter;
+        private final Iterator<Map.Entry<DecoratedKey, ColumnFamily>> iter;
 
-        public ConvertToColumnIterator(DataRange range, Iterator<Map.Entry<DecoratedKey, T>> iter)
+        public ConvertToColumnIterator(DataRange range, Iterator<Map.Entry<DecoratedKey, ColumnFamily>> iter)
         {
             this.range = range;
             this.iter = iter;
@@ -142,12 +140,12 @@
          */
         public OnDiskAtomIterator next()
         {
-            final Map.Entry<DecoratedKey, T> entry = iter.next();
+            final Map.Entry<DecoratedKey, ColumnFamily> entry = iter.next();
             return new LazyColumnIterator(entry.getKey(), new IColumnIteratorFactory()
             {
                 public OnDiskAtomIterator create()
                 {
-                    return range.columnFilter(entry.getKey().key).getColumnFamilyIterator(entry.getKey(), entry.getValue());
+                    return range.columnFilter(entry.getKey().getKey()).getColumnIterator(entry.getKey(), entry.getValue());
                 }
             });
         }
diff --git a/src/java/org/apache/cassandra/db/RowPosition.java b/src/java/org/apache/cassandra/db/RowPosition.java
index cb68620..3bcd627 100644
--- a/src/java/org/apache/cassandra/db/RowPosition.java
+++ b/src/java/org/apache/cassandra/db/RowPosition.java
@@ -18,16 +18,16 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public abstract class RowPosition implements RingPosition<RowPosition>
+public interface RowPosition extends RingPosition<RowPosition>
 {
     public static enum Kind
     {
@@ -43,20 +43,18 @@
         }
     }
 
+    public static final class ForKey
+    {
+        public static RowPosition get(ByteBuffer key, IPartitioner p)
+        {
+            return key == null || key.remaining() == 0 ? p.getMinimumToken().minKeyBound() : p.decorateKey(key);
+        }
+    }
+
     public static final RowPositionSerializer serializer = new RowPositionSerializer();
 
-    public static RowPosition forKey(ByteBuffer key, IPartitioner p)
-    {
-        return key == null || key.remaining() == 0 ? p.getMinimumToken().minKeyBound() : p.decorateKey(key);
-    }
-
-    public abstract Token getToken();
-    public abstract Kind kind();
-
-    public boolean isMinimum()
-    {
-        return isMinimum(StorageService.getPartitioner());
-    }
+    public Kind kind();
+    public boolean isMinimum();
 
     public static class RowPositionSerializer implements ISerializer<RowPosition>
     {
@@ -71,12 +69,12 @@
          * token is recreated on the other side). In the other cases, we then
          * serialize the token.
          */
-        public void serialize(RowPosition pos, DataOutput out) throws IOException
+        public void serialize(RowPosition pos, DataOutputPlus out) throws IOException
         {
             Kind kind = pos.kind();
             out.writeByte(kind.ordinal());
             if (kind == Kind.ROW_KEY)
-                ByteBufferUtil.writeWithShortLength(((DecoratedKey)pos).key, out);
+                ByteBufferUtil.writeWithShortLength(((DecoratedKey)pos).getKey(), out);
             else
                 Token.serializer.serialize(pos.getToken(), out);
         }
@@ -102,7 +100,7 @@
             int size = 1; // 1 byte for enum
             if (kind == Kind.ROW_KEY)
             {
-                int keySize = ((DecoratedKey)pos).key.remaining();
+                int keySize = ((DecoratedKey)pos).getKey().remaining();
                 size += typeSizes.sizeof((short) keySize) + keySize;
             }
             else
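
The serializer kept above frames a RowPosition as one byte for the Kind ordinal, followed by either a short-length-prefixed row key (ROW_KEY) or a serialized token (the bound kinds). The standalone size sketch below shows only that framing; the token size is left as an input since Token.serializer owns it, and the class name is hypothetical.

    // Illustrative only: the framing handled by RowPositionSerializer above.
    final class RowPositionSizeSketch
    {
        static long serializedSize(boolean isRowKey, int keyBytes, long tokenSerializedSize)
        {
            long size = 1;                                   // Kind ordinal, written as a single byte
            return size + (isRowKey ? 2 + keyBytes           // short length prefix + key bytes
                                    : tokenSerializedSize);  // delegated to Token.serializer
        }

        public static void main(String[] args)
        {
            System.out.println(serializedSize(true, 16, 0)); // ROW_KEY with a 16-byte key -> 19
            System.out.println(serializedSize(false, 0, 8)); // a bound, assuming an 8-byte token -> 9
        }
    }
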
diff --git a/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java b/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java
index ae3db78..b1829f3 100644
--- a/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java
@@ -20,13 +20,13 @@
 import java.io.*;
 import java.nio.ByteBuffer;
 
+import com.google.common.base.Objects;
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -58,13 +58,13 @@
     @Override
     public String toString()
     {
-        return "SliceByNamesReadCommand(" +
-               "keyspace='" + ksName + '\'' +
-               ", key=" + ByteBufferUtil.bytesToHex(key) +
-               ", cfName='" + cfName + '\'' +
-               ", timestamp='" + timestamp + '\'' +
-               ", filter=" + filter +
-               ')';
+        return Objects.toStringHelper(this)
+                      .add("ksName", ksName)
+                      .add("cfName", cfName)
+                      .add("key", ByteBufferUtil.bytesToHex(key))
+                      .add("filter", filter)
+                      .add("timestamp", timestamp)
+                      .toString();
     }
 
     public IDiskAtomFilter filter()
@@ -75,27 +75,17 @@
 
 class SliceByNamesReadCommandSerializer implements IVersionedSerializer<ReadCommand>
 {
-    public void serialize(ReadCommand cmd, DataOutput out, int version) throws IOException
-    {
-        serialize(cmd, null, out, version);
-    }
-
-    public void serialize(ReadCommand cmd, ByteBuffer superColumn, DataOutput out, int version) throws IOException
+    public void serialize(ReadCommand cmd, DataOutputPlus out, int version) throws IOException
     {
         SliceByNamesReadCommand command = (SliceByNamesReadCommand) cmd;
         out.writeBoolean(command.isDigestQuery());
         out.writeUTF(command.ksName);
         ByteBufferUtil.writeWithShortLength(command.key, out);
+        out.writeUTF(command.cfName);
+        out.writeLong(cmd.timestamp);
 
-        if (version < MessagingService.VERSION_20)
-            new QueryPath(command.cfName, superColumn).serialize(out);
-        else
-            out.writeUTF(command.cfName);
-
-        if (version >= MessagingService.VERSION_20)
-            out.writeLong(cmd.timestamp);
-
-        NamesQueryFilter.serializer.serialize(command.filter, out, version);
+        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
+        metadata.comparator.namesQueryFilterSerializer().serialize(command.filter, out, version);
     }
 
     public ReadCommand deserialize(DataInput in, int version) throws IOException
@@ -103,82 +93,29 @@
         boolean isDigest = in.readBoolean();
         String keyspaceName = in.readUTF();
         ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-
-        String cfName;
-        ByteBuffer sc = null;
-        if (version < MessagingService.VERSION_20)
-        {
-            QueryPath path = QueryPath.deserialize(in);
-            cfName = path.columnFamilyName;
-            sc = path.superColumnName;
-        }
-        else
-        {
-            cfName = in.readUTF();
-        }
-
-        long timestamp = version < MessagingService.VERSION_20 ? System.currentTimeMillis() : in.readLong();
-
+        String cfName = in.readUTF();
+        long timestamp = in.readLong();
         CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
-        ReadCommand command;
-        if (version < MessagingService.VERSION_20)
-        {
-            AbstractType<?> comparator;
-            if (metadata.cfType == ColumnFamilyType.Super)
-            {
-                CompositeType type = (CompositeType)metadata.comparator;
-                comparator = sc == null ? type.types.get(0) : type.types.get(1);
-            }
-            else
-            {
-                comparator = metadata.comparator;
-            }
-
-            IDiskAtomFilter filter = NamesQueryFilter.serializer.deserialize(in, version, comparator);
-
-            if (metadata.cfType == ColumnFamilyType.Super)
-                filter = SuperColumns.fromSCFilter((CompositeType)metadata.comparator, sc, filter);
-
-            // Due to SC compat, it's possible we get back a slice filter at this point
-            if (filter instanceof NamesQueryFilter)
-                command = new SliceByNamesReadCommand(keyspaceName, key, cfName, timestamp, (NamesQueryFilter)filter);
-            else
-                command = new SliceFromReadCommand(keyspaceName, key, cfName, timestamp, (SliceQueryFilter)filter);
-        }
-        else
-        {
-            NamesQueryFilter filter = NamesQueryFilter.serializer.deserialize(in, version, metadata.comparator);
-            command = new SliceByNamesReadCommand(keyspaceName, key, cfName, timestamp, filter);
-        }
-
+        NamesQueryFilter filter = metadata.comparator.namesQueryFilterSerializer().deserialize(in, version);
+        ReadCommand command = new SliceByNamesReadCommand(keyspaceName, key, cfName, timestamp, filter);
         command.setDigestQuery(isDigest);
         return command;
     }
 
     public long serializedSize(ReadCommand cmd, int version)
     {
-        return serializedSize(cmd, null, version);
-    }
-
-    public long serializedSize(ReadCommand cmd, ByteBuffer superColumn, int version)
-    {
         TypeSizes sizes = TypeSizes.NATIVE;
         SliceByNamesReadCommand command = (SliceByNamesReadCommand) cmd;
         int size = sizes.sizeof(command.isDigestQuery());
         int keySize = command.key.remaining();
 
+        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
+
         size += sizes.sizeof(command.ksName);
         size += sizes.sizeof((short)keySize) + keySize;
-
-        if (version < MessagingService.VERSION_20)
-            size += new QueryPath(command.cfName, superColumn).serializedSize(sizes);
-        else
-            size += sizes.sizeof(command.cfName);
-
-        if (version >= MessagingService.VERSION_20)
-            size += sizes.sizeof(cmd.timestamp);
-
-        size += NamesQueryFilter.serializer.serializedSize(command.filter, version);
+        size += sizes.sizeof(command.cfName);
+        size += sizes.sizeof(cmd.timestamp);
+        size += metadata.comparator.namesQueryFilterSerializer().serializedSize(command.filter, version);
 
         return size;
     }
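
With pre-2.0 message versions gone, the serializer above always writes one fixed layout: digest flag, keyspace name, short-length-prefixed key, column family name, timestamp, then the filter via the table comparator's namesQueryFilterSerializer(). The following is an illustration-only sketch of that framing using plain java.io; it is not the Cassandra serializer (the real code writes to a DataOutputPlus and delegates the filter to the comparator):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

// Illustrative only: mirrors the field order written by SliceByNamesReadCommandSerializer.serialize().
public class SliceByNamesWireSketch
{
    public static byte[] frame(boolean isDigest, String ksName, byte[] key, String cfName, long timestamp) throws IOException
    {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        out.writeBoolean(isDigest);   // digest flag
        out.writeUTF(ksName);         // keyspace name
        out.writeShort(key.length);   // key, written "with short length"
        out.write(key);
        out.writeUTF(cfName);         // column family name
        out.writeLong(timestamp);     // always present now that pre-2.0 peers are unsupported
        // ...followed by the NamesQueryFilter, serialized by the comparator's filter serializer
        out.flush();
        return bytes.toByteArray();
    }

    public static void main(String[] args) throws IOException
    {
        byte[] framed = frame(false, "ks", "k".getBytes(StandardCharsets.UTF_8), "cf", System.currentTimeMillis());
        System.out.println(framed.length + " bytes");
    }
}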
diff --git a/src/java/org/apache/cassandra/db/SliceFromReadCommand.java b/src/java/org/apache/cassandra/db/SliceFromReadCommand.java
index 7526796..f06b9dc 100644
--- a/src/java/org/apache/cassandra/db/SliceFromReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SliceFromReadCommand.java
@@ -18,30 +18,24 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import com.google.common.base.Objects;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.QueryPath;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.RowDataResolver;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class SliceFromReadCommand extends ReadCommand
 {
-    static final Logger logger = LoggerFactory.getLogger(SliceFromReadCommand.class);
-
     static final SliceFromReadCommandSerializer serializer = new SliceFromReadCommandSerializer();
 
     public final SliceQueryFilter filter;
@@ -124,39 +118,28 @@
     @Override
     public String toString()
     {
-        return "SliceFromReadCommand(" +
-               "keyspace='" + ksName + '\'' +
-               ", key='" + ByteBufferUtil.bytesToHex(key) + '\'' +
-               ", cfName='" + cfName + '\'' +
-               ", timestamp='" + timestamp + '\'' +
-               ", filter='" + filter + '\'' +
-               ')';
+        return Objects.toStringHelper(this)
+                      .add("ksName", ksName)
+                      .add("cfName", cfName)
+                      .add("key", ByteBufferUtil.bytesToHex(key))
+                      .add("filter", filter)
+                      .add("timestamp", timestamp)
+                      .toString();
     }
 }
 
 class SliceFromReadCommandSerializer implements IVersionedSerializer<ReadCommand>
 {
-    public void serialize(ReadCommand rm, DataOutput out, int version) throws IOException
-    {
-        serialize(rm, null, out, version);
-    }
-
-    public void serialize(ReadCommand rm, ByteBuffer superColumn, DataOutput out, int version) throws IOException
+    public void serialize(ReadCommand rm, DataOutputPlus out, int version) throws IOException
     {
         SliceFromReadCommand realRM = (SliceFromReadCommand)rm;
         out.writeBoolean(realRM.isDigestQuery());
         out.writeUTF(realRM.ksName);
         ByteBufferUtil.writeWithShortLength(realRM.key, out);
-
-        if (version < MessagingService.VERSION_20)
-            new QueryPath(realRM.cfName, superColumn).serialize(out);
-        else
-            out.writeUTF(realRM.cfName);
-
-        if (version >= MessagingService.VERSION_20)
-            out.writeLong(realRM.timestamp);
-
-        SliceQueryFilter.serializer.serialize(realRM.filter, out, version);
+        out.writeUTF(realRM.cfName);
+        out.writeLong(realRM.timestamp);
+        CFMetaData metadata = Schema.instance.getCFMetaData(realRM.ksName, realRM.cfName);
+        metadata.comparator.sliceQueryFilterSerializer().serialize(realRM.filter, out, version);
     }
 
     public ReadCommand deserialize(DataInput in, int version) throws IOException
@@ -164,36 +147,10 @@
         boolean isDigest = in.readBoolean();
         String keyspaceName = in.readUTF();
         ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-
-        String cfName;
-        ByteBuffer sc = null;
-        if (version < MessagingService.VERSION_20)
-        {
-            QueryPath path = QueryPath.deserialize(in);
-            cfName = path.columnFamilyName;
-            sc = path.superColumnName;
-        }
-        else
-        {
-            cfName = in.readUTF();
-        }
-
-        long timestamp = version < MessagingService.VERSION_20 ? System.currentTimeMillis() : in.readLong();
-
+        String cfName = in.readUTF();
+        long timestamp = in.readLong();
         CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
-        SliceQueryFilter filter;
-        if (version < MessagingService.VERSION_20)
-        {
-            filter = SliceQueryFilter.serializer.deserialize(in, version);
-
-            if (metadata.cfType == ColumnFamilyType.Super)
-                filter = SuperColumns.fromSCSliceFilter((CompositeType)metadata.comparator, sc, filter);
-        }
-        else
-        {
-            filter = SliceQueryFilter.serializer.deserialize(in, version);
-        }
-
+        SliceQueryFilter filter = metadata.comparator.sliceQueryFilterSerializer().deserialize(in, version);
         ReadCommand command = new SliceFromReadCommand(keyspaceName, key, cfName, timestamp, filter);
         command.setDigestQuery(isDigest);
         return command;
@@ -201,28 +158,18 @@
 
     public long serializedSize(ReadCommand cmd, int version)
     {
-        return serializedSize(cmd, null, version);
-    }
-
-    public long serializedSize(ReadCommand cmd, ByteBuffer superColumn, int version)
-    {
         TypeSizes sizes = TypeSizes.NATIVE;
         SliceFromReadCommand command = (SliceFromReadCommand) cmd;
         int keySize = command.key.remaining();
 
+        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
+
         int size = sizes.sizeof(cmd.isDigestQuery()); // boolean
         size += sizes.sizeof(command.ksName);
         size += sizes.sizeof((short) keySize) + keySize;
-
-        if (version < MessagingService.VERSION_20)
-            size += new QueryPath(command.cfName, superColumn).serializedSize(sizes);
-        else
-            size += sizes.sizeof(command.cfName);
-
-        if (version >= MessagingService.VERSION_20)
-            size += sizes.sizeof(cmd.timestamp);
-
-        size += SliceQueryFilter.serializer.serializedSize(command.filter, version);
+        size += sizes.sizeof(command.cfName);
+        size += sizes.sizeof(cmd.timestamp);
+        size += metadata.comparator.sliceQueryFilterSerializer().serializedSize(command.filter, version);
 
         return size;
     }
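
Both read commands now build their toString() with Guava's Objects.toStringHelper instead of hand-rolled string concatenation. A small self-contained illustration of that builder (later Guava versions moved the same helper to MoreObjects.toStringHelper):

import com.google.common.base.Objects;

// Minimal illustration of the Guava toString builder used by the read commands above.
public class ToStringHelperExample
{
    private final String ksName = "ks";
    private final String cfName = "cf";
    private final long timestamp = 42L;

    @Override
    public String toString()
    {
        // Renders as "ToStringHelperExample{ksName=ks, cfName=cf, timestamp=42}"
        return Objects.toStringHelper(this)
                      .add("ksName", ksName)
                      .add("cfName", cfName)
                      .add("timestamp", timestamp)
                      .toString();
    }

    public static void main(String[] args)
    {
        System.out.println(new ToStringHelperExample());
    }
}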
diff --git a/src/java/org/apache/cassandra/db/SnapshotCommand.java b/src/java/org/apache/cassandra/db/SnapshotCommand.java
index 7bec637..427e9ec 100644
--- a/src/java/org/apache/cassandra/db/SnapshotCommand.java
+++ b/src/java/org/apache/cassandra/db/SnapshotCommand.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 
@@ -59,7 +59,7 @@
 
 class SnapshotCommandSerializer implements IVersionedSerializer<SnapshotCommand>
 {
-    public void serialize(SnapshotCommand snapshot_command, DataOutput out, int version) throws IOException
+    public void serialize(SnapshotCommand snapshot_command, DataOutputPlus out, int version) throws IOException
     {
         out.writeUTF(snapshot_command.keyspace);
         out.writeUTF(snapshot_command.column_family);
diff --git a/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java b/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java
new file mode 100644
index 0000000..97caea1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.Map;
+import javax.management.openmbean.*;
+
+import com.google.common.base.Throwables;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Pair;
+
+
+
+
+public class SnapshotDetailsTabularData
+{
+
+    private static final String[] ITEM_NAMES = new String[]{"Snapshot name",
+            "Keyspace name",
+            "Column family name",
+            "True size",
+            "Size on disk"};
+
+    private static final String[] ITEM_DESCS = new String[]{"snapshot_name",
+            "keyspace_name",
+            "columnfamily_name",
+            "TrueDiskSpaceUsed",
+            "TotalDiskSpaceUsed"};
+
+    private static final String TYPE_NAME = "SnapshotDetails";
+
+    private static final String ROW_DESC = "SnapshotDetails";
+
+    private static final OpenType<?>[] ITEM_TYPES;
+
+    private static final CompositeType COMPOSITE_TYPE;
+
+    public static final TabularType TABULAR_TYPE;
+
+    static
+    {
+        try
+        {
+            ITEM_TYPES = new OpenType[]{ SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING };
+
+            COMPOSITE_TYPE = new CompositeType(TYPE_NAME, ROW_DESC, ITEM_NAMES, ITEM_DESCS, ITEM_TYPES);
+
+            TABULAR_TYPE = new TabularType(TYPE_NAME, ROW_DESC, COMPOSITE_TYPE, ITEM_NAMES);
+        }
+        catch (OpenDataException e)
+        {
+            throw Throwables.propagate(e);
+        }
+    }
+
+
+    public static void from(final String snapshot, final String ks, final String cf, Map.Entry<String, Pair<Long,Long>> snapshotDetail, TabularDataSupport result)
+    {
+        try
+        {
+            final String totalSize = FileUtils.stringifyFileSize(snapshotDetail.getValue().left);
+            final String liveSize = FileUtils.stringifyFileSize(snapshotDetail.getValue().right);
+            result.put(new CompositeDataSupport(COMPOSITE_TYPE, ITEM_NAMES,
+                    new Object[]{ snapshot, ks, cf, liveSize, totalSize }));
+        }
+        catch (OpenDataException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}
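
SnapshotDetailsTabularData exposes snapshot sizes through standard JMX open MBean types. The sketch below shows how a caller might assemble the tabular result one row at a time; the snapshot name, keyspace, and size strings are made-up illustrative values, not output of a real API call:

import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.TabularDataSupport;

// Hypothetical caller: builds the JMX table row by row, the way an MBean returning
// snapshot details would. All values below are illustrative only.
public class SnapshotTableSketch
{
    public static void main(String[] args) throws OpenDataException
    {
        TabularDataSupport result = new TabularDataSupport(SnapshotDetailsTabularData.TABULAR_TYPE);
        // One row per (snapshot, keyspace, column family); sizes are already human-readable strings.
        result.put(new CompositeDataSupport(SnapshotDetailsTabularData.TABULAR_TYPE.getRowType(),
                                            new String[]{ "Snapshot name", "Keyspace name", "Column family name", "True size", "Size on disk" },
                                            new Object[]{ "backup-2014-10-01", "ks1", "cf1", "1.2 MB", "3.4 MB" }));
        System.out.println(result.size() + " snapshot row(s)");
    }
}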
diff --git a/src/java/org/apache/cassandra/db/SuperColumns.java b/src/java/org/apache/cassandra/db/SuperColumns.java
index 0f74587..2006cbd 100644
--- a/src/java/org/apache/cassandra/db/SuperColumns.java
+++ b/src/java/org/apache/cassandra/db/SuperColumns.java
@@ -18,123 +18,39 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOError;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.LinkedHashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class SuperColumns
 {
-    public static Iterator<OnDiskAtom> onDiskIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore)
+    public static Iterator<OnDiskAtom> onDiskIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore, CellNameType type)
     {
-        return new SCIterator(in, superColumnCount, flag, expireBefore);
-    }
-
-    public static void serializeSuperColumnFamily(ColumnFamily scf, DataOutput out, int version) throws IOException
-    {
-        /*
-         * There is 2 complications:
-         *   1) We need to know the number of super columns in the column
-         *   family to write in the header (so we do a first pass to group
-         *   columns before serializing).
-         *   2) For deletion infos, we need to figure out which are top-level
-         *   deletions and which are super columns deletions (i.e. the
-         *   subcolumns range deletions).
-         */
-        DeletionInfo delInfo = scf.deletionInfo();
-        Map<ByteBuffer, List<Column>> scMap = groupSuperColumns(scf);
-
-        // Actually Serialize
-        DeletionInfo.serializer().serialize(new DeletionInfo(delInfo.getTopLevelDeletion()), out, version);
-        out.writeInt(scMap.size());
-
-        for (Map.Entry<ByteBuffer, List<Column>> entry : scMap.entrySet())
-        {
-            ByteBufferUtil.writeWithShortLength(entry.getKey(), out);
-
-            DeletionTime delTime = delInfo.rangeCovering(entry.getKey());
-            DeletionInfo scDelInfo = delTime == null ? DeletionInfo.live() : new DeletionInfo(delTime);
-            DeletionTime.serializer.serialize(scDelInfo.getTopLevelDeletion(), out);
-
-            out.writeInt(entry.getValue().size());
-            for (Column subColumn : entry.getValue())
-                Column.serializer.serialize(subColumn, out);
-        }
-    }
-
-    private static Map<ByteBuffer, List<Column>> groupSuperColumns(ColumnFamily scf)
-    {
-        CompositeType type = (CompositeType)scf.getComparator();
-        // The order of insertion matters!
-        Map<ByteBuffer, List<Column>> scMap = new LinkedHashMap<ByteBuffer, List<Column>>();
-
-        ByteBuffer scName = null;
-        List<Column> subColumns = null;
-        for (Column column : scf)
-        {
-            ByteBuffer newScName = scName(column.name());
-            ByteBuffer newSubName = subName(column.name());
-
-            if (scName == null || type.types.get(0).compare(scName, newScName) != 0)
-            {
-                // new super column
-                scName = newScName;
-                subColumns = new ArrayList<Column>();
-                scMap.put(scName, subColumns);
-            }
-
-            subColumns.add(((Column)column).withUpdatedName(newSubName));
-        }
-        return scMap;
+        return new SCIterator(in, superColumnCount, flag, expireBefore, type);
     }
 
     public static void deserializerSuperColumnFamily(DataInput in, ColumnFamily cf, ColumnSerializer.Flag flag, int version) throws IOException
     {
         // Note that there was no way to insert a range tombstone in a SCF in 1.2
-        cf.delete(DeletionInfo.serializer().deserialize(in, version, cf.getComparator()));
+        cf.delete(cf.getComparator().deletionInfoSerializer().deserialize(in, version));
         assert !cf.deletionInfo().rangeIterator().hasNext();
 
-        Iterator<OnDiskAtom> iter = onDiskIterator(in, in.readInt(), flag, Integer.MIN_VALUE);
+        Iterator<OnDiskAtom> iter = onDiskIterator(in, in.readInt(), flag, Integer.MIN_VALUE, cf.getComparator());
         while (iter.hasNext())
             cf.addAtom(iter.next());
     }
 
-    public static long serializedSize(ColumnFamily scf, TypeSizes typeSizes, int version)
-    {
-        Map<ByteBuffer, List<Column>> scMap = groupSuperColumns(scf);
-        DeletionInfo delInfo = scf.deletionInfo();
-
-        // Actually Serialize
-        long size = DeletionInfo.serializer().serializedSize(new DeletionInfo(delInfo.getTopLevelDeletion()), version);
-        for (Map.Entry<ByteBuffer, List<Column>> entry : scMap.entrySet())
-        {
-            int nameSize = entry.getKey().remaining();
-            size += typeSizes.sizeof((short) nameSize) + nameSize;
-
-            DeletionTime delTime = delInfo.rangeCovering(entry.getKey());
-            DeletionInfo scDelInfo = delTime == null ? DeletionInfo.live() : new DeletionInfo(delTime);
-            size += DeletionTime.serializer.serializedSize(scDelInfo.getTopLevelDeletion(), TypeSizes.NATIVE);
-
-            size += typeSizes.sizeof(entry.getValue().size());
-            for (Column subColumn : entry.getValue())
-                size += Column.serializer.serializedSize(subColumn, typeSizes);
-        }
-        return size;
-    }
-
     private static class SCIterator implements Iterator<OnDiskAtom>
     {
         private final DataInput in;
@@ -143,16 +59,19 @@
         private final ColumnSerializer.Flag flag;
         private final int expireBefore;
 
+        private final CellNameType type;
+
         private int read;
         private ByteBuffer scName;
-        private Iterator<Column> subColumnsIterator;
+        private Iterator<Cell> subColumnsIterator;
 
-        private SCIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore)
+        private SCIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore, CellNameType type)
         {
             this.in = in;
             this.scCount = superColumnCount;
             this.flag = flag;
             this.expireBefore = expireBefore;
+            this.type = type;
         }
 
         public boolean hasNext()
@@ -166,8 +85,8 @@
             {
                 if (subColumnsIterator != null && subColumnsIterator.hasNext())
                 {
-                    Column c = subColumnsIterator.next();
-                    return c.withUpdatedName(CompositeType.build(scName, c.name()));
+                    Cell c = subColumnsIterator.next();
+                    return c.withUpdatedName(type.makeCellName(scName, c.name().toByteBuffer()));
                 }
 
                 // Read one more super column
@@ -175,16 +94,16 @@
 
                 scName = ByteBufferUtil.readWithShortLength(in);
                 DeletionInfo delInfo = new DeletionInfo(DeletionTime.serializer.deserialize(in));
-                assert !delInfo.rangeIterator().hasNext(); // We assume no range tombstone (there was no way to insert some in a SCF in 1.2)
 
                 /* read the number of columns */
                 int size = in.readInt();
-                List<Column> subColumns = new ArrayList<Column>(size);
+                List<Cell> subCells = new ArrayList<>(size);
 
+                ColumnSerializer colSer = subType(type).columnSerializer();
                 for (int i = 0; i < size; ++i)
-                    subColumns.add(Column.serializer.deserialize(in, flag, expireBefore));
+                    subCells.add(colSer.deserialize(in, flag, expireBefore));
 
-                subColumnsIterator = subColumns.iterator();
+                subColumnsIterator = subCells.iterator();
 
                 // If the SC was deleted, return that first, otherwise return the first subcolumn
                 DeletionTime dtime = delInfo.getTopLevelDeletion();
@@ -205,6 +124,16 @@
         }
     }
 
+    private static CellNameType subType(CellNameType type)
+    {
+        return new SimpleDenseCellNameType(type.subtype(1));
+    }
+
+    public static CellNameType scNameType(CellNameType type)
+    {
+        return new SimpleDenseCellNameType(type.subtype(0));
+    }
+
     public static AbstractType<?> getComparatorFor(CFMetaData metadata, ByteBuffer superColumn)
     {
         return getComparatorFor(metadata, superColumn != null);
@@ -213,147 +142,33 @@
     public static AbstractType<?> getComparatorFor(CFMetaData metadata, boolean subColumn)
     {
         return metadata.isSuper()
-             ? ((CompositeType)metadata.comparator).types.get(subColumn ? 1 : 0)
-             : metadata.comparator;
+             ? metadata.comparator.subtype(subColumn ? 1 : 0)
+             : metadata.comparator.asAbstractType();
     }
 
     // Extract the first component of a columnName, i.e. the super column name
-    public static ByteBuffer scName(ByteBuffer columnName)
+    public static ByteBuffer scName(Composite columnName)
     {
-        return CompositeType.extractComponent(columnName, 0);
+        return columnName.get(0);
     }
 
     // Extract the 2nd component of a columnName, i.e. the sub-column name
-    public static ByteBuffer subName(ByteBuffer columnName)
+    public static ByteBuffer subName(Composite columnName)
     {
-        return CompositeType.extractComponent(columnName, 1);
+        return columnName.get(1);
     }
 
-    // We don't use CompositeType.Builder mostly because we want to avoid having to provide the comparator.
-    public static ByteBuffer startOf(ByteBuffer scName)
+    public static Composite startOf(ByteBuffer scName)
     {
-        int length = scName.remaining();
-        ByteBuffer bb = ByteBuffer.allocate(2 + length + 1);
-
-        bb.put((byte) ((length >> 8) & 0xFF));
-        bb.put((byte) (length & 0xFF));
-        bb.put(scName.duplicate());
-        bb.put((byte) 0);
-        bb.flip();
-        return bb;
+        return CellNames.compositeDense(scName).start();
     }
 
-    public static ByteBuffer endOf(ByteBuffer scName)
+    public static Composite endOf(ByteBuffer scName)
     {
-        ByteBuffer bb = startOf(scName);
-        bb.put(bb.remaining() - 1, (byte)1);
-        return bb;
+        return CellNames.compositeDense(scName).end();
     }
 
-    public static SCFilter filterToSC(CompositeType type, IDiskAtomFilter filter)
-    {
-        if (filter instanceof NamesQueryFilter)
-            return namesFilterToSC(type, (NamesQueryFilter)filter);
-        else
-            return sliceFilterToSC(type, (SliceQueryFilter)filter);
-    }
-
-    public static SCFilter namesFilterToSC(CompositeType type, NamesQueryFilter filter)
-    {
-        SortedSet<ByteBuffer> newColumns = new TreeSet<>(type.types.get(1));
-        ByteBuffer scName = scName(filter.columns.first());
-        for (ByteBuffer name : filter.columns)
-        {
-            // If we're selecting column across multiple SC, it's not something we can translate for an old node
-            if (type.types.get(0).compare(scName, scName(name)) != 0)
-                throw new RuntimeException("Cannot convert filter to old super column format. Update all nodes to Cassandra 2.0 first.");
-
-            newColumns.add(subName(name));
-        }
-        return new SCFilter(scName, new NamesQueryFilter(newColumns));
-    }
-
-    public static SCFilter sliceFilterToSC(CompositeType type, SliceQueryFilter filter)
-    {
-        /*
-         * There is 3 main cases that we can translate back into super column
-         * queries:
-         *   1) We have only one slice where the first component of start and
-         *   finish is the same, we translate as a slice query on one SC.
-         *   2) We have only one slice, neither the start and finish have a 2nd
-         *   component, and end has the 'end of component' set, we translate
-         *   as a slice of SCs.
-         *   3) Each slice has the same first component for start and finish, no
-         *   2nd component and each finish has the 'end of component' set, we
-         *   translate as a names query of SCs (the filter must then not be reversed).
-         * Otherwise, we can't do much.
-         */
-
-        boolean reversed = filter.reversed;
-        if (filter.slices.length == 1)
-        {
-            ByteBuffer start = filter.slices[0].start;
-            ByteBuffer finish = filter.slices[0].start;
-
-            if (filter.compositesToGroup == 1)
-            {
-                // Note: all the resulting filter must have compositeToGroup == 0 because this
-                // make no sense for super column on the destination node otherwise
-                if (start.remaining() == 0)
-                {
-                    if (finish.remaining() == 0)
-                        // An 'IdentityFilter', keep as is (except for the compositeToGroup)
-                        return new SCFilter(null, new SliceQueryFilter(filter.start(), filter.finish(), reversed, filter.count));
-
-                    if (subName(finish) == null
-                            && ((!reversed && !firstEndOfComponent(finish)) || (reversed && firstEndOfComponent(finish))))
-                        return new SCFilter(null, new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, scName(finish), reversed, filter.count));
-                }
-                else if (finish.remaining() == 0)
-                {
-                    if (subName(start) == null
-                            && ((!reversed && firstEndOfComponent(start)) || (reversed && !firstEndOfComponent(start))))
-                        return new SCFilter(null, new SliceQueryFilter(scName(start), ByteBufferUtil.EMPTY_BYTE_BUFFER, reversed, filter.count));
-                }
-                else if (subName(start) == null && subName(finish) == null
-                        && ((   reversed && !firstEndOfComponent(start) &&  firstEndOfComponent(finish))
-                            || (!reversed &&  firstEndOfComponent(start) && !firstEndOfComponent(finish))))
-                {
-                    // A slice of supercolumns
-                    return new SCFilter(null, new SliceQueryFilter(scName(start), scName(finish), reversed, filter.count));
-                }
-            }
-            else if (filter.compositesToGroup == -1 && type.types.get(0).compare(scName(start), scName(finish)) == 0)
-            {
-                // A slice of subcolumns
-                ByteBuffer newStart = subName(start);
-                ByteBuffer newFinish = subName(finish);
-                return new SCFilter(scName(start),
-                                    filter.withUpdatedSlice(newStart  == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : newStart,
-                                                            newFinish == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : newFinish));
-            }
-        }
-        else if (!reversed)
-        {
-            SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(type.types.get(0));
-            for (int i = 0; i < filter.slices.length; ++i)
-            {
-                ByteBuffer start = filter.slices[i].start;
-                ByteBuffer finish = filter.slices[i].finish;
-
-                if (subName(start) != null || subName(finish) != null
-                  || type.types.get(0).compare(scName(start), scName(finish)) != 0
-                  || firstEndOfComponent(start) || !firstEndOfComponent(finish))
-                    throw new RuntimeException("Cannot convert filter to old super column format. Update all nodes to Cassandra 2.0 first.");
-
-                columns.add(scName(start));
-            }
-            return new SCFilter(null, new NamesQueryFilter(columns));
-        }
-        throw new RuntimeException("Cannot convert filter to old super column format. Update all nodes to Cassandra 2.0 first.");
-    }
-
-    public static IDiskAtomFilter fromSCFilter(CompositeType type, ByteBuffer scName, IDiskAtomFilter filter)
+    public static IDiskAtomFilter fromSCFilter(CellNameType type, ByteBuffer scName, IDiskAtomFilter filter)
     {
         if (filter instanceof NamesQueryFilter)
             return fromSCNamesFilter(type, scName, (NamesQueryFilter)filter);
@@ -361,74 +176,55 @@
             return fromSCSliceFilter(type, scName, (SliceQueryFilter)filter);
     }
 
-    public static IDiskAtomFilter fromSCNamesFilter(CompositeType type, ByteBuffer scName, NamesQueryFilter filter)
+    public static IDiskAtomFilter fromSCNamesFilter(CellNameType type, ByteBuffer scName, NamesQueryFilter filter)
     {
         if (scName == null)
         {
             ColumnSlice[] slices = new ColumnSlice[filter.columns.size()];
             int i = 0;
-            for (ByteBuffer bb : filter.columns)
+            for (CellName name : filter.columns)
             {
-                CompositeType.Builder builder = type.builder().add(bb);
-                slices[i++] = new ColumnSlice(builder.build(), builder.buildAsEndOfRange());
+                // Note that, because the filter in argument comes from thrift, each 'name' is a SimpleDenseCellName.
+                // So calling name.slice() would be incorrect, as simple cell names don't handle the EOC properly.
+                // This is why we call toByteBuffer() and rebuild a Composite of the right type before calling slice().
+                slices[i++] = type.make(name.toByteBuffer()).slice();
             }
             return new SliceQueryFilter(slices, false, slices.length, 1);
         }
         else
         {
-            SortedSet<ByteBuffer> newColumns = new TreeSet<ByteBuffer>(type);
-            for (ByteBuffer c : filter.columns)
-                newColumns.add(CompositeType.build(scName, c));
+            SortedSet<CellName> newColumns = new TreeSet<>(type);
+            for (CellName c : filter.columns)
+                newColumns.add(type.makeCellName(scName, c.toByteBuffer()));
             return filter.withUpdatedColumns(newColumns);
         }
     }
 
-    public static SliceQueryFilter fromSCSliceFilter(CompositeType type, ByteBuffer scName, SliceQueryFilter filter)
+    public static SliceQueryFilter fromSCSliceFilter(CellNameType type, ByteBuffer scName, SliceQueryFilter filter)
     {
         assert filter.slices.length == 1;
         if (scName == null)
         {
-            ByteBuffer start = filter.start().remaining() == 0
-                             ? filter.start()
-                             : (filter.reversed ? type.builder().add(filter.start()).buildAsEndOfRange()
-                                                : type.builder().add(filter.start()).build());
-            ByteBuffer finish = filter.finish().remaining() == 0
-                              ? filter.finish()
-                              : (filter.reversed ? type.builder().add(filter.finish()).build()
-                                                 : type.builder().add(filter.finish()).buildAsEndOfRange());
+            // The filter is on the super column name
+            CBuilder builder = type.builder();
+            Composite start = filter.start().isEmpty()
+                            ? Composites.EMPTY
+                            : builder.buildWith(filter.start().toByteBuffer()).withEOC(filter.reversed ? Composite.EOC.END : Composite.EOC.START);
+            Composite finish = filter.finish().isEmpty()
+                             ? Composites.EMPTY
+                             : builder.buildWith(filter.finish().toByteBuffer()).withEOC(filter.reversed ? Composite.EOC.START : Composite.EOC.END);
             return new SliceQueryFilter(start, finish, filter.reversed, filter.count, 1);
         }
         else
         {
-            CompositeType.Builder builder = type.builder().add(scName);
-            ByteBuffer start = filter.start().remaining() == 0
-                             ? filter.reversed ? builder.buildAsEndOfRange() : builder.build()
-                             : builder.copy().add(filter.start()).build();
-            ByteBuffer end = filter.finish().remaining() == 0
-                             ? filter.reversed ? builder.build() : builder.buildAsEndOfRange()
-                             : builder.add(filter.finish()).build();
+            CBuilder builder = type.builder().add(scName);
+            Composite start = filter.start().isEmpty()
+                            ? builder.build().withEOC(filter.reversed ? Composite.EOC.END : Composite.EOC.START)
+                            : builder.buildWith(filter.start().toByteBuffer());
+            Composite end = filter.finish().isEmpty()
+                          ? builder.build().withEOC(filter.reversed ? Composite.EOC.START : Composite.EOC.END)
+                          : builder.buildWith(filter.finish().toByteBuffer());
             return new SliceQueryFilter(start, end, filter.reversed, filter.count);
         }
     }
-
-    private static boolean firstEndOfComponent(ByteBuffer bb)
-    {
-        bb = bb.duplicate();
-        int length = (bb.get() & 0xFF) << 8;
-        length |= (bb.get() & 0xFF);
-
-        return bb.get(length + 2) == 1;
-    }
-
-    public static class SCFilter
-    {
-        public final ByteBuffer scName;
-        public final IDiskAtomFilter updatedFilter;
-
-        public SCFilter(ByteBuffer scName, IDiskAtomFilter updatedFilter)
-        {
-            this.scName = scName;
-            this.updatedFilter = updatedFilter;
-        }
-    }
 }
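
The removed startOf()/endOf() encoded the legacy composite by hand: a two-byte component length, the component bytes, then a trailing end-of-component byte (0 marks the start of a range, 1 the end). The new code delegates that to CellNames.compositeDense(scName).start()/.end(). For reference, a standalone sketch of the old byte layout, taken directly from the removed methods:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// Reference-only sketch of the legacy composite encoding the removed SuperColumns.startOf()/endOf()
// produced: [2-byte length][component bytes][EOC byte].
public class LegacyCompositeSketch
{
    static ByteBuffer startOf(ByteBuffer scName)
    {
        int length = scName.remaining();
        ByteBuffer bb = ByteBuffer.allocate(2 + length + 1);
        bb.put((byte) ((length >> 8) & 0xFF));   // high byte of the component length
        bb.put((byte) (length & 0xFF));          // low byte of the component length
        bb.put(scName.duplicate());              // the super column name itself
        bb.put((byte) 0);                        // end-of-component byte: 0 = start of range
        bb.flip();
        return bb;
    }

    static ByteBuffer endOf(ByteBuffer scName)
    {
        ByteBuffer bb = startOf(scName);
        bb.put(bb.remaining() - 1, (byte) 1);    // flip the EOC byte: 1 = end of range
        return bb;
    }

    public static void main(String[] args)
    {
        ByteBuffer sc = ByteBuffer.wrap("sc1".getBytes(StandardCharsets.UTF_8));
        System.out.println(startOf(sc).remaining() + " bytes; only the EOC byte differs between startOf and endOf");
    }
}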
diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java
index 5b77f63..3c647b6 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspace.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java
@@ -29,10 +29,10 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.SetMultimap;
 import com.google.common.collect.Sets;
-import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.KSMetaData;
@@ -42,6 +42,7 @@
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.compaction.CompactionHistoryTabularData;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.dht.Range;
@@ -58,7 +59,8 @@
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.*;
 
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
 
 public class SystemKeyspace
 {
@@ -69,7 +71,6 @@
     public static final String PEER_EVENTS_CF = "peer_events";
     public static final String LOCAL_CF = "local";
     public static final String INDEX_CF = "IndexInfo";
-    public static final String COUNTER_ID_CF = "NodeIdInfo";
     public static final String HINTS_CF = "hints";
     public static final String RANGE_XFERS_CF = "range_xfers";
     public static final String BATCHLOG_CF = "batchlog";
@@ -78,18 +79,19 @@
     public static final String SCHEMA_COLUMNFAMILIES_CF = "schema_columnfamilies";
     public static final String SCHEMA_COLUMNS_CF = "schema_columns";
     public static final String SCHEMA_TRIGGERS_CF = "schema_triggers";
+    public static final String SCHEMA_USER_TYPES_CF = "schema_usertypes";
     public static final String COMPACTION_LOG = "compactions_in_progress";
     public static final String PAXOS_CF = "paxos";
     public static final String SSTABLE_ACTIVITY_CF = "sstable_activity";
     public static final String COMPACTION_HISTORY_CF = "compaction_history";
 
     private static final String LOCAL_KEY = "local";
-    private static final ByteBuffer ALL_LOCAL_NODE_ID_KEY = ByteBufferUtil.bytes("Local");
 
     public static final List<String> allSchemaCfs = Arrays.asList(SCHEMA_KEYSPACES_CF,
                                                                   SCHEMA_COLUMNFAMILIES_CF,
                                                                   SCHEMA_COLUMNS_CF,
-                                                                  SCHEMA_TRIGGERS_CF);
+                                                                  SCHEMA_TRIGGERS_CF,
+                                                                  SCHEMA_USER_TYPES_CF);
 
     private static volatile Map<UUID, Pair<ReplayPosition, Long>> truncationRecords;
 
@@ -109,8 +111,8 @@
     {
         setupVersion();
 
-        copyAllAliasesToColumnsProper();
-
+        migrateIndexInterval();
+        migrateCachingOption();
         // add entries to system schema columnfamilies for the hardcoded system definitions
         for (String ksname : Schema.systemKeyspaceNames)
         {
@@ -118,30 +120,42 @@
 
             // delete old, possibly obsolete entries in schema columnfamilies
             for (String cfname : Arrays.asList(SystemKeyspace.SCHEMA_KEYSPACES_CF, SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, SystemKeyspace.SCHEMA_COLUMNS_CF))
-            {
-                String req = String.format("DELETE FROM system.%s WHERE keyspace_name = '%s'", cfname, ksmd.name);
-                processInternal(req);
-            }
+                executeOnceInternal(String.format("DELETE FROM system.%s WHERE keyspace_name = ?", cfname), ksmd.name);
 
             // (+1 to timestamp to make sure we don't get shadowed by the tombstones we just added)
             ksmd.toSchema(FBUtilities.timestampMicros() + 1).apply();
         }
     }
 
-    // Starting with 2.0 (CASSANDRA-5125) we keep all the 'aliases' in system.schema_columns together with the regular columns,
-    // but only for the newly-created tables. This migration is for the pre-2.0 created tables.
-    private static void copyAllAliasesToColumnsProper()
+    private static void setupVersion()
     {
-        for (UntypedResultSet.Row row : processInternal(String.format("SELECT * FROM system.%s", SCHEMA_COLUMNFAMILIES_CF)))
+        String req = "INSERT INTO system.%s (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner) VALUES (?, ?, ?, ?, ?, ?, ?, ?)";
+        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
+        executeOnceInternal(String.format(req, LOCAL_CF),
+                            LOCAL_KEY,
+                            FBUtilities.getReleaseVersionString(),
+                            QueryProcessor.CQL_VERSION.toString(),
+                            cassandraConstants.VERSION,
+                            String.valueOf(Server.CURRENT_VERSION),
+                            snitch.getDatacenter(FBUtilities.getBroadcastAddress()),
+                            snitch.getRack(FBUtilities.getBroadcastAddress()),
+                            DatabaseDescriptor.getPartitioner().getClass().getName());
+    }
+
+    // TODO: In 3.0, remove this and the index_interval column from system.schema_columnfamilies
+    /** Migrates index_interval values to min_index_interval and sets index_interval to null */
+    private static void migrateIndexInterval()
+    {
+        for (UntypedResultSet.Row row : executeOnceInternal(String.format("SELECT * FROM system.%s", SCHEMA_COLUMNFAMILIES_CF)))
         {
+            if (!row.has("index_interval"))
+                continue;
+
+            logger.debug("Migrating index_interval to min_index_interval");
+
             CFMetaData table = CFMetaData.fromSchema(row);
-            String query = String.format("SELECT writetime(type) "
-                                         + "FROM system.%s "
-                                         + "WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
-                                         SCHEMA_COLUMNFAMILIES_CF,
-                                         table.ksName,
-                                         table.cfName);
-            long timestamp = processInternal(query).one().getLong("writetime(type)");
+            String query = String.format("SELECT writetime(type) FROM system.%s WHERE keyspace_name = ? AND columnfamily_name = ?", SCHEMA_COLUMNFAMILIES_CF);
+            long timestamp = executeOnceInternal(query, table.ksName, table.cfName).one().getLong("writetime(type)");
             try
             {
                 table.toSchema(timestamp).apply();
@@ -153,19 +167,29 @@
         }
     }
 
-    private static void setupVersion()
+    private static void migrateCachingOption()
     {
-        String req = "INSERT INTO system.%s (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')";
-        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
-        processInternal(String.format(req, LOCAL_CF,
-                                         LOCAL_KEY,
-                                         FBUtilities.getReleaseVersionString(),
-                                         QueryProcessor.CQL_VERSION.toString(),
-                                         cassandraConstants.VERSION,
-                                         Server.CURRENT_VERSION,
-                                         snitch.getDatacenter(FBUtilities.getBroadcastAddress()),
-                                         snitch.getRack(FBUtilities.getBroadcastAddress()),
-                                         DatabaseDescriptor.getPartitioner().getClass().getName()));
+        for (UntypedResultSet.Row row : executeOnceInternal(String.format("SELECT * FROM system.%s", SCHEMA_COLUMNFAMILIES_CF)))
+        {
+            if (!row.has("caching"))
+                continue;
+
+            if (!CachingOptions.isLegacy(row.getString("caching")))
+                continue;
+            try
+            {
+                CachingOptions caching = CachingOptions.fromString(row.getString("caching"));
+                CFMetaData table = CFMetaData.fromSchema(row);
+                logger.info("Migrating caching option {} to {} for {}.{}", row.getString("caching"), caching.toString(), table.ksName, table.cfName);
+                String query = String.format("SELECT writetime(type) FROM system.%s WHERE keyspace_name = ? AND columnfamily_name = ?", SCHEMA_COLUMNFAMILIES_CF);
+                long timestamp = executeOnceInternal(query, table.ksName, table.cfName).one().getLong("writetime(type)");
+                table.toSchema(timestamp).apply();
+            }
+            catch (ConfigurationException e)
+            {
+                // shouldn't happen
+            }
+        }
     }
 
     /**
@@ -181,7 +205,6 @@
             return null;
 
         UUID compactionId = UUIDGen.getTimeUUID();
-        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, inputs) VALUES (%s, '%s', '%s', {%s})";
         Iterable<Integer> generations = Iterables.transform(toCompact, new Function<SSTableReader, Integer>()
         {
             public Integer apply(SSTableReader sstable)
@@ -189,7 +212,8 @@
                 return sstable.descriptor.generation;
             }
         });
-        processInternal(String.format(req, COMPACTION_LOG, compactionId, cfs.keyspace.getName(), cfs.name, StringUtils.join(Sets.newHashSet(generations), ',')));
+        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, inputs) VALUES (?, ?, ?, ?)";
+        executeInternal(String.format(req, COMPACTION_LOG), compactionId, cfs.keyspace.getName(), cfs.name, Sets.newHashSet(generations));
         forceBlockingFlush(COMPACTION_LOG);
         return compactionId;
     }
@@ -203,8 +227,7 @@
     {
         assert taskId != null;
 
-        String req = "DELETE FROM system.%s WHERE id = %s";
-        processInternal(String.format(req, COMPACTION_LOG, taskId));
+        executeInternal(String.format("DELETE FROM system.%s WHERE id = ?", COMPACTION_LOG), taskId);
         forceBlockingFlush(COMPACTION_LOG);
     }
 
@@ -215,7 +238,7 @@
     public static Map<Pair<String, String>, Map<Integer, UUID>> getUnfinishedCompactions()
     {
         String req = "SELECT * FROM system.%s";
-        UntypedResultSet resultSet = processInternal(String.format(req, COMPACTION_LOG));
+        UntypedResultSet resultSet = executeInternal(String.format(req, COMPACTION_LOG));
 
         Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = new HashMap<>();
         for (UntypedResultSet.Row row : resultSet)
@@ -254,21 +277,20 @@
         // don't write anything when the history table itself is compacted, since that would in turn cause new compactions
         if (ksname.equals("system") && cfname.equals(COMPACTION_HISTORY_CF))
             return;
-        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) "
-                     + "VALUES (%s, '%s', '%s', %d, %d, %d, {%s})";
-        processInternal(String.format(req, COMPACTION_HISTORY_CF, UUIDGen.getTimeUUID().toString(), ksname, cfname, compactedAt, bytesIn, bytesOut, FBUtilities.toString(rowsMerged)));
+        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)";
+        executeInternal(String.format(req, COMPACTION_HISTORY_CF), UUIDGen.getTimeUUID(), ksname, cfname, ByteBufferUtil.bytes(compactedAt), bytesIn, bytesOut, rowsMerged);
     }
 
     public static TabularData getCompactionHistory() throws OpenDataException
     {
-        UntypedResultSet queryResultSet = processInternal("SELECT * from system.compaction_history");
+        UntypedResultSet queryResultSet = executeInternal(String.format("SELECT * from system.%s", COMPACTION_HISTORY_CF));
         return CompactionHistoryTabularData.from(queryResultSet);
     }
 
     public static synchronized void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
     {
-        String req = "UPDATE system.%s SET truncated_at = truncated_at + %s WHERE key = '%s'";
-        processInternal(String.format(req, LOCAL_CF, truncationAsMapEntry(cfs, truncatedAt, position), LOCAL_KEY));
+        String req = "UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), truncationAsMapEntry(cfs, truncatedAt, position));
         truncationRecords = null;
         forceBlockingFlush(LOCAL_CF);
     }
@@ -278,13 +300,13 @@
      */
     public static synchronized void removeTruncationRecord(UUID cfId)
     {
-        String req = "DELETE truncated_at[%s] from system.%s WHERE key = '%s'";
-        processInternal(String.format(req, cfId, LOCAL_CF, LOCAL_KEY));
+        String req = "DELETE truncated_at[?] from system.%s WHERE key = '%s'";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), cfId);
         truncationRecords = null;
         forceBlockingFlush(LOCAL_CF);
     }
 
-    private static String truncationAsMapEntry(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
+    private static Map<UUID, ByteBuffer> truncationAsMapEntry(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
     {
         DataOutputBuffer out = new DataOutputBuffer();
         try
@@ -296,9 +318,7 @@
         {
             throw new RuntimeException(e);
         }
-        return String.format("{%s: 0x%s}",
-                             cfs.metadata.cfId,
-                             ByteBufferUtil.bytesToHex(ByteBuffer.wrap(out.getData(), 0, out.getLength())));
+        return Collections.<UUID, ByteBuffer>singletonMap(cfs.metadata.cfId, ByteBuffer.wrap(out.getData(), 0, out.getLength()));
     }
 
     public static ReplayPosition getTruncatedPosition(UUID cfId)
@@ -322,9 +342,7 @@
 
     private static Map<UUID, Pair<ReplayPosition, Long>> readTruncationRecords()
     {
-        UntypedResultSet rows = processInternal(String.format("SELECT truncated_at FROM system.%s WHERE key = '%s'",
-                                                              LOCAL_CF,
-                                                              LOCAL_KEY));
+        UntypedResultSet rows = executeInternal(String.format("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL_CF, LOCAL_KEY));
 
         Map<UUID, Pair<ReplayPosition, Long>> records = new HashMap<>();
 
@@ -362,53 +380,46 @@
             return;
         }
 
-        String req = "INSERT INTO system.%s (peer, tokens) VALUES ('%s', %s)";
-        processInternal(String.format(req, PEERS_CF, ep.getHostAddress(), tokensAsSet(tokens)));
+        String req = "INSERT INTO system.%s (peer, tokens) VALUES (?, ?)";
+        executeInternal(String.format(req, PEERS_CF), ep, tokensAsSet(tokens));
     }
 
     public static synchronized void updatePreferredIP(InetAddress ep, InetAddress preferred_ip)
     {
-        String req = "INSERT INTO system.%s (peer, preferred_ip) VALUES ('%s', '%s')";
-        processInternal(String.format(req, PEERS_CF, ep.getHostAddress(), preferred_ip.getHostAddress()));
+        String req = "INSERT INTO system.%s (peer, preferred_ip) VALUES (?, ?)";
+        executeInternal(String.format(req, PEERS_CF), ep, preferred_ip);
         forceBlockingFlush(PEERS_CF);
     }
 
-    public static synchronized void updatePeerInfo(InetAddress ep, String columnName, String value)
+    public static synchronized void updatePeerInfo(InetAddress ep, String columnName, Object value)
     {
         if (ep.equals(FBUtilities.getBroadcastAddress()))
             return;
 
-        String req = "INSERT INTO system.%s (peer, %s) VALUES ('%s', %s)";
-        processInternal(String.format(req, PEERS_CF, columnName, ep.getHostAddress(), value));
+        String req = "INSERT INTO system.%s (peer, %s) VALUES (?, ?)";
+        executeInternal(String.format(req, PEERS_CF, columnName), ep, value);
     }
 
     public static synchronized void updateHintsDropped(InetAddress ep, UUID timePeriod, int value)
     {
         // with 30 day TTL
-        String req = "UPDATE system.%s USING TTL 2592000 SET hints_dropped[ %s ] = %s WHERE peer = '%s'";
-        processInternal(String.format(req, PEER_EVENTS_CF, timePeriod.toString(), value, ep.getHostAddress()));
+        String req = "UPDATE system.%s USING TTL 2592000 SET hints_dropped[ ? ] = ? WHERE peer = ?";
+        executeInternal(String.format(req, PEER_EVENTS_CF), timePeriod, value, ep);
     }
 
     public static synchronized void updateSchemaVersion(UUID version)
     {
-        String req = "INSERT INTO system.%s (key, schema_version) VALUES ('%s', %s)";
-        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, version.toString()));
+        String req = "INSERT INTO system.%s (key, schema_version) VALUES ('%s', ?)";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), version);
     }
 
-    private static String tokensAsSet(Collection<Token> tokens)
+    private static Set<String> tokensAsSet(Collection<Token> tokens)
     {
         Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
-        StringBuilder sb = new StringBuilder();
-        sb.append("{");
-        Iterator<Token> iter = tokens.iterator();
-        while (iter.hasNext())
-        {
-            sb.append("'").append(factory.toString(iter.next())).append("'");
-            if (iter.hasNext())
-                sb.append(",");
-        }
-        sb.append("}");
-        return sb.toString();
+        Set<String> s = new HashSet<>(tokens.size());
+        for (Token tk : tokens)
+            s.add(factory.toString(tk));
+        return s;
     }
 
     private static Collection<Token> deserializeTokens(Collection<String> tokensStrings)
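
Throughout SystemKeyspace the old processInternal(String.format(...)) calls, which spliced values and their quoting straight into the CQL text, are replaced by executeInternal/executeOnceInternal with ? bind markers. The snippet below is not a new method, just a restatement of the pattern already visible in updateTokens(InetAddress, Collection<Token>) above: only structural parts (the table name) go through String.format, every value travels as a bound parameter.

    // Sketch of the bind-marker pattern used above; values no longer need manual quoting or escaping.
    private static void updateTokensForPeerSketch(InetAddress ep, Collection<Token> tokens)
    {
        String req = "INSERT INTO system.%s (peer, tokens) VALUES (?, ?)";
        executeInternal(String.format(req, PEERS_CF), ep, tokensAsSet(tokens));
    }

The naming suggests executeInternal reuses a cached prepared statement while executeOnceInternal parses on every call (used here for one-off startup migrations); either way, the bind-marker form sidesteps the string-escaping bugs the old concatenation invited.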
@@ -425,8 +436,8 @@
      */
     public static synchronized void removeEndpoint(InetAddress ep)
     {
-        String req = "DELETE FROM system.%s WHERE peer = '%s'";
-        processInternal(String.format(req, PEERS_CF, ep.getHostAddress()));
+        String req = "DELETE FROM system.%s WHERE peer = ?";
+        executeInternal(String.format(req, PEERS_CF), ep);
     }
 
     /**
@@ -435,8 +446,8 @@
     public static synchronized void updateTokens(Collection<Token> tokens)
     {
         assert !tokens.isEmpty() : "removeEndpoint should be used instead";
-        String req = "INSERT INTO system.%s (key, tokens) VALUES ('%s', %s)";
-        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, tokensAsSet(tokens)));
+        String req = "INSERT INTO system.%s (key, tokens) VALUES ('%s', ?)";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), tokensAsSet(tokens));
         forceBlockingFlush(LOCAL_CF);
     }
 
@@ -469,7 +480,7 @@
     public static SetMultimap<InetAddress, Token> loadTokens()
     {
         SetMultimap<InetAddress, Token> tokenMap = HashMultimap.create();
-        for (UntypedResultSet.Row row : processInternal("SELECT peer, tokens FROM system." + PEERS_CF))
+        for (UntypedResultSet.Row row : executeInternal("SELECT peer, tokens FROM system." + PEERS_CF))
         {
             InetAddress peer = row.getInetAddress("peer");
             if (row.has("tokens"))
@@ -486,7 +497,7 @@
     public static Map<InetAddress, UUID> loadHostIds()
     {
         Map<InetAddress, UUID> hostIdMap = new HashMap<InetAddress, UUID>();
-        for (UntypedResultSet.Row row : processInternal("SELECT peer, host_id FROM system." + PEERS_CF))
+        for (UntypedResultSet.Row row : executeInternal("SELECT peer, host_id FROM system." + PEERS_CF))
         {
             InetAddress peer = row.getInetAddress("peer");
             if (row.has("host_id"))
@@ -499,8 +510,8 @@
 
     public static InetAddress getPreferredIP(InetAddress ep)
     {
-        String req = "SELECT preferred_ip FROM system.%s WHERE peer='%s'";
-        UntypedResultSet result = processInternal(String.format(req, PEERS_CF, ep.getHostAddress()));
+        String req = "SELECT preferred_ip FROM system.%s WHERE peer=?";
+        UntypedResultSet result = executeInternal(String.format(req, PEERS_CF), ep);
         if (!result.isEmpty() && result.one().has("preferred_ip"))
             return result.one().getInetAddress("preferred_ip");
         return null;
@@ -512,7 +523,7 @@
     public static Map<InetAddress, Map<String,String>> loadDcRackInfo()
     {
         Map<InetAddress, Map<String, String>> result = new HashMap<InetAddress, Map<String, String>>();
-        for (UntypedResultSet.Row row : processInternal("SELECT peer, data_center, rack from system." + PEERS_CF))
+        for (UntypedResultSet.Row row : executeInternal("SELECT peer, data_center, rack from system." + PEERS_CF))
         {
             InetAddress peer = row.getInetAddress("peer");
             if (row.has("data_center") && row.has("rack"))
@@ -550,7 +561,7 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(LOCAL_CF);
 
         String req = "SELECT cluster_name FROM system.%s WHERE key='%s'";
-        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
+        UntypedResultSet result = executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
 
         if (result.isEmpty() || !result.one().has("cluster_name"))
         {
@@ -559,8 +570,8 @@
                 throw new ConfigurationException("Found system keyspace files, but they couldn't be loaded!");
 
             // no system files.  this is a new node.
-            req = "INSERT INTO system.%s (key, cluster_name) VALUES ('%s', '%s')";
-            processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, DatabaseDescriptor.getClusterName()));
+            req = "INSERT INTO system.%s (key, cluster_name) VALUES ('%s', ?)";
+            executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), DatabaseDescriptor.getClusterName());
             return;
         }
 
@@ -572,7 +583,7 @@
     public static Collection<Token> getSavedTokens()
     {
         String req = "SELECT tokens FROM system.%s WHERE key='%s'";
-        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
+        UntypedResultSet result = executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
         return result.isEmpty() || !result.one().has("tokens")
              ? Collections.<Token>emptyList()
              : deserializeTokens(result.one().<String>getSet("tokens", UTF8Type.instance));
@@ -581,7 +592,7 @@
     public static int incrementAndGetGeneration()
     {
         String req = "SELECT gossip_generation FROM system.%s WHERE key='%s'";
-        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
+        UntypedResultSet result = executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
 
         int generation;
         if (result.isEmpty() || !result.one().has("gossip_generation"))
@@ -608,8 +619,8 @@
             }
         }
 
-        req = "INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', %d)";
-        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, generation));
+        req = "INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', ?)";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), generation);
         forceBlockingFlush(LOCAL_CF);
 
         return generation;
@@ -618,7 +629,7 @@
     public static BootstrapState getBootstrapState()
     {
         String req = "SELECT bootstrapped FROM system.%s WHERE key='%s'";
-        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
+        UntypedResultSet result = executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
 
         if (result.isEmpty() || !result.one().has("bootstrapped"))
             return BootstrapState.NEEDS_BOOTSTRAP;
@@ -638,8 +649,8 @@
 
     public static void setBootstrapState(BootstrapState state)
     {
-        String req = "INSERT INTO system.%s (key, bootstrapped) VALUES ('%s', '%s')";
-        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, state.name()));
+        String req = "INSERT INTO system.%s (key, bootstrapped) VALUES ('%s', ?)";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), state.name());
         forceBlockingFlush(LOCAL_CF);
     }
 
@@ -648,7 +659,7 @@
         ColumnFamilyStore cfs = Keyspace.open(Keyspace.SYSTEM_KS).getColumnFamilyStore(INDEX_CF);
         QueryFilter filter = QueryFilter.getNamesFilter(decorate(ByteBufferUtil.bytes(keyspaceName)),
                                                         INDEX_CF,
-                                                        FBUtilities.singleton(ByteBufferUtil.bytes(indexName), cfs.getComparator()),
+                                                        FBUtilities.singleton(cfs.getComparator().makeCellName(indexName), cfs.getComparator()),
                                                         System.currentTimeMillis());
         return ColumnFamilyStore.removeDeleted(cfs.getColumnFamily(filter), Integer.MAX_VALUE) != null;
     }
@@ -656,16 +667,15 @@
     public static void setIndexBuilt(String keyspaceName, String indexName)
     {
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Keyspace.SYSTEM_KS, INDEX_CF);
-        cf.addColumn(new Column(ByteBufferUtil.bytes(indexName), ByteBufferUtil.EMPTY_BYTE_BUFFER, FBUtilities.timestampMicros()));
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName), cf);
-        rm.apply();
+        cf.addColumn(new BufferCell(cf.getComparator().makeCellName(indexName), ByteBufferUtil.EMPTY_BYTE_BUFFER, FBUtilities.timestampMicros()));
+        new Mutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName), cf).apply();
     }
 
     public static void setIndexRemoved(String keyspaceName, String indexName)
     {
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName));
-        rm.delete(INDEX_CF, ByteBufferUtil.bytes(indexName), FBUtilities.timestampMicros());
-        rm.apply();
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName));
+        mutation.delete(INDEX_CF, CFMetaData.IndexCf.comparator.makeCellName(indexName), FBUtilities.timestampMicros());
+        mutation.apply();
     }
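The two methods above show the 2.1 write path this merge adopts: cells are addressed by CellName composites built from the table's comparator (makeCellName), cell data lives in BufferCell, and RowMutation has become Mutation. A condensed sketch of the single marker-cell write that setIndexBuilt performs, using the same classes that appear in the hunk:

    // 2.1-style single-cell write (same classes as the diff above).
    ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Keyspace.SYSTEM_KS, INDEX_CF);
    CellName name = cf.getComparator().makeCellName(indexName);            // composite cell name
    cf.addColumn(new BufferCell(name, ByteBufferUtil.EMPTY_BYTE_BUFFER,    // empty value: presence marker
                                FBUtilities.timestampMicros()));
    new Mutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName), cf).apply();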
 
     /**
@@ -674,19 +684,15 @@
      */
     public static UUID getLocalHostId()
     {
-        UUID hostId = null;
-
         String req = "SELECT host_id FROM system.%s WHERE key='%s'";
-        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
+        UntypedResultSet result = executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
 
         // Look up the Host UUID (return it if found)
         if (!result.isEmpty() && result.one().has("host_id"))
-        {
             return result.one().getUUID("host_id");
-        }
 
         // ID not found, generate a new one, persist, and then return it.
-        hostId = UUID.randomUUID();
+        UUID hostId = UUID.randomUUID();
         logger.warn("No host ID found, created {} (Note: This should happen exactly once per node).", hostId);
         return setLocalHostId(hostId);
     }
@@ -696,73 +702,12 @@
      */
     public static UUID setLocalHostId(UUID hostId)
     {
-        String req = "INSERT INTO system.%s (key, host_id) VALUES ('%s', %s)";
-        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, hostId));
+        String req = "INSERT INTO system.%s (key, host_id) VALUES ('%s', ?)";
+        executeInternal(String.format(req, LOCAL_CF, LOCAL_KEY), hostId);
         return hostId;
     }
 
     /**
-     * Read the current local node id from the system keyspace or null if no
-     * such node id is recorded.
-     */
-    public static CounterId getCurrentLocalCounterId()
-    {
-        Keyspace keyspace = Keyspace.open(Keyspace.SYSTEM_KS);
-
-        // Get the last CounterId (since CounterId are timeuuid is thus ordered from the older to the newer one)
-        QueryFilter filter = QueryFilter.getSliceFilter(decorate(ALL_LOCAL_NODE_ID_KEY),
-                                                        COUNTER_ID_CF,
-                                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                        true,
-                                                        1,
-                                                        System.currentTimeMillis());
-        ColumnFamily cf = keyspace.getColumnFamilyStore(COUNTER_ID_CF).getColumnFamily(filter);
-        if (cf != null && cf.getColumnCount() != 0)
-            return CounterId.wrap(cf.iterator().next().name());
-        else
-            return null;
-    }
-
-    /**
-     * Write a new current local node id to the system keyspace.
-     *
-     * @param newCounterId the new current local node id to record
-     * @param now microsecond time stamp.
-     */
-    public static void writeCurrentLocalCounterId(CounterId newCounterId, long now)
-    {
-        ByteBuffer ip = ByteBuffer.wrap(FBUtilities.getBroadcastAddress().getAddress());
-
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Keyspace.SYSTEM_KS, COUNTER_ID_CF);
-        cf.addColumn(new Column(newCounterId.bytes(), ip, now));
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ALL_LOCAL_NODE_ID_KEY, cf);
-        rm.apply();
-        forceBlockingFlush(COUNTER_ID_CF);
-    }
-
-    public static List<CounterId.CounterIdRecord> getOldLocalCounterIds()
-    {
-        List<CounterId.CounterIdRecord> l = new ArrayList<CounterId.CounterIdRecord>();
-
-        Keyspace keyspace = Keyspace.open(Keyspace.SYSTEM_KS);
-        QueryFilter filter = QueryFilter.getIdentityFilter(decorate(ALL_LOCAL_NODE_ID_KEY), COUNTER_ID_CF, System.currentTimeMillis());
-        ColumnFamily cf = keyspace.getColumnFamilyStore(COUNTER_ID_CF).getColumnFamily(filter);
-
-        CounterId previous = null;
-        for (Column c : cf)
-        {
-            if (previous != null)
-                l.add(new CounterId.CounterIdRecord(previous, c.timestamp()));
-
-            // this will ignore the last column on purpose since it is the
-            // current local node id
-            previous = CounterId.wrap(c.name());
-        }
-        return l;
-    }
-
-    /**
      * @param cfName The name of the ColumnFamily responsible for part of the schema (keyspace, ColumnFamily, columns)
      * @return CFS responsible to hold low-level serialized schema
      */
@@ -775,10 +720,8 @@
     {
         List<Row> schema = new ArrayList<>();
 
-        schema.addAll(serializedSchema(SCHEMA_KEYSPACES_CF));
-        schema.addAll(serializedSchema(SCHEMA_COLUMNFAMILIES_CF));
-        schema.addAll(serializedSchema(SCHEMA_COLUMNS_CF));
-        schema.addAll(serializedSchema(SCHEMA_TRIGGERS_CF));
+        for (String cf : allSchemaCfs)
+            schema.addAll(serializedSchema(cf));
 
         return schema;
     }
@@ -798,29 +741,27 @@
                                                      System.currentTimeMillis());
     }
 
-    public static Collection<RowMutation> serializeSchema()
+    public static Collection<Mutation> serializeSchema()
     {
-        Map<DecoratedKey, RowMutation> mutationMap = new HashMap<>();
+        Map<DecoratedKey, Mutation> mutationMap = new HashMap<>();
 
-        serializeSchema(mutationMap, SCHEMA_KEYSPACES_CF);
-        serializeSchema(mutationMap, SCHEMA_COLUMNFAMILIES_CF);
-        serializeSchema(mutationMap, SCHEMA_COLUMNS_CF);
-        serializeSchema(mutationMap, SCHEMA_TRIGGERS_CF);
+        for (String cf : allSchemaCfs)
+            serializeSchema(mutationMap, cf);
 
         return mutationMap.values();
     }
 
-    private static void serializeSchema(Map<DecoratedKey, RowMutation> mutationMap, String schemaCfName)
+    private static void serializeSchema(Map<DecoratedKey, Mutation> mutationMap, String schemaCfName)
     {
         for (Row schemaRow : serializedSchema(schemaCfName))
         {
             if (Schema.ignoredSchemaRow(schemaRow))
                 continue;
 
-            RowMutation mutation = mutationMap.get(schemaRow.key);
+            Mutation mutation = mutationMap.get(schemaRow.key);
             if (mutation == null)
             {
-                mutation = new RowMutation(Keyspace.SYSTEM_KS, schemaRow.key.key);
+                mutation = new Mutation(Keyspace.SYSTEM_KS, schemaRow.key.getKey());
                 mutationMap.put(schemaRow.key, mutation);
             }
 
@@ -828,12 +769,16 @@
         }
     }
 
-    public static Map<DecoratedKey, ColumnFamily> getSchema(String cfName)
+    public static Map<DecoratedKey, ColumnFamily> getSchema(String schemaCfName, Set<String> keyspaces)
     {
-        Map<DecoratedKey, ColumnFamily> schema = new HashMap<DecoratedKey, ColumnFamily>();
+        Map<DecoratedKey, ColumnFamily> schema = new HashMap<>();
 
-        for (Row schemaEntity : SystemKeyspace.serializedSchema(cfName))
-            schema.put(schemaEntity.key, schemaEntity.cf);
+        for (String keyspace : keyspaces)
+        {
+            Row schemaEntity = readSchemaRow(schemaCfName, keyspace);
+            if (schemaEntity.cf != null)
+                schema.put(schemaEntity.key, schemaEntity.cf);
+        }
 
         return schema;
     }
@@ -843,12 +788,19 @@
         return AsciiType.instance.fromString(ksName);
     }
 
-    public static Row readSchemaRow(String ksName)
+    /**
+     * Fetches a subset of schema (table data, columns metadata or triggers) for the keyspace.
+     *
+     * @param schemaCfName the schema table to get the data from (schema_keyspaces, schema_columnfamilies, schema_columns or schema_triggers)
+     * @param ksName the keyspace of the tables we are interested in
+     * @return a Row containing the schema data of a particular type for the keyspace
+     */
+    public static Row readSchemaRow(String schemaCfName, String ksName)
     {
         DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));
 
-        ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(SCHEMA_KEYSPACES_CF);
-        ColumnFamily result = schemaCFS.getColumnFamily(QueryFilter.getIdentityFilter(key, SCHEMA_KEYSPACES_CF, System.currentTimeMillis()));
+        ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(schemaCfName);
+        ColumnFamily result = schemaCFS.getColumnFamily(QueryFilter.getIdentityFilter(key, schemaCfName, System.currentTimeMillis()));
 
         return new Row(key, result);
     }
@@ -865,9 +817,10 @@
     {
         DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));
         ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(schemaCfName);
+        Composite prefix = schemaCFS.getComparator().make(cfName);
         ColumnFamily cf = schemaCFS.getColumnFamily(key,
-                                                    DefsTables.searchComposite(cfName, true),
-                                                    DefsTables.searchComposite(cfName, false),
+                                                    prefix,
+                                                    prefix.end(),
                                                     false,
                                                     Integer.MAX_VALUE,
                                                     System.currentTimeMillis());
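The hunk above replaces the hand-built DefsTables.searchComposite bounds with a Composite prefix taken from the schema table's own comparator: prefix is the lower bound and prefix.end() the matching upper bound, so the slice covers exactly the cells whose composite name starts with cfName. A short restatement of the idiom with the same names:

    // Prefix slice over composite cell names: every cell whose name begins with cfName
    // falls between 'prefix' and 'prefix.end()'.
    Composite prefix = schemaCFS.getComparator().make(cfName);
    ColumnFamily cf = schemaCFS.getColumnFamily(key,
                                                prefix,        // start of the cfName range
                                                prefix.end(),  // end of the cfName range
                                                false,         // forward order
                                                Integer.MAX_VALUE,
                                                System.currentTimeMillis());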
@@ -876,13 +829,13 @@
 
     public static PaxosState loadPaxosState(ByteBuffer key, CFMetaData metadata)
     {
-        String req = "SELECT * FROM system.%s WHERE row_key = 0x%s AND cf_id = %s";
-        UntypedResultSet results = processInternal(String.format(req, PAXOS_CF, ByteBufferUtil.bytesToHex(key), metadata.cfId));
+        String req = "SELECT * FROM system.%s WHERE row_key = ? AND cf_id = ?";
+        UntypedResultSet results = executeInternal(String.format(req, PAXOS_CF), key, metadata.cfId);
         if (results.isEmpty())
             return new PaxosState(key, metadata);
         UntypedResultSet.Row row = results.one();
         Commit promised = row.has("in_progress_ballot")
-                        ? new Commit(key, row.getUUID("in_progress_ballot"), EmptyColumns.factory.create(metadata))
+                        ? new Commit(key, row.getUUID("in_progress_ballot"), ArrayBackedSortedColumns.factory.create(metadata))
                         : Commit.emptyCommit(key, metadata);
         // either we have both a recently accepted ballot and update or we have neither
         Commit accepted = row.has("proposal")
@@ -897,26 +850,24 @@
 
     public static void savePaxosPromise(Commit promise)
     {
-        String req = "UPDATE %s USING TIMESTAMP %d AND TTL %d SET in_progress_ballot = %s WHERE row_key = 0x%s AND cf_id = %s";
-        processInternal(String.format(req,
-                                      PAXOS_CF,
-                                      UUIDGen.microsTimestamp(promise.ballot),
-                                      paxosTtl(promise.update.metadata),
-                                      promise.ballot,
-                                      ByteBufferUtil.bytesToHex(promise.key),
-                                      promise.update.id()));
+        String req = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET in_progress_ballot = ? WHERE row_key = ? AND cf_id = ?";
+        executeInternal(String.format(req, PAXOS_CF),
+                        UUIDGen.microsTimestamp(promise.ballot),
+                        paxosTtl(promise.update.metadata),
+                        promise.ballot,
+                        promise.key,
+                        promise.update.id());
     }
 
     public static void savePaxosProposal(Commit proposal)
     {
-        processInternal(String.format("UPDATE %s USING TIMESTAMP %d AND TTL %d SET proposal_ballot = %s, proposal = 0x%s WHERE row_key = 0x%s AND cf_id = %s",
-                                      PAXOS_CF,
-                                      UUIDGen.microsTimestamp(proposal.ballot),
-                                      paxosTtl(proposal.update.metadata),
-                                      proposal.ballot,
-                                      ByteBufferUtil.bytesToHex(proposal.update.toBytes()),
-                                      ByteBufferUtil.bytesToHex(proposal.key),
-                                      proposal.update.id()));
+        executeInternal(String.format("UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS_CF),
+                        UUIDGen.microsTimestamp(proposal.ballot),
+                        paxosTtl(proposal.update.metadata),
+                        proposal.ballot,
+                        proposal.update.toBytes(),
+                        proposal.key,
+                        proposal.update.id());
     }
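A side effect of the bind markers worth noting: binary and UUID values no longer take a detour through ByteBufferUtil.bytesToHex and 0x literals. The ByteBuffers for the serialized update and the partition key, and the UUIDs for the ballot and cf_id, are handed to executeInternal as-is, as in this condensed restatement of savePaxosProposal above:

    // Values bound directly to their CQL types; no hex round-trip for blobs.
    executeInternal(String.format("UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS_CF),
                    UUIDGen.microsTimestamp(proposal.ballot),  // long write timestamp
                    paxosTtl(proposal.update.metadata),        // int TTL in seconds
                    proposal.ballot,                           // timeuuid (java.util.UUID)
                    proposal.update.toBytes(),                 // blob (ByteBuffer)
                    proposal.key,                              // blob (ByteBuffer)
                    proposal.update.id());                     // uuid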
 
     private static int paxosTtl(CFMetaData metadata)
@@ -929,15 +880,14 @@
     {
         // We always erase the last proposal (with the commit timestamp, so as not to erase a more recent proposal in case the commit is old),
         // even though that's really just an optimization, since SP.beginAndRepairPaxos will exclude accepted proposals older than the mrc.
-        String cql = "UPDATE %s USING TIMESTAMP %d AND TTL %d SET proposal_ballot = null, proposal = null, most_recent_commit_at = %s, most_recent_commit = 0x%s WHERE row_key = 0x%s AND cf_id = %s";
-        processInternal(String.format(cql,
-                                      PAXOS_CF,
-                                      UUIDGen.microsTimestamp(commit.ballot),
-                                      paxosTtl(commit.update.metadata),
-                                      commit.ballot,
-                                      ByteBufferUtil.bytesToHex(commit.update.toBytes()),
-                                      ByteBufferUtil.bytesToHex(commit.key),
-                                      commit.update.id()));
+        String cql = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ? WHERE row_key = ? AND cf_id = ?";
+        executeInternal(String.format(cql, PAXOS_CF),
+                        UUIDGen.microsTimestamp(commit.ballot),
+                        paxosTtl(commit.update.metadata),
+                        commit.ballot,
+                        commit.update.toBytes(),
+                        commit.key,
+                        commit.update.id());
     }
 
     /**
@@ -949,12 +899,8 @@
      */
     public static RestorableMeter getSSTableReadMeter(String keyspace, String table, int generation)
     {
-        String cql = "SELECT * FROM %s WHERE keyspace_name='%s' and columnfamily_name='%s' and generation=%d";
-        UntypedResultSet results = processInternal(String.format(cql,
-                                                                 SSTABLE_ACTIVITY_CF,
-                                                                 keyspace,
-                                                                 table,
-                                                                 generation));
+        String cql = "SELECT * FROM system.%s WHERE keyspace_name=? and columnfamily_name=? and generation=?";
+        UntypedResultSet results = executeInternal(String.format(cql, SSTABLE_ACTIVITY_CF), keyspace, table, generation);
 
         if (results.isEmpty())
             return new RestorableMeter();
@@ -971,14 +917,13 @@
     public static void persistSSTableReadMeter(String keyspace, String table, int generation, RestorableMeter meter)
     {
         // Store values with a one-day TTL to handle corner cases where cleanup might not occur
-        String cql = "INSERT INTO %s (keyspace_name, columnfamily_name, generation, rate_15m, rate_120m) VALUES ('%s', '%s', %d, %f, %f) USING TTL 864000";
-        processInternal(String.format(cql,
-                                      SSTABLE_ACTIVITY_CF,
-                                      keyspace,
-                                      table,
-                                      generation,
-                                      meter.fifteenMinuteRate(),
-                                      meter.twoHourRate()));
+        String cql = "INSERT INTO system.%s (keyspace_name, columnfamily_name, generation, rate_15m, rate_120m) VALUES (?, ?, ?, ?, ?) USING TTL 864000";
+        executeInternal(String.format(cql, SSTABLE_ACTIVITY_CF),
+                        keyspace,
+                        table,
+                        generation,
+                        meter.fifteenMinuteRate(),
+                        meter.twoHourRate());
     }
 
     /**
@@ -986,7 +931,7 @@
      */
     public static void clearSSTableReadMeter(String keyspace, String table, int generation)
     {
-        String cql = "DELETE FROM %s WHERE keyspace_name='%s' AND columnfamily_name='%s' and generation=%d";
-        processInternal(String.format(cql, SSTABLE_ACTIVITY_CF, keyspace, table, generation));
+        String cql = "DELETE FROM system.%s WHERE keyspace_name=? AND columnfamily_name=? and generation=?";
+        executeInternal(String.format(cql, SSTABLE_ACTIVITY_CF), keyspace, table, generation);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/TreeMapBackedSortedColumns.java b/src/java/org/apache/cassandra/db/TreeMapBackedSortedColumns.java
deleted file mode 100644
index ae6e798..0000000
--- a/src/java/org/apache/cassandra/db/TreeMapBackedSortedColumns.java
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TreeMap;
-
-import com.google.common.base.Function;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.Allocator;
-
-public class TreeMapBackedSortedColumns extends AbstractThreadUnsafeSortedColumns
-{
-    private final TreeMap<ByteBuffer, Column> map;
-
-    public static final ColumnFamily.Factory<TreeMapBackedSortedColumns> factory = new Factory<TreeMapBackedSortedColumns>()
-    {
-        public TreeMapBackedSortedColumns create(CFMetaData metadata, boolean insertReversed)
-        {
-            assert !insertReversed;
-            return new TreeMapBackedSortedColumns(metadata);
-        }
-    };
-
-    public AbstractType<?> getComparator()
-    {
-        return (AbstractType<?>)map.comparator();
-    }
-
-    protected TreeMapBackedSortedColumns(CFMetaData metadata)
-    {
-        super(metadata);
-        this.map = new TreeMap<ByteBuffer, Column>(metadata.comparator);
-    }
-
-    private TreeMapBackedSortedColumns(CFMetaData metadata, SortedMap<ByteBuffer, Column> columns)
-    {
-        super(metadata);
-        this.map = new TreeMap<ByteBuffer, Column>(columns);
-    }
-
-    public ColumnFamily.Factory getFactory()
-    {
-        return factory;
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new TreeMapBackedSortedColumns(metadata, map);
-    }
-
-    public boolean isInsertReversed()
-    {
-        return false;
-    }
-
-    /*
-     * If we find an old column that has the same name
-     * the ask it to resolve itself else add the new column
-    */
-    public void addColumn(Column column, Allocator allocator)
-    {
-        ByteBuffer name = column.name();
-        // this is a slightly unusual way to structure this; a more natural way is shown in ThreadSafeSortedColumns,
-        // but TreeMap lacks putAbsent.  Rather than split it into a "get, then put" check, we do it as follows,
-        // which saves the extra "get" in the no-conflict case [for both normal and super columns],
-        // in exchange for a re-put in the SuperColumn case.
-        Column oldColumn = map.put(name, column);
-        if (oldColumn == null)
-            return;
-
-        // calculate reconciled col from old (existing) col and new col
-        map.put(name, column.reconcile(oldColumn, allocator));
-    }
-
-    /**
-     * We need to go through each column in the column container and resolve it before adding
-     */
-    public void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation)
-    {
-        delete(cm.deletionInfo());
-        for (Column column : cm)
-            addColumn(transformation.apply(column), allocator);
-    }
-
-    public boolean replace(Column oldColumn, Column newColumn)
-    {
-        if (!oldColumn.name().equals(newColumn.name()))
-            throw new IllegalArgumentException();
-
-        // We are not supposed to put the newColumn is either there was not
-        // column or the column was not equal to oldColumn (to be coherent
-        // with other implementation). We optimize for the common case where
-        // oldColumn do is present though.
-        Column previous = map.put(oldColumn.name(), newColumn);
-        if (previous == null)
-        {
-            map.remove(oldColumn.name());
-            return false;
-        }
-        if (!previous.equals(oldColumn))
-        {
-            map.put(oldColumn.name(), previous);
-            return false;
-        }
-        return true;
-    }
-
-    public Column getColumn(ByteBuffer name)
-    {
-        return map.get(name);
-    }
-
-    public void clear()
-    {
-        setDeletionInfo(DeletionInfo.live());
-        map.clear();
-    }
-
-    public int getColumnCount()
-    {
-        return map.size();
-    }
-
-    public Collection<Column> getSortedColumns()
-    {
-        return map.values();
-    }
-
-    public Collection<Column> getReverseSortedColumns()
-    {
-        return map.descendingMap().values();
-    }
-
-    public SortedSet<ByteBuffer> getColumnNames()
-    {
-        return map.navigableKeySet();
-    }
-
-    public Iterator<Column> iterator()
-    {
-        return map.values().iterator();
-    }
-
-    public Iterator<Column> iterator(ColumnSlice[] slices)
-    {
-        return new ColumnSlice.NavigableMapIterator(map, slices);
-    }
-
-    public Iterator<Column> reverseIterator(ColumnSlice[] slices)
-    {
-        return new ColumnSlice.NavigableMapIterator(map.descendingMap(), slices);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/TruncateResponse.java b/src/java/org/apache/cassandra/db/TruncateResponse.java
index eda9955..d8f5ad2 100644
--- a/src/java/org/apache/cassandra/db/TruncateResponse.java
+++ b/src/java/org/apache/cassandra/db/TruncateResponse.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 
@@ -51,7 +51,7 @@
 
     public static class TruncateResponseSerializer implements IVersionedSerializer<TruncateResponse>
     {
-        public void serialize(TruncateResponse tr, DataOutput out, int version) throws IOException
+        public void serialize(TruncateResponse tr, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(tr.keyspace);
             out.writeUTF(tr.columnFamily);
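This and the next two files make the same mechanical change: IVersionedSerializer implementations now write to org.apache.cassandra.io.util.DataOutputPlus instead of java.io.DataOutput, while reads still take DataInput (the retained import). A minimal, hypothetical serializer sketched against that shape, assuming the 2.1 interface as it appears in these hunks and the TypeSizes helper used elsewhere in the tree for sizing:

    // Illustration only; not part of this patch.
    public class NameSerializer implements IVersionedSerializer<String>
    {
        public void serialize(String name, DataOutputPlus out, int version) throws IOException
        {
            out.writeUTF(name);
        }

        public String deserialize(DataInput in, int version) throws IOException
        {
            return in.readUTF();
        }

        public long serializedSize(String name, int version)
        {
            return TypeSizes.NATIVE.sizeof(name);   // UTF length prefix + encoded bytes
        }
    }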
diff --git a/src/java/org/apache/cassandra/db/Truncation.java b/src/java/org/apache/cassandra/db/Truncation.java
index dc219d6..88742cd 100644
--- a/src/java/org/apache/cassandra/db/Truncation.java
+++ b/src/java/org/apache/cassandra/db/Truncation.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 
@@ -54,7 +54,7 @@
 
 class TruncationSerializer implements IVersionedSerializer<Truncation>
 {
-    public void serialize(Truncation t, DataOutput out, int version) throws IOException
+    public void serialize(Truncation t, DataOutputPlus out, int version) throws IOException
     {
         out.writeUTF(t.keyspace);
         out.writeUTF(t.columnFamily);
diff --git a/src/java/org/apache/cassandra/db/UnsortedColumns.java b/src/java/org/apache/cassandra/db/UnsortedColumns.java
deleted file mode 100644
index 2b33cd0..0000000
--- a/src/java/org/apache/cassandra/db/UnsortedColumns.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.Allocator;
-
-/**
- * A ColumnFamily that allows inserting in any order, even unsorted.
- *
- * Operations that require sorting (getSortedColumns) or that cannot be efficient without it
- * (replace, getColumn, etc.) are not supported.
- */
-public class UnsortedColumns extends AbstractThreadUnsafeSortedColumns
-{
-    private final ArrayList<Column> columns;
-
-    public static final Factory<UnsortedColumns> factory = new Factory<UnsortedColumns>()
-    {
-        public UnsortedColumns create(CFMetaData metadata, boolean insertReversed)
-        {
-            assert !insertReversed;
-            return new UnsortedColumns(metadata);
-        }
-    };
-
-    private UnsortedColumns(CFMetaData metadata)
-    {
-        this(metadata, new ArrayList<Column>());
-    }
-
-    private UnsortedColumns(CFMetaData metadata, ArrayList<Column> columns)
-    {
-        super(metadata);
-        this.columns = columns;
-    }
-
-    public Factory getFactory()
-    {
-        return factory;
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new UnsortedColumns(metadata, new ArrayList<Column>(columns));
-    }
-
-    public boolean isInsertReversed()
-    {
-        return false;
-    }
-
-    public void clear()
-    {
-        columns.clear();
-    }
-
-    public void addColumn(Column column, Allocator allocator)
-    {
-        columns.add(column);
-    }
-
-    public void addAll(ColumnFamily cm, Allocator allocator, Function<Column, Column> transformation)
-    {
-        delete(cm.deletionInfo());
-        for (Column column : cm)
-            addColumn(column);
-    }
-
-    public Iterator<Column> iterator()
-    {
-        return columns.iterator();
-    }
-
-    public boolean replace(Column oldColumn, Column newColumn)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Column getColumn(ByteBuffer name)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Iterable<ByteBuffer> getColumnNames()
-    {
-        return Iterables.transform(columns, new Function<Column, ByteBuffer>()
-        {
-            public ByteBuffer apply(Column column)
-            {
-                return column.name;
-            }
-        });
-    }
-
-    public Collection<Column> getSortedColumns()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Collection<Column> getReverseSortedColumns()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public int getColumnCount()
-    {
-        return columns.size();
-    }
-
-    public Iterator<Column> iterator(ColumnSlice[] slices)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Iterator<Column> reverseIterator(ColumnSlice[] slices)
-    {
-        throw new UnsupportedOperationException();
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/WriteResponse.java b/src/java/org/apache/cassandra/db/WriteResponse.java
index 83b579a..a7b108b 100644
--- a/src/java/org/apache/cassandra/db/WriteResponse.java
+++ b/src/java/org/apache/cassandra/db/WriteResponse.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 
@@ -39,7 +39,7 @@
 
     public static class WriteResponseSerializer implements IVersionedSerializer<WriteResponse>
     {
-        public void serialize(WriteResponse wm, DataOutput out, int version) throws IOException
+        public void serialize(WriteResponse wm, DataOutputPlus out, int version) throws IOException
         {
         }
 
diff --git a/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java b/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java
index 828f7e5..7185eef 100644
--- a/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java
+++ b/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java
@@ -17,8 +17,8 @@
  */
 package org.apache.cassandra.db.columniterator;
 
+import org.apache.cassandra.db.composites.Composites;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class IdentityQueryFilter extends SliceQueryFilter
 {
@@ -27,7 +27,7 @@
      */
     public IdentityQueryFilter()
     {
-        super(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, Integer.MAX_VALUE);
+        super(Composites.EMPTY, Composites.EMPTY, false, Integer.MAX_VALUE);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java b/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
index b6aa085..7012321 100644
--- a/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
+++ b/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
@@ -18,17 +18,16 @@
 package org.apache.cassandra.db.columniterator;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.ArrayDeque;
 import java.util.Deque;
-import java.util.Iterator;
 import java.util.List;
 
 import com.google.common.collect.AbstractIterator;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
@@ -54,7 +53,7 @@
     private final ColumnSlice[] slices;
     private final BlockFetcher fetcher;
     private final Deque<OnDiskAtom> blockColumns = new ArrayDeque<OnDiskAtom>();
-    private final AbstractType<?> comparator;
+    private final CellNameType comparator;
 
     // Holds range tombstone in reverse queries. See addColumn()
     private final Deque<OnDiskAtom> rangeTombstonesReversed;
@@ -78,7 +77,7 @@
         try
         {
             this.indexes = indexEntry.columnsIndex();
-            emptyColumnFamily = EmptyColumns.factory.create(sstable.metadata);
+            emptyColumnFamily = ArrayBackedSortedColumns.factory.create(sstable.metadata);
             if (indexes.isEmpty())
             {
                 setToRowStart(indexEntry, input);
@@ -113,8 +112,6 @@
             in.seek(rowEntry.position);
         }
         sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
-        if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-            file.readLong();
     }
 
     public ColumnFamily getColumnFamily()
@@ -179,34 +176,6 @@
         }
     }
 
-    static int indexFor(SSTableReader sstable, ByteBuffer name, List<IndexHelper.IndexInfo> indexes, AbstractType<?> comparator, boolean reversed, int startIdx)
-    {
-        // If it's a super CF and the sstable is from the old format, then the index will contain old format info, i.e. non composite
-        // SC names. So we need to 1) use only the SC name part of the comparator and 2) extract only that part from 'name'
-        if (sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns)
-        {
-            AbstractType<?> scComparator = SuperColumns.getComparatorFor(sstable.metadata, false);
-            ByteBuffer scName = SuperColumns.scName(name);
-            return IndexHelper.indexFor(scName, indexes, scComparator, reversed, startIdx);
-        }
-        return IndexHelper.indexFor(name, indexes, comparator, reversed, startIdx);
-    }
-
-    static ByteBuffer forIndexComparison(SSTableReader sstable, ByteBuffer name)
-    {
-        // See indexFor above.
-        return sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns
-             ? SuperColumns.scName(name)
-             : name;
-    }
-
-    static AbstractType<?> comparatorForIndex(SSTableReader sstable, AbstractType<?> comparator)
-    {
-        return sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns
-             ? SuperColumns.getComparatorFor(sstable.metadata, false)
-             : comparator;
-    }
-
     private abstract class BlockFetcher
     {
         protected int currentSliceIdx;
@@ -219,7 +188,7 @@
         /*
          * Return the smallest key selected by the current ColumnSlice.
          */
-        protected ByteBuffer currentStart()
+        protected Composite currentStart()
         {
             return reversed ? slices[currentSliceIdx].finish : slices[currentSliceIdx].start;
         }
@@ -227,7 +196,7 @@
         /*
          * Return the biggest key selected by the current ColumnSlice.
          */
-        protected ByteBuffer currentFinish()
+        protected Composite currentFinish()
         {
             return reversed ? slices[currentSliceIdx].start : slices[currentSliceIdx].finish;
         }
@@ -241,28 +210,22 @@
             return isBeforeSliceStart(column.name());
         }
 
-        protected boolean isBeforeSliceStart(ByteBuffer name)
+        protected boolean isBeforeSliceStart(Composite name)
         {
-            ByteBuffer start = currentStart();
-            return start.remaining() != 0 && comparator.compare(name, start) < 0;
-        }
-
-        protected boolean isIndexEntryBeforeSliceStart(ByteBuffer name)
-        {
-            ByteBuffer start = currentStart();
-            return start.remaining() != 0 && comparatorForIndex(sstable, comparator).compare(name, forIndexComparison(sstable, start)) < 0;
+            Composite start = currentStart();
+            return !start.isEmpty() && comparator.compare(name, start) < 0;
         }
 
         protected boolean isColumnBeforeSliceFinish(OnDiskAtom column)
         {
-            ByteBuffer finish = currentFinish();
-            return finish.remaining() == 0 || comparator.compare(column.name(), finish) <= 0;
+            Composite finish = currentFinish();
+            return finish.isEmpty() || comparator.compare(column.name(), finish) <= 0;
         }
 
-        protected boolean isIndexEntryAfterSliceFinish(ByteBuffer name)
+        protected boolean isAfterSliceFinish(Composite name)
         {
-            ByteBuffer finish = currentFinish();
-            return finish.remaining() != 0 && comparatorForIndex(sstable, comparator).compare(name, forIndexComparison(sstable, finish)) > 0;
+            Composite finish = currentFinish();
+            return !finish.isEmpty() && comparator.compare(name, finish) > 0;
         }
     }
 
@@ -293,7 +256,7 @@
         {
             while (++currentSliceIdx < slices.length)
             {
-                nextIndexIdx = indexFor(sstable, slices[currentSliceIdx].start, indexes, comparator, reversed, nextIndexIdx);
+                nextIndexIdx = IndexHelper.indexFor(slices[currentSliceIdx].start, indexes, comparator, reversed, nextIndexIdx);
                 if (nextIndexIdx < 0 || nextIndexIdx >= indexes.size())
                     // no index block for that slice
                     continue;
@@ -302,12 +265,12 @@
                 IndexInfo info = indexes.get(nextIndexIdx);
                 if (reversed)
                 {
-                    if (!isIndexEntryBeforeSliceStart(info.lastName))
+                    if (!isBeforeSliceStart(info.lastName))
                         return true;
                 }
                 else
                 {
-                    if (!isIndexEntryAfterSliceFinish(info.firstName))
+                    if (!isAfterSliceFinish(info.firstName))
                         return true;
                 }
             }
@@ -395,9 +358,8 @@
             if (file == null)
                 file = originalInput == null ? sstable.getFileDataInput(positionToSeek) : originalInput;
 
-            // Give a bogus atom count since we'll deserialize as long as we're
-            // within the index block but we don't know how much atom is there
-            Iterator<OnDiskAtom> atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, Integer.MAX_VALUE, sstable.descriptor.version);
+            AtomDeserializer deserializer = emptyColumnFamily.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
+
             file.seek(positionToSeek);
             FileMark mark = file.mark();
 
@@ -405,63 +367,65 @@
             boolean inSlice = false;
 
             // scan from index start
-            OnDiskAtom column = null;
-            while (file.bytesPastMark(mark) < currentIndex.width || column != null)
+            while (file.bytesPastMark(mark) < currentIndex.width || deserializer.hasUnprocessed())
             {
-                // Only fetch a new column if we haven't dealt with the previous one.
-                if (column == null)
-                    column = atomIterator.next();
-
                 // col is before slice
                 // (If in slice, don't bother checking that until we change slice)
-                if (!inSlice && isColumnBeforeSliceStart(column))
+                Composite start = currentStart();
+                if (!inSlice && !start.isEmpty() && deserializer.compareNextTo(start) < 0)
                 {
                     if (reversed)
                     {
                         // the next slice selects columns that are before the current one, so it may
                         // match this column; keep it around.
-                        prefetched.addFirst(column);
+                        prefetched.addFirst(deserializer.readNext());
                     }
-                    column = null;
+                    else
+                    {
+                        deserializer.skipNext();
+                    }
                 }
                 // col is within slice
-                else if (isColumnBeforeSliceFinish(column))
-                {
-                    inSlice = true;
-                    addColumn(column);
-                    column = null;
-                }
-                // col is after slice.
                 else
                 {
-                    // When reading forward, if we hit a column that sorts after the current slice, it means we're done with this slice.
-                    // For reversed, this may either mean that we're done with the current slice, or that we need to read the previous
-                    // index block. However, we can be sure that we are in the first case though (the current slice is done) if the first
-                    // columns of the block were not part of the current slice, i.e. if we have columns in prefetched.
-                    if (reversed && prefetched.isEmpty())
-                        break;
-
-                    if (!setNextSlice())
-                        break;
-
-                    inSlice = false;
-
-                    // The next index block now corresponds to the first block that may have columns for the newly set slice.
-                    // So if it's different from the current block, we're done with this block. And in that case, we know
-                    // that our prefetched columns won't match.
-                    if (nextIndexIdx != lastDeserializedBlock)
+                    Composite finish = currentFinish();
+                    if (finish.isEmpty() || deserializer.compareNextTo(finish) <= 0)
                     {
-                        if (reversed)
-                            prefetched.clear();
-                        break;
+                        inSlice = true;
+                        addColumn(deserializer.readNext());
                     }
+                    // col is after slice.
+                    else
+                    {
+                        // When reading forward, if we hit a column that sorts after the current slice, it means we're done with this slice.
+                        // For reversed, this may either mean that we're done with the current slice, or that we need to read the previous
+                        // index block. However, we can be sure we are in the first case (the current slice is done) if the first
+                        // columns of the block were not part of the current slice, i.e. if we have columns in prefetched.
+                        if (reversed && prefetched.isEmpty())
+                            break;
 
-                    // Even if the next slice may have column in this blocks, if we're reversed, those columns have been
-                    // prefetched and we're done with that block
-                    if (reversed)
-                        break;
+                        if (!setNextSlice())
+                            break;
 
-                    // otherwise, we will deal with that column at the next iteration
+                        inSlice = false;
+
+                        // The next index block now corresponds to the first block that may have columns for the newly set slice.
+                        // So if it's different from the current block, we're done with this block. And in that case, we know
+                        // that our prefetched columns won't match.
+                        if (nextIndexIdx != lastDeserializedBlock)
+                        {
+                            if (reversed)
+                                prefetched.clear();
+                            break;
+                        }
+
+                        // Even if the next slice may have columns in this block, if we're reversed, those columns have been
+                        // prefetched and we're done with that block
+                        if (reversed)
+                            break;
+
+                        // otherwise, we will deal with that column at the next iteration
+                    }
                 }
             }
             return true;
@@ -479,29 +443,24 @@
             // We remember when we are within a slice to avoid some comparisons
             boolean inSlice = false;
 
-            int columnCount = sstable.descriptor.version.hasRowSizeAndColumnCount ? file.readInt() : Integer.MAX_VALUE;
-            Iterator<OnDiskAtom> atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, columnCount, sstable.descriptor.version);
-            OnDiskAtom column = null;
-            while (atomIterator.hasNext() || column != null)
+            AtomDeserializer deserializer = emptyColumnFamily.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
+            while (deserializer.hasNext())
             {
-                // Only fetch a new column if we haven't dealt with the previous one.
-                if (column == null)
-                    column = atomIterator.next();
-
                 // col is before slice
                 // (If in slice, don't bother checking that until we change slice)
-                if (!inSlice && isColumnBeforeSliceStart(column))
+                Composite start = currentStart();
+                if (!inSlice && !start.isEmpty() && deserializer.compareNextTo(start) < 0)
                 {
-                    column = null;
+                    deserializer.skipNext();
                     continue;
                 }
 
                 // col is within slice
-                if (isColumnBeforeSliceFinish(column))
+                Composite finish = currentFinish();
+                if (finish.isEmpty() || deserializer.compareNextTo(finish) <= 0)
                 {
                     inSlice = true;
-                    addColumn(column);
-                    column = null;
+                    addColumn(deserializer.readNext());
                 }
                 // col is after slice. more slices?
                 else
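The rewrite above replaces the Iterator<OnDiskAtom> that had to be fed a bogus atom count with an AtomDeserializer that can compare the next on-disk name against the slice bounds before deciding to decode it. Reduced to the forward (non-reversed) case and reusing the methods the diff relies on (hasNext/hasUnprocessed, compareNextTo, readNext, skipNext), the loop is roughly:

    // Condensed sketch of the forward scan above; 'start' and 'finish' come from the current slice.
    while (deserializer.hasNext())
    {
        if (!start.isEmpty() && deserializer.compareNextTo(start) < 0)
        {
            deserializer.skipNext();                // before the slice: skip without materializing
            continue;
        }
        if (finish.isEmpty() || deserializer.compareNextTo(finish) <= 0)
            addColumn(deserializer.readNext());     // inside the slice: fully deserialize and keep
        else
            break;                                  // past the slice: the real code advances to the next slice here
    }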
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
index 2e84d8d..224b63f 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
@@ -18,17 +18,14 @@
 package org.apache.cassandra.db.columniterator;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.SortedSet;
+import java.util.*;
 
 import com.google.common.collect.AbstractIterator;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.SSTableReader;
@@ -43,10 +40,10 @@
     private final SSTableReader sstable;
     private FileDataInput fileToClose;
     private Iterator<OnDiskAtom> iter;
-    public final SortedSet<ByteBuffer> columns;
+    public final SortedSet<CellName> columns;
     public final DecoratedKey key;
 
-    public SSTableNamesIterator(SSTableReader sstable, DecoratedKey key, SortedSet<ByteBuffer> columns)
+    public SSTableNamesIterator(SSTableReader sstable, DecoratedKey key, SortedSet<CellName> columns)
     {
         assert columns != null;
         this.sstable = sstable;
@@ -73,7 +70,7 @@
         }
     }
 
-    public SSTableNamesIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, SortedSet<ByteBuffer> columns, RowIndexEntry indexEntry)
+    public SSTableNamesIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, SortedSet<CellName> columns, RowIndexEntry indexEntry)
     {
         assert columns != null;
         this.sstable = sstable;
@@ -112,8 +109,6 @@
 
             DecoratedKey keyInDisk = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
             assert keyInDisk.equals(key) : String.format("%s != %s in %s", keyInDisk, key, file.getPath());
-            if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-                file.readLong();
         }
 
         indexList = indexEntry.columnsIndex();
@@ -140,8 +135,7 @@
         List<OnDiskAtom> result = new ArrayList<OnDiskAtom>();
         if (indexList.isEmpty())
         {
-            int columnCount = sstable.descriptor.version.hasRowSizeAndColumnCount ? file.readInt() : Integer.MAX_VALUE;
-            readSimpleColumns(file, columns, result, columnCount);
+            readSimpleColumns(file, columns, result);
         }
         else
         {
@@ -152,14 +146,14 @@
         iter = result.iterator();
     }
 
-    private void readSimpleColumns(FileDataInput file, SortedSet<ByteBuffer> columnNames, List<OnDiskAtom> result, int columnCount)
+    private void readSimpleColumns(FileDataInput file, SortedSet<CellName> columnNames, List<OnDiskAtom> result)
     {
-        Iterator<OnDiskAtom> atomIterator = cf.metadata().getOnDiskIterator(file, columnCount, sstable.descriptor.version);
+        Iterator<OnDiskAtom> atomIterator = cf.metadata().getOnDiskIterator(file, sstable.descriptor.version);
         int n = 0;
         while (atomIterator.hasNext())
         {
             OnDiskAtom column = atomIterator.next();
-            if (column instanceof Column)
+            if (column instanceof Cell)
             {
                 if (columnNames.contains(column.name()))
                 {
@@ -177,25 +171,24 @@
 
     private void readIndexedColumns(CFMetaData metadata,
                                     FileDataInput file,
-                                    SortedSet<ByteBuffer> columnNames,
+                                    SortedSet<CellName> columnNames,
                                     List<IndexHelper.IndexInfo> indexList,
                                     long basePosition,
                                     List<OnDiskAtom> result)
     throws IOException
     {
         /* get the various column ranges we have to read */
-        AbstractType<?> comparator = metadata.comparator;
+        CellNameType comparator = metadata.comparator;
         List<IndexHelper.IndexInfo> ranges = new ArrayList<IndexHelper.IndexInfo>();
         int lastIndexIdx = -1;
-        for (ByteBuffer name : columns)
+        for (CellName name : columnNames)
         {
-            int index = IndexedSliceReader.indexFor(sstable, name, indexList, comparator, false, lastIndexIdx);
+            int index = IndexHelper.indexFor(name, indexList, comparator, false, lastIndexIdx);
             if (index < 0 || index == indexList.size())
                 continue;
             IndexHelper.IndexInfo indexInfo = indexList.get(index);
             // Check the index block does contain the column names and that we haven't inserted this block yet.
-            if (IndexedSliceReader.comparatorForIndex(sstable, comparator).compare(IndexedSliceReader.forIndexComparison(sstable, name), indexInfo.firstName) < 0
-              || index == lastIndexIdx)
+            if (comparator.compare(name, indexInfo.firstName) < 0 || index == lastIndexIdx)
                 continue;
 
             ranges.add(indexInfo);
@@ -205,6 +198,8 @@
         if (ranges.isEmpty())
             return;
 
+        Iterator<CellName> toFetch = columnNames.iterator();
+        CellName nextToFetch = toFetch.next();
         for (IndexHelper.IndexInfo indexInfo : ranges)
         {
             long positionToSeek = basePosition + indexInfo.offset;
@@ -213,17 +208,22 @@
             if (file == null)
                 file = createFileDataInput(positionToSeek);
 
-            // We'll read as much atom as there is in the index block, so provide a bogus atom count
-            Iterator<OnDiskAtom> atomIterator = cf.metadata().getOnDiskIterator(file, Integer.MAX_VALUE, sstable.descriptor.version);
+            AtomDeserializer deserializer = cf.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
             file.seek(positionToSeek);
             FileMark mark = file.mark();
-            // TODO only completely deserialize columns we are interested in
-            while (file.bytesPastMark(mark) < indexInfo.width)
+            while (file.bytesPastMark(mark) < indexInfo.width && nextToFetch != null)
             {
-                OnDiskAtom column = atomIterator.next();
-                // we check vs the original Set, not the filtered List, for efficiency
-                if (!(column instanceof Column) || columnNames.contains(column.name()))
-                    result.add(column);
+                int cmp = deserializer.compareNextTo(nextToFetch);
+                if (cmp == 0)
+                {
+                    nextToFetch = toFetch.hasNext() ? toFetch.next() : null;
+                    result.add(deserializer.readNext());
+                    continue;
+                }
+
+                deserializer.skipNext();
+                if (cmp > 0)
+                    nextToFetch = toFetch.hasNext() ? toFetch.next() : null;
             }
         }
     }
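Note: the rewritten loop above is essentially a merge of two sorted streams -- the requested cell names and the atoms in the index block -- skipping (rather than deserializing) any atom that is not wanted. A simplified, self-contained sketch of that merge, using Guava's PeekingIterator as a hypothetical stand-in for AtomDeserializer and ignoring range tombstones and the index-block width bound:

    import java.util.*;
    import com.google.common.collect.PeekingIterator;

    // Generic two-pointer merge: keep only on-disk items whose name is in 'wanted'.
    static <T> List<T> fetchWanted(Iterator<T> wanted, PeekingIterator<T> onDisk, Comparator<T> cmp)
    {
        List<T> result = new ArrayList<>();
        T next = wanted.hasNext() ? wanted.next() : null;
        while (next != null && onDisk.hasNext())
        {
            int c = cmp.compare(onDisk.peek(), next);
            if (c == 0)
            {
                result.add(onDisk.next());                           // match: deserialize and keep
                next = wanted.hasNext() ? wanted.next() : null;
            }
            else if (c < 0)
            {
                onDisk.next();                                       // atom sorts before the wanted name: skip it
            }
            else
            {
                next = wanted.hasNext() ? wanted.next() : null;      // wanted name is absent from this block
            }
        }
        return result;
    }
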
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableSliceIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableSliceIterator.java
index 4faa651..0057d52 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableSliceIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableSliceIterator.java
@@ -28,7 +28,7 @@
 import org.apache.cassandra.io.util.FileDataInput;
 
 /**
- *  A Column Iterator over SSTable
+ *  A Cell Iterator over SSTable
  */
 public class SSTableSliceIterator implements OnDiskAtomIterator
 {
@@ -62,7 +62,7 @@
 
     private static OnDiskAtomIterator createReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput file, ColumnSlice[] slices, boolean reversed)
     {
-        return slices.length == 1 && slices[0].start.remaining() == 0 && !reversed
+        return slices.length == 1 && slices[0].start.isEmpty() && !reversed
              ? new SimpleSliceReader(sstable, indexEntry, file, slices[0].finish)
              : new IndexedSliceReader(sstable, indexEntry, file, slices, reversed);
     }
diff --git a/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java b/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
index 48d20db..702bddc 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.db.columniterator;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.Iterator;
 
 import com.google.common.collect.AbstractIterator;
@@ -26,7 +25,8 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
@@ -40,12 +40,12 @@
 
     private final FileDataInput file;
     private final boolean needsClosing;
-    private final ByteBuffer finishColumn;
-    private final AbstractType<?> comparator;
+    private final Composite finishColumn;
+    private final CellNameType comparator;
     private final ColumnFamily emptyColumnFamily;
     private final Iterator<OnDiskAtom> atomIterator;
 
-    public SimpleSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, ByteBuffer finishColumn)
+    public SimpleSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, Composite finishColumn)
     {
         Tracing.trace("Seeking to partition beginning in data file");
         this.finishColumn = finishColumn;
@@ -64,17 +64,12 @@
                 this.needsClosing = false;
             }
 
-            Descriptor.Version version = sstable.descriptor.version;
-
             // Skip key and data size
             ByteBufferUtil.skipShortLength(file);
-            if (version.hasRowSizeAndColumnCount)
-                file.readLong();
 
-            emptyColumnFamily = EmptyColumns.factory.create(sstable.metadata);
+            emptyColumnFamily = ArrayBackedSortedColumns.factory.create(sstable.metadata);
             emptyColumnFamily.delete(DeletionTime.serializer.deserialize(file));
-            int columnCount = version.hasRowSizeAndColumnCount ? file.readInt() : Integer.MAX_VALUE;
-            atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, columnCount, sstable.descriptor.version);
+            atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, sstable.descriptor.version);
         }
         catch (IOException e)
         {
@@ -89,7 +84,7 @@
             return endOfData();
 
         OnDiskAtom column = atomIterator.next();
-        if (finishColumn.remaining() > 0 && comparator.compare(column.name(), finishColumn) > 0)
+        if (!finishColumn.isEmpty() && comparator.compare(column.name(), finishColumn) > 0)
             return endOfData();
 
         return column;
diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogExecutorService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogExecutorService.java
deleted file mode 100644
index ec43114..0000000
--- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogExecutorService.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.commitlog;
-
-import java.util.List;
-import java.util.concurrent.AbstractExecutorService;
-import java.util.concurrent.TimeUnit;
-
-public abstract class AbstractCommitLogExecutorService extends AbstractExecutorService implements ICommitLogExecutorService
-{
-    protected volatile long completedTaskCount = 0;
-
-    /**
-     * Get the number of completed tasks
-     */
-    public long getCompletedTasks()
-    {
-        return completedTaskCount;
-    }
-
-    public boolean isTerminated()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean isShutdown()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public List<Runnable> shutdownNow()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException
-    {
-        throw new UnsupportedOperationException();
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
new file mode 100644
index 0000000..59bf691
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.commitlog;
+
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+import org.slf4j.*;
+
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
+
+public abstract class AbstractCommitLogService
+{
+    // how often should we log syncs that lag behind our desired period
+    private static final long LAG_REPORT_INTERVAL = TimeUnit.MINUTES.toMillis(5);
+
+    private final Thread thread;
+    private volatile boolean shutdown = false;
+
+    // all Allocations written before this time will be synced
+    protected volatile long lastSyncedAt = System.currentTimeMillis();
+
+    // counts of total written, and pending, log messages
+    private final AtomicLong written = new AtomicLong(0);
+    protected final AtomicLong pending = new AtomicLong(0);
+
+    // signal that writers can wait on to be notified of a completed sync
+    protected final WaitQueue syncComplete = new WaitQueue();
+    private final Semaphore haveWork = new Semaphore(1);
+
+    private static final Logger logger = LoggerFactory.getLogger(AbstractCommitLogService.class);
+
+    /**
+     * CommitLogService provides a fsync service for Allocations, fulfilling either the
+     * Batch or Periodic contract.
+     *
+     * Subclasses may be notified when a sync finishes by using the syncComplete WaitQueue.
+     */
+    AbstractCommitLogService(final CommitLog commitLog, final String name, final long pollIntervalMillis)
+    {
+        if (pollIntervalMillis < 1)
+            throw new IllegalArgumentException(String.format("Commit log flush interval must be positive: %dms", pollIntervalMillis));
+
+        Runnable runnable = new Runnable()
+        {
+            public void run()
+            {
+                long firstLagAt = 0;
+                long totalSyncDuration = 0; // total time spent syncing since firstLagAt
+                long syncExceededIntervalBy = 0; // time that syncs exceeded pollInterval since firstLagAt
+                int lagCount = 0;
+                int syncCount = 0;
+
+                boolean run = true;
+                while (run)
+                {
+                    try
+                    {
+                        // always run once after shutdown signalled
+                        run = !shutdown;
+
+                        // sync and signal
+                        long syncStarted = System.currentTimeMillis();
+                        commitLog.sync(shutdown);
+                        lastSyncedAt = syncStarted;
+                        syncComplete.signalAll();
+
+
+                        // sleep any time we have left before the next one is due
+                        long now = System.currentTimeMillis();
+                        long sleep = syncStarted + pollIntervalMillis - now;
+                        if (sleep < 0)
+                        {
+                            // if we have lagged noticeably, update our lag counter
+                            if (firstLagAt == 0)
+                            {
+                                firstLagAt = now;
+                                totalSyncDuration = syncExceededIntervalBy = syncCount = lagCount = 0;
+                            }
+                            syncExceededIntervalBy -= sleep;
+                            lagCount++;
+                        }
+                        syncCount++;
+                        totalSyncDuration += now - syncStarted;
+
+                        if (firstLagAt > 0 && now - firstLagAt >= LAG_REPORT_INTERVAL)
+                        {
+                            logger.warn(String.format("Out of %d commit log syncs over the past %ds with average duration of %.2fms, %d have exceeded the configured commit interval by an average of %.2fms",
+                                                      syncCount, (now - firstLagAt) / 1000, (double) totalSyncDuration / syncCount, lagCount, (double) syncExceededIntervalBy / lagCount));
+                            firstLagAt = 0;
+                        }
+
+                        // if we have lagged this round, we probably have work to do already so we don't sleep
+                        if (sleep < 0 || !run)
+                            continue;
+
+                        try
+                        {
+                            haveWork.tryAcquire(sleep, TimeUnit.MILLISECONDS);
+                        }
+                        catch (InterruptedException e)
+                        {
+                            throw new AssertionError();
+                        }
+                    }
+                    catch (Throwable t)
+                    {
+                        if (!CommitLog.handleCommitError("Failed to persist commits to disk", t))
+                            break;
+
+                        // sleep for full poll-interval after an error, so we don't spam the log file
+                        try
+                        {
+                            haveWork.tryAcquire(pollIntervalMillis, TimeUnit.MILLISECONDS);
+                        }
+                        catch (InterruptedException e)
+                        {
+                            throw new AssertionError();
+                        }
+                    }
+                }
+            }
+        };
+
+        thread = new Thread(runnable, name);
+        thread.start();
+    }
+
+    /**
+     * Block for @param alloc to be sync'd as necessary, and handle bookkeeping
+     */
+    public void finishWriteFor(Allocation alloc)
+    {
+        maybeWaitForSync(alloc);
+        written.incrementAndGet();
+    }
+
+    protected abstract void maybeWaitForSync(Allocation alloc);
+
+    /**
+     * Sync immediately, but don't block for the sync to complete
+     */
+    public WaitQueue.Signal requestExtraSync()
+    {
+        WaitQueue.Signal signal = syncComplete.register();
+        haveWork.release(1);
+        return signal;
+    }
+
+    public void shutdown()
+    {
+        shutdown = true;
+        haveWork.release(1);
+    }
+
+    public void awaitTermination() throws InterruptedException
+    {
+        thread.join();
+    }
+
+    public long getCompletedTasks()
+    {
+        return written.get();
+    }
+
+    public long getPendingTasks()
+    {
+        return pending.get();
+    }
+}
\ No newline at end of file
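The new AbstractCommitLogService replaces the executor-based commit log services with a single background sync thread plus a WaitQueue that writers can block on. A minimal sketch of that writer/sync-thread handshake, using plain java.util.concurrent primitives as stand-ins for WaitQueue and CommitLogSegment.Allocation (neither of which is shown here):

    import java.util.concurrent.locks.Condition;
    import java.util.concurrent.locks.ReentrantLock;

    // Toy model only: a batch-style writer blocks until a sync that started
    // after its write has completed; the sync thread signals all waiters.
    class ToySyncService
    {
        private final ReentrantLock lock = new ReentrantLock();
        private final Condition synced = lock.newCondition();
        private long lastSyncedAt = System.currentTimeMillis();

        // called by the background sync thread after each fsync
        void onSyncCompleted()
        {
            lock.lock();
            try { lastSyncedAt = System.currentTimeMillis(); synced.signalAll(); }
            finally { lock.unlock(); }
        }

        // called by a writer that must not return before its data is on disk
        void awaitSyncSince(long writtenAt) throws InterruptedException
        {
            lock.lock();
            try
            {
                while (lastSyncedAt < writtenAt)
                    synced.await();
            }
            finally { lock.unlock(); }
        }
    }
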
diff --git a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogExecutorService.java b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogExecutorService.java
deleted file mode 100644
index 9c2e2ac..0000000
--- a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogExecutorService.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.commitlog;
-
-import java.util.ArrayList;
-import java.util.concurrent.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.io.FSError;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.WrappedRunnable;
-
-class BatchCommitLogExecutorService extends AbstractCommitLogExecutorService
-{
-
-    private final BlockingQueue<CheaterFutureTask> queue;
-    private final Thread appendingThread;
-    private volatile boolean run = true;
-
-    public BatchCommitLogExecutorService()
-    {
-        this(DatabaseDescriptor.getConcurrentWriters());
-    }
-
-    public BatchCommitLogExecutorService(int queueSize)
-    {
-        queue = new LinkedBlockingQueue<CheaterFutureTask>(queueSize);
-        Runnable runnable = new WrappedRunnable()
-        {
-            public void runMayThrow() throws Exception
-            {
-                while (run)
-                {
-                    try
-                    {
-                        if (processWithSyncBatch())
-                            completedTaskCount++;
-                    }
-                    catch (Throwable t)
-                    {
-                        if (!CommitLog.handleCommitError("Failed to persist commits to disk", t))
-                            return;
-                    }
-                }
-            }
-        };
-        appendingThread = new Thread(runnable, "COMMIT-LOG-WRITER");
-        appendingThread.start();
-
-    }
-
-    public long getPendingTasks()
-    {
-        return queue.size();
-    }
-
-    private final ArrayList<CheaterFutureTask> incompleteTasks = new ArrayList<CheaterFutureTask>();
-    private final ArrayList taskValues = new ArrayList(); // TODO not sure how to generify this
-    private boolean processWithSyncBatch() throws Exception
-    {
-        CheaterFutureTask firstTask = queue.poll(100, TimeUnit.MILLISECONDS);
-        if (firstTask == null)
-            return false;
-        if (!(firstTask.getRawCallable() instanceof CommitLog.LogRecordAdder))
-        {
-            firstTask.run();
-            return true;
-        }
-
-        // attempt to do a bunch of LogRecordAdder ops before syncing
-        // (this is a little clunky since there is no blocking peek method,
-        //  so we have to break it into firstTask / extra tasks)
-        incompleteTasks.clear();
-        taskValues.clear();
-        long start = System.nanoTime();
-        long window = (long)(1000000 * DatabaseDescriptor.getCommitLogSyncBatchWindow());
-
-        // it doesn't seem worth bothering future-izing the exception
-        // since if a commitlog op throws, we're probably screwed anyway
-        incompleteTasks.add(firstTask);
-        taskValues.add(firstTask.getRawCallable().call());
-        while (!queue.isEmpty()
-               && queue.peek().getRawCallable() instanceof CommitLog.LogRecordAdder
-               && System.nanoTime() - start < window)
-        {
-            CheaterFutureTask task = queue.remove();
-            incompleteTasks.add(task);
-            taskValues.add(task.getRawCallable().call());
-        }
-
-        // now sync and set the tasks' values (which allows thread calling get() to proceed)
-        CommitLog.instance.sync();
-        for (int i = 0; i < incompleteTasks.size(); i++)
-        {
-            incompleteTasks.get(i).set(taskValues.get(i));
-        }
-        return true;
-    }
-
-
-    @Override
-    protected <T> RunnableFuture<T> newTaskFor(Runnable runnable, T value)
-    {
-        return newTaskFor(Executors.callable(runnable, value));
-    }
-
-    @Override
-    protected <T> RunnableFuture<T> newTaskFor(Callable<T> callable)
-    {
-        return new CheaterFutureTask(callable);
-    }
-
-    public void execute(Runnable command)
-    {
-        try
-        {
-            queue.put((CheaterFutureTask)command);
-        }
-        catch (InterruptedException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public void add(CommitLog.LogRecordAdder adder)
-    {
-        FBUtilities.waitOnFuture(submit((Callable)adder));
-    }
-
-    public void shutdown()
-    {
-        new Thread(new WrappedRunnable()
-        {
-            public void runMayThrow() throws InterruptedException
-            {
-                while (!queue.isEmpty())
-                    Thread.sleep(100);
-                run = false;
-                appendingThread.join();
-            }
-        }, "Commitlog Shutdown").start();
-    }
-
-    public void awaitTermination() throws InterruptedException
-    {
-        appendingThread.join();
-    }
-
-    private static class CheaterFutureTask<V> extends FutureTask<V>
-    {
-        private final Callable rawCallable;
-
-        public CheaterFutureTask(Callable<V> callable)
-        {
-            super(callable);
-            rawCallable = callable;
-        }
-
-        public Callable getRawCallable()
-        {
-            return rawCallable;
-        }
-
-        @Override
-        public void set(V v)
-        {
-            super.set(v);
-        }
-    }
-
-}
diff --git a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
new file mode 100644
index 0000000..65bee40
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.commitlog;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+class BatchCommitLogService extends AbstractCommitLogService
+{
+    public BatchCommitLogService(CommitLog commitLog)
+    {
+        super(commitLog, "COMMIT-LOG-WRITER", (int) DatabaseDescriptor.getCommitLogSyncBatchWindow());
+    }
+
+    protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
+    {
+        // wait until record has been safely persisted to disk
+        pending.incrementAndGet();
+        alloc.awaitDiskSync();
+        pending.decrementAndGet();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
index 0e2f5bf..d38c4ed 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
@@ -19,28 +19,26 @@
 
 import java.io.*;
 import java.lang.management.ManagementFactory;
+import java.nio.ByteBuffer;
 import java.util.*;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
+
 import javax.management.MBeanServer;
 import javax.management.ObjectName;
 
-import com.google.common.util.concurrent.Uninterruptibles;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.io.FSError;
 import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.DataOutputByteBuffer;
 import org.apache.cassandra.metrics.CommitLogMetrics;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.PureJavaCrc32;
+
+import static org.apache.cassandra.db.commitlog.CommitLogSegment.*;
 
 /*
  * Commit Log tracks every write operation into the system. The aim of the commit log is to be able to
@@ -52,29 +50,24 @@
 
     public static final CommitLog instance = new CommitLog();
 
-    private final ICommitLogExecutorService executor;
+    // we only permit records HALF the size of a commit log segment, to ensure we don't spin allocating many mostly
+    // empty segments when writing large records
+    private static final long MAX_MUTATION_SIZE = DatabaseDescriptor.getCommitLogSegmentSize() >> 1;
 
-    public final CommitLogAllocator allocator;
-
+    public final CommitLogSegmentManager allocator;
     public final CommitLogArchiver archiver = new CommitLogArchiver();
-
-    public static final int END_OF_SEGMENT_MARKER = 0;          // this is written out at the end of a segment
-    public static final int END_OF_SEGMENT_MARKER_SIZE = 4;     // number of bytes of ^^^
-
-    public volatile CommitLogSegment activeSegment;
-
-    private final CommitLogMetrics metrics;
+    final CommitLogMetrics metrics;
+    final AbstractCommitLogService executor;
 
     private CommitLog()
     {
         DatabaseDescriptor.createAllDirectories();
 
-        allocator = new CommitLogAllocator();
-        activateNextSegment();
+        allocator = new CommitLogSegmentManager();
 
         executor = DatabaseDescriptor.getCommitLogSync() == Config.CommitLogSync.batch
-                 ? new BatchCommitLogExecutorService()
-                 : new PeriodicCommitLogExecutorService(this);
+                 ? new BatchCommitLogService(this)
+                 : new PeriodicCommitLogService(this);
 
         MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
         try
@@ -91,15 +84,6 @@
     }
 
     /**
-     * FOR TESTING PURPOSES. See CommitLogAllocator.
-     */
-    public void resetUnsafe()
-    {
-        allocator.resetUnsafe();
-        activateNextSegment();
-    }
-
-    /**
      * Perform recovery on commit logs located in the directory specified by the config file.
      *
      * @return the number of mutations replayed
@@ -135,10 +119,10 @@
         }
         else
         {
-            Arrays.sort(files, new CommitLogSegment.CommitLogSegmentFileComparator());
-            logger.info("Replaying " + StringUtils.join(files, ", "));
+            Arrays.sort(files, new CommitLogSegmentFileComparator());
+            logger.info("Replaying {}", StringUtils.join(files, ", "));
             replayed = recover(files);
-            logger.info("Log replay complete, " + replayed + " replayed mutations");
+            logger.info("Log replay complete, {} replayed mutations", replayed);
 
             for (File f : files)
                 CommitLog.instance.allocator.recycleSegment(f);
@@ -171,38 +155,98 @@
 
     /**
      * @return a Future representing a ReplayPosition such that when it is ready,
-     * all commitlog tasks enqueued prior to the getContext call will be complete (i.e., appended to the log)
+     * all Allocations created prior to the getContext call will be written to the log
      */
-    public Future<ReplayPosition> getContext()
+    public ReplayPosition getContext()
     {
-        Callable<ReplayPosition> task = new Callable<ReplayPosition>()
+        return allocator.allocatingFrom().getContext();
+    }
+
+    /**
+     * Flushes all dirty CFs, waiting for them to free and recycle any segments they were retaining
+     */
+    public void forceRecycleAllSegments(Iterable<UUID> droppedCfs)
+    {
+        allocator.forceRecycleAll(droppedCfs);
+    }
+
+    /**
+     * Flushes all dirty CFs, waiting for them to free and recycle any segments they were retaining
+     */
+    public void forceRecycleAllSegments()
+    {
+        allocator.forceRecycleAll(Collections.<UUID>emptyList());
+    }
+
+    /**
+     * Forces a disk flush on the commit log files that need it.  Blocking.
+     */
+    public void sync(boolean syncAllSegments)
+    {
+        CommitLogSegment current = allocator.allocatingFrom();
+        for (CommitLogSegment segment : allocator.getActiveSegments())
         {
-            public ReplayPosition call()
-            {
-                return activeSegment.getContext();
-            }
-        };
-        return executor.submit(task);
+            if (!syncAllSegments && segment.id > current.id)
+                return;
+            segment.sync();
+        }
     }
 
     /**
-     * Used by tests.
-     *
-     * @return the number of active segments (segments with unflushed data in them)
+     * Preempts the CLExecutor, telling it to sync immediately
      */
-    public int activeSegments()
+    public void requestExtraSync()
     {
-        return allocator.getActiveSegments().size();
+        executor.requestExtraSync();
     }
 
     /**
-     * Add a RowMutation to the commit log.
+     * Add a Mutation to the commit log.
      *
-     * @param rm the RowMutation to add to the log
+     * @param mutation the Mutation to add to the log
      */
-    public void add(RowMutation rm)
+    public ReplayPosition add(Mutation mutation)
     {
-        executor.add(new LogRecordAdder(rm));
+        assert mutation != null;
+
+        long size = Mutation.serializer.serializedSize(mutation, MessagingService.current_version);
+
+        long totalSize = size + ENTRY_OVERHEAD_SIZE;
+        if (totalSize > MAX_MUTATION_SIZE)
+        {
+            throw new IllegalArgumentException(String.format("Mutation of %s bytes is too large for the maximum size of %s",
+                                                             totalSize, MAX_MUTATION_SIZE));
+        }
+
+        Allocation alloc = allocator.allocate(mutation, (int) totalSize);
+        try
+        {
+            PureJavaCrc32 checksum = new PureJavaCrc32();
+            final ByteBuffer buffer = alloc.getBuffer();
+            DataOutputByteBuffer dos = new DataOutputByteBuffer(buffer);
+
+            // checksummed length
+            dos.writeInt((int) size);
+            checksum.update(buffer, buffer.position() - 4, 4);
+            buffer.putInt(checksum.getCrc());
+
+            int start = buffer.position();
+            // checksummed mutation
+            Mutation.serializer.serialize(mutation, dos, MessagingService.current_version);
+            checksum.update(buffer, start, (int) size);
+            buffer.putInt(checksum.getCrc());
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, alloc.getSegment().getPath());
+        }
+        finally
+        {
+            alloc.markWritten();
+        }
+
+        executor.finishWriteFor(alloc);
+        return alloc.getReplayPosition();
     }
 
     /**
@@ -214,104 +258,58 @@
      */
     public void discardCompletedSegments(final UUID cfId, final ReplayPosition context)
     {
-        Callable task = new Callable()
+        logger.debug("discard completed log segments for {}, column family {}", context, cfId);
+
+        // Go thru the active segment files, which are ordered oldest to newest, marking the
+        // flushed CF as clean, until we reach the segment file containing the ReplayPosition passed
+        // in the arguments. Any segments that become unused after they are marked clean will be
+        // recycled or discarded.
+        for (Iterator<CommitLogSegment> iter = allocator.getActiveSegments().iterator(); iter.hasNext();)
         {
-            public Object call()
+            CommitLogSegment segment = iter.next();
+            segment.markClean(cfId, context);
+
+            if (segment.isUnused())
             {
-                logger.debug("discard completed log segments for {}, column family {}", context, cfId);
-
-                // Go thru the active segment files, which are ordered oldest to newest, marking the
-                // flushed CF as clean, until we reach the segment file containing the ReplayPosition passed
-                // in the arguments. Any segments that become unused after they are marked clean will be
-                // recycled or discarded.
-                for (Iterator<CommitLogSegment> iter = allocator.getActiveSegments().iterator(); iter.hasNext();)
-                {
-                    CommitLogSegment segment = iter.next();
-                    segment.markClean(cfId, context);
-
-                    // If the segment is no longer needed, and we have another spare segment in the hopper
-                    // (to keep the last segment from getting discarded), pursue either recycling or deleting
-                    // this segment file.
-                    if (iter.hasNext())
-                    {
-                        if (segment.isUnused())
-                        {
-                            logger.debug("Commit log segment {} is unused", segment);
-                            allocator.recycleSegment(segment);
-                        }
-                        else
-                        {
-                            logger.debug("Not safe to delete commit log segment {}; dirty is {}",
-                                         segment, segment.dirtyString());
-                        }
-                    }
-                    else
-                    {
-                        logger.debug("Not deleting active commitlog segment {}", segment);
-                    }
-
-                    // Don't mark or try to delete any newer segments once we've reached the one containing the
-                    // position of the flush.
-                    if (segment.contains(context))
-                        break;
-                }
-
-                return null;
+                logger.debug("Commit log segment {} is unused", segment);
+                allocator.recycleSegment(segment);
             }
-        };
+            else
+            {
+                logger.debug("Not safe to delete{} commit log segment {}; dirty is {}",
+                        (iter.hasNext() ? "" : " active"), segment, segment.dirtyString());
+            }
 
-        FBUtilities.waitOnFuture(executor.submit(task));
-    }
-
-    /**
-     * Forces a disk flush on the commit log files that need it.
-     */
-    public void sync()
-    {
-        for (CommitLogSegment segment : allocator.getActiveSegments())
-        {
-            segment.sync();
+            // Don't mark or try to delete any newer segments once we've reached the one containing the
+            // position of the flush.
+            if (segment.contains(context))
+                break;
         }
     }
 
-    /**
-     * @return the number of tasks completed by the commit log executor
-     */
+    @Override
     public long getCompletedTasks()
     {
         return metrics.completedTasks.value();
     }
 
-    /**
-     * @return the depth of pending commit log executor queue
-     */
+    @Override
     public long getPendingTasks()
     {
         return metrics.pendingTasks.value();
     }
 
     /**
-     * @return the total size occupied by commitlo segments expressed in bytes. (used by MBean)
+     * @return the total size occupied by commitlog segments expressed in bytes. (used by MBean)
      */
     public long getTotalCommitlogSize()
     {
         return metrics.totalCommitLogSize.value();
     }
 
-    /**
-     * Fetches a new segment file from the allocator and activates it.
-     *
-     * @return the newly activated segment
-     */
-    private void activateNextSegment()
-    {
-        activeSegment = allocator.fetchSegment();
-        logger.debug("Active segment is now {}", activeSegment);
-    }
-
     public List<String> getActiveSegmentNames()
     {
-        List<String> segmentNames = new ArrayList<String>();
+        List<String> segmentNames = new ArrayList<>();
         for (CommitLogSegment segment : allocator.getActiveSegments())
             segmentNames.add(segment.getName());
         return segmentNames;
@@ -319,7 +317,7 @@
 
     public List<String> getArchivingSegmentNames()
     {
-        return new ArrayList<String>(archiver.archivePending.keySet());
+        return new ArrayList<>(archiver.archivePending.keySet());
     }
 
     /**
@@ -333,49 +331,22 @@
         allocator.awaitTermination();
     }
 
-    // TODO this should be a Runnable since it doesn't actually return anything, but it's difficult to do that
-    // without breaking the fragile CheaterFutureTask in BatchCLES.
-    class LogRecordAdder implements Callable, Runnable
+    /**
+     * FOR TESTING PURPOSES. See CommitLogAllocator.
+     */
+    public void resetUnsafe()
     {
-        final RowMutation rowMutation;
+        allocator.resetUnsafe();
+    }
 
-        LogRecordAdder(RowMutation rm)
-        {
-            this.rowMutation = rm;
-        }
-
-        public void run()
-        {
-            long totalSize = RowMutation.serializer.serializedSize(rowMutation, MessagingService.current_version) + CommitLogSegment.ENTRY_OVERHEAD_SIZE;
-            if (totalSize > DatabaseDescriptor.getCommitLogSegmentSize())
-            {
-                logger.warn("Skipping commitlog append of extremely large mutation ({} bytes)", totalSize);
-                return;
-            }
-
-            if (!activeSegment.hasCapacityFor(totalSize))
-            {
-                CommitLogSegment oldSegment = activeSegment;
-                activateNextSegment();
-                // Now we can run the user defined command just before switching to the new commit log.
-                // (Do this here instead of in the recycle call so we can get a head start on the archive.)
-                archiver.maybeArchive(oldSegment.getPath(), oldSegment.getName());
-            }
-            try
-            {
-                activeSegment.write(rowMutation);
-            }
-            catch (IOException e)
-            {
-                throw new FSWriteError(e, activeSegment.getPath());
-            }
-        }
-
-        public Object call()
-        {
-            run();
-            return null;
-        }
+    /**
+     * Used by tests.
+     *
+     * @return the number of active segments (segments with unflushed data in them)
+     */
+    public int activeSegments()
+    {
+        return allocator.getActiveSegments().size();
     }
 
     static boolean handleCommitError(String message, Throwable t)
@@ -389,7 +360,6 @@
                 return false;
             case ignore:
                 logger.error(message, t);
-                Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
                 return true;
             default:
                 throw new AssertionError(DatabaseDescriptor.getCommitFailurePolicy());
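With the executor gone, add() now serializes the mutation directly into a buffer handed out by the segment manager (rejecting mutations larger than half a segment up front), framing each entry as a checksummed length followed by the checksummed mutation body; the second CRC continues over the same checksum object, so it covers the length bytes as well. A rough sketch of that framing, assuming a heap ByteBuffer, pre-serialized mutation bytes, and java.util.zip.CRC32 in place of PureJavaCrc32:

    import java.nio.ByteBuffer;
    import java.util.zip.CRC32;

    // Entry layout: [int length][int crc][mutation bytes][int crc]
    static void writeEntry(ByteBuffer buffer, byte[] serializedMutation)
    {
        CRC32 checksum = new CRC32();

        int lenPos = buffer.position();
        buffer.putInt(serializedMutation.length);                              // checksummed length
        checksum.update(buffer.array(), buffer.arrayOffset() + lenPos, 4);
        buffer.putInt((int) checksum.getValue());

        int start = buffer.position();
        buffer.put(serializedMutation);                                        // checksummed mutation (CRC continues over it)
        checksum.update(buffer.array(), buffer.arrayOffset() + start, serializedMutation.length);
        buffer.putInt((int) checksum.getValue());
    }
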
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogAllocator.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogAllocator.java
deleted file mode 100644
index 7ab062b..0000000
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogAllocator.java
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.commitlog;
-
-import java.io.File;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.UUID;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.atomic.AtomicLong;
-
-import com.google.common.collect.Iterables;
-import com.google.common.util.concurrent.Uninterruptibles;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.WrappedRunnable;
-
-/**
- * Performs the pre-allocation of commit log segments in a background thread. All the
- * public methods are thread safe.
- */
-public class CommitLogAllocator
-{
-    static final Logger logger = LoggerFactory.getLogger(CommitLogAllocator.class);
-
-    /** The (theoretical) max milliseconds between loop runs to perform janitorial tasks */
-    public final static int TICK_CYCLE_TIME = 100;
-
-    /** Segments that are ready to be used */
-    private final BlockingQueue<CommitLogSegment> availableSegments = new LinkedBlockingQueue<CommitLogSegment>();
-
-    /** Allocations to be run by the thread */
-    private final BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
-
-    /** Active segments, containing unflushed data */
-    private final ConcurrentLinkedQueue<CommitLogSegment> activeSegments = new ConcurrentLinkedQueue<CommitLogSegment>();
-
-    /**
-     * Tracks commitlog size, in multiples of the segment size.  We need to do this so we can "promise" size
-     * adjustments ahead of actually adding/freeing segments on disk, so that the "evict oldest segment" logic
-     * can see the effect of recycling segments immediately (even though they're really happening asynchronously
-     * on the allocator thread, which will take a ms or two).
-     */
-    private final AtomicLong size = new AtomicLong();
-
-    /**
-     * New segment creation is initially disabled because we'll typically get some "free" segments
-     * recycled after log replay.
-     */
-    private volatile boolean createReserveSegments = false;
-
-    private final Thread allocationThread;
-    private volatile boolean run = true;
-
-    public CommitLogAllocator()
-    {
-        // The run loop for the allocation thread
-        Runnable runnable = new WrappedRunnable()
-        {
-            public void runMayThrow() throws Exception
-            {
-                while (run)
-                {
-                    try
-                    {
-
-                        Runnable r = queue.poll(TICK_CYCLE_TIME, TimeUnit.MILLISECONDS);
-
-                        if (r != null)
-                        {
-                            r.run();
-                        }
-                        else
-                        {
-                            // no job, so we're clear to check to see if we're out of segments
-                            // and ready a new one if needed. has the effect of ensuring there's
-                            // almost always a segment available when it's needed.
-                            if (availableSegments.isEmpty() && (activeSegments.isEmpty() || createReserveSegments))
-                            {
-                                logger.debug("No segments in reserve; creating a fresh one");
-                                createFreshSegment();
-                            }
-                        }
-
-                    }
-                    catch (Throwable t)
-                    {
-                        if (!CommitLog.handleCommitError("Failed to allocate new commit log segments", t))
-                            return;
-                    }
-                }
-            }
-        };
-
-        allocationThread = new Thread(runnable, "COMMIT-LOG-ALLOCATOR");
-        allocationThread.start();
-    }
-
-    /**
-     * Fetches an empty segment file.
-     *
-     * @return the next writable segment
-     */
-    public CommitLogSegment fetchSegment()
-    {
-        CommitLogSegment next;
-        try
-        {
-            next = availableSegments.take();
-        }
-        catch (InterruptedException e)
-        {
-            throw new AssertionError(e);
-        }
-
-        assert !activeSegments.contains(next);
-        activeSegments.add(next);
-        if (isCapExceeded())
-            flushOldestKeyspaces();
-
-        return next;
-    }
-
-    /**
-     * Indicates that a segment is no longer in use and that it should be recycled.
-     *
-     * @param segment segment that is no longer in use
-     */
-    public void recycleSegment(final CommitLogSegment segment)
-    {
-        activeSegments.remove(segment);
-        if (!CommitLog.instance.archiver.maybeWaitForArchiving(segment.getName()))
-        {
-            // if archiving (command) was not successful then leave the file alone. don't delete or recycle.
-            discardSegment(segment, false);
-            return;
-        }
-        if (isCapExceeded())
-        {
-            discardSegment(segment, true);
-            return;
-        }
-
-        logger.debug("Recycling {}", segment);
-        queue.add(new Runnable()
-        {
-            public void run()
-            {
-                CommitLogSegment recycled = segment.recycle();
-                internalAddReadySegment(recycled);
-            }
-        });
-    }
-
-    /**
-     * Differs from the above because it can work on any file instead of just existing
-     * commit log segments managed by this allocator.
-     *
-     * @param file segment file that is no longer in use.
-     */
-    public void recycleSegment(final File file)
-    {
-        // check against SEGMENT_SIZE avoids recycling odd-sized or empty segments from old C* versions and unit tests
-        if (isCapExceeded() || file.length() != DatabaseDescriptor.getCommitLogSegmentSize()
-                || CommitLogDescriptor.fromFileName(file.getName()).getMessagingVersion() != MessagingService.current_version)
-        {
-            // (don't decrease managed size, since this was never a "live" segment)
-            logger.debug("(Unopened) segment {} is no longer needed and will be deleted now", file);
-            FileUtils.deleteWithConfirm(file);
-            return;
-        }
-
-        logger.debug("Recycling {}", file);
-        // this wasn't previously a live segment, so add it to the managed size when we make it live
-        size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize());
-        queue.add(new Runnable()
-        {
-            public void run()
-            {
-                CommitLogSegment segment = new CommitLogSegment(file.getPath());
-                internalAddReadySegment(segment);
-            }
-        });
-    }
-
-    /**
-     * Indicates that a segment file should be deleted.
-     *
-     * @param segment segment to be discarded
-     */
-    private void discardSegment(final CommitLogSegment segment, final boolean deleteFile)
-    {
-        logger.debug("Segment {} is no longer active and will be deleted {}", segment, deleteFile ? "now" : "by the archive script");
-        size.addAndGet(-DatabaseDescriptor.getCommitLogSegmentSize());
-
-        queue.add(new Runnable()
-        {
-            public void run()
-            {
-                segment.discard(deleteFile);
-            }
-        });
-    }
-
-    /**
-     * @return the space (in bytes) used by all segment files.
-     */
-    public long bytesUsed()
-    {
-        return size.get();
-    }
-
-    /**
-     * @param name the filename to check
-     * @return true if file is managed by this allocator.
-     */
-    public boolean manages(String name)
-    {
-        for (CommitLogSegment segment : Iterables.concat(activeSegments, availableSegments))
-            if (segment.getName().equals(name))
-                return true;
-
-        return false;
-    }
-
-    /**
-     * Creates and readies a brand new segment.
-     *
-     * @return the newly minted segment
-     */
-    private CommitLogSegment createFreshSegment()
-    {
-        size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize());
-        return internalAddReadySegment(CommitLogSegment.freshSegment());
-    }
-
-    /**
-     * Adds a segment to our internal tracking list and makes it ready for consumption.
-     *
-     * @param   segment the segment to add
-     * @return  the newly added segment
-     */
-    private CommitLogSegment internalAddReadySegment(CommitLogSegment segment)
-    {
-        assert !activeSegments.contains(segment);
-        assert !availableSegments.contains(segment);
-        availableSegments.add(segment);
-        return segment;
-    }
-
-    /**
-     * Check to see if the speculative current size exceeds the cap.
-     *
-     * @return true if cap is exceeded
-     */
-    private boolean isCapExceeded()
-    {
-        long currentSize = size.get();
-        logger.debug("Total active commitlog segment space used is {}", currentSize);
-        return currentSize > DatabaseDescriptor.getTotalCommitlogSpaceInMB() * 1024 * 1024;
-    }
-
-    /**
-     * Throws a flag that enables the behavior of keeping at least one spare segment
-     * available at all times.
-     */
-    public void enableReserveSegmentCreation()
-    {
-        createReserveSegments = true;
-    }
-
-    /**
-     * Force a flush on all dirty CFs represented in the oldest commitlog segment
-     */
-    private void flushOldestKeyspaces()
-    {
-        CommitLogSegment oldestSegment = activeSegments.peek();
-
-        if (oldestSegment != null && oldestSegment != CommitLog.instance.activeSegment)
-        {
-            for (UUID dirtyCFId : oldestSegment.getDirtyCFIDs())
-            {
-                Pair<String,String> pair = Schema.instance.getCF(dirtyCFId);
-                if (pair == null)
-                {
-                    // even though we remove the schema entry before a final flush when dropping a CF,
-                    // it's still possible for a writer to race and finish his append after the flush.
-                    logger.debug("Marking clean CF {} that doesn't exist anymore", dirtyCFId);
-                    oldestSegment.markClean(dirtyCFId, oldestSegment.getContext());
-                }
-                else
-                {
-                    String keypace = pair.left;
-                    final ColumnFamilyStore cfs = Keyspace.open(keypace).getColumnFamilyStore(dirtyCFId);
-                    // flush shouldn't run on the commitlog executor, since it acquires Table.switchLock,
-                    // which may already be held by a thread waiting for the CL executor (via getContext),
-                    // causing deadlock
-                    Runnable runnable = new Runnable()
-                    {
-                        public void run()
-                        {
-                            cfs.forceFlush();
-                        }
-                    };
-                    StorageService.optionalTasks.execute(runnable);
-                }
-            }
-        }
-    }
-
-    /**
-     * Resets all the segments, for testing purposes. DO NOT USE THIS OUTSIDE OF TESTS.
-     */
-    public void resetUnsafe()
-    {
-        logger.debug("Closing and clearing existing commit log segments...");
-
-        while (StorageService.tasks.getActiveCount() > 0 || !queue.isEmpty())
-            Thread.yield();
-
-        Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS);
-
-        for (CommitLogSegment segment : Iterables.concat(activeSegments, availableSegments))
-            segment.close();
-
-        activeSegments.clear();
-        availableSegments.clear();
-    }
-
-    /**
-     * Initiates the shutdown process for the allocator thread.
-     */
-    public void shutdown()
-    {
-        run = false;
-    }
-
-    /**
-     * Returns when the allocator thread terminates.
-     */
-    public void awaitTermination() throws InterruptedException
-    {
-        allocationThread.join();
-    }
-
-    /**
-     * @return a read-only collection of the active commit log segments
-     */
-    public Collection<CommitLogSegment> getActiveSegments()
-    {
-        return Collections.unmodifiableCollection(activeSegments);
-    }
-}
-
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java
index 8957643..1b1a1e0 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java
@@ -52,7 +52,7 @@
     }
 
     public final Map<String, Future<?>> archivePending = new ConcurrentHashMap<String, Future<?>>();
-    public final ExecutorService executor = new JMXEnabledThreadPoolExecutor("commitlog_archiver");
+    public final ExecutorService executor = new JMXEnabledThreadPoolExecutor("CommitLogArchiver");
     private final String archiveCommand;
     private final String restoreCommand;
     private final String restoreDirectories;
@@ -118,6 +118,29 @@
         }
     }
 
+    public void maybeArchive(final CommitLogSegment segment)
+    {
+        if (Strings.isNullOrEmpty(archiveCommand))
+            return;
+
+        archivePending.put(segment.getName(), executor.submit(new WrappedRunnable()
+        {
+            protected void runMayThrow() throws IOException
+            {
+                segment.waitForFinalSync();
+                String command = archiveCommand.replace("%name", segment.getName());
+                command = command.replace("%path", segment.getPath());
+                exec(command);
+            }
+        }));
+    }
+
+    /**
+     * Differs from the above because it can be used on any file, rather than only
+     * managed commit log segments (and thus cannot call waitForFinalSync).
+     *
+     * Used to archive files present in the commit log directory at startup (CASSANDRA-6904)
+     */
     public void maybeArchive(final String path, final String name)
     {
         if (Strings.isNullOrEmpty(archiveCommand))
@@ -152,7 +175,7 @@
         {
             if (e.getCause() instanceof IOException)
             {
-                logger.info("Looks like the archiving of file {} failed earlier, cassandra is going to ignore this segment for now.", name);
+                logger.error("Looks like the archiving of file {} failed earlier, cassandra is going to ignore this segment for now.", name);
                 return false;
             }
             throw new RuntimeException(e);
@@ -175,7 +198,30 @@
             }
             for (File fromFile : files)
             {
-                File toFile = new File(DatabaseDescriptor.getCommitLogLocation(), new CommitLogDescriptor(CommitLogSegment.getNextId()).fileName());
+                CommitLogDescriptor fromHeader = CommitLogDescriptor.fromHeader(fromFile);
+                CommitLogDescriptor fromName = CommitLogDescriptor.isValid(fromFile.getName()) ? CommitLogDescriptor.fromFileName(fromFile.getName()) : null;
+                CommitLogDescriptor descriptor;
+                if (fromHeader == null && fromName == null)
+                    throw new IllegalStateException("Cannot safely construct descriptor for segment, either from its name or its header: " + fromFile.getPath());
+                else if (fromHeader != null && fromName != null && !fromHeader.equals(fromName))
+                    throw new IllegalStateException(String.format("Cannot safely construct descriptor for segment, as name and header descriptors do not match (%s vs %s): %s", fromHeader, fromName, fromFile.getPath()));
+                else if (fromName != null && fromHeader == null && fromName.version >= CommitLogDescriptor.VERSION_21)
+                    throw new IllegalStateException("Cannot safely construct descriptor for segment, as name descriptor implies a version that should contain a header descriptor, but that descriptor could not be read: " + fromFile.getPath());
+                else if (fromHeader != null)
+                    descriptor = fromHeader;
+                else descriptor = fromName;
+
+                if (descriptor.version > CommitLogDescriptor.VERSION_21)
+                    throw new IllegalStateException("Unsupported commit log version: " + descriptor.version);
+
+                File toFile = new File(DatabaseDescriptor.getCommitLogLocation(), descriptor.fileName());
+                if (toFile.exists())
+                {
+                    logger.debug("Skipping restore of archive {} as the segment already exists in the restore location {}",
+                                 fromFile.getPath(), toFile.getPath());
+                    continue;
+                }
+
                 String command = restoreCommand.replace("%from", fromFile.getPath());
                 command = command.replace("%to", toFile.getPath());
                 try
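
For illustration, the archive and restore hooks above are plain string substitution followed by exec(): %name/%path are filled in when archiving a finished segment, and %from/%to when restoring archived segments at startup. A minimal standalone sketch of that substitution follows; the command strings, paths and segment name are hypothetical examples, not values taken from this patch.

    // Hypothetical example of the placeholder substitution performed before exec()
    // runs the operator-supplied archive/restore command.
    public final class ArchiveCommandSubstitutionSketch
    {
        public static void main(String[] args)
        {
            // archiving a finished segment: %path is the full path, %name the file name
            String archiveCommand = "/bin/ln %path /backup/commitlog/%name";
            String path = "/var/lib/cassandra/commitlog/CommitLog-4-1400000000001.log";
            String name = "CommitLog-4-1400000000001.log";
            System.out.println(archiveCommand.replace("%name", name).replace("%path", path));

            // restoring at startup: %from is the archived file, %to the target in the commit log directory
            String restoreCommand = "/bin/cp -f %from %to";
            String from = "/backup/commitlog/CommitLog-4-1400000000001.log";
            String to = "/var/lib/cassandra/commitlog/CommitLog-4-1400000000001.log";
            System.out.println(restoreCommand.replace("%from", from).replace("%to", to));
        }
    }
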
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
index 7488c20..91c81e1 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
@@ -20,10 +20,18 @@
  */
 package org.apache.cassandra.db.commitlog;
 
+import java.io.EOFException;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.PureJavaCrc32;
 
 public class CommitLogDescriptor
 {
@@ -35,13 +43,17 @@
 
     public static final int VERSION_12 = 2;
     public static final int VERSION_20 = 3;
+    public static final int VERSION_21 = 4;
     /**
      * Increment this number if there is a change in the commit log disc layout or MessagingVersion changes.
      * Note: make sure to handle {@link #getMessagingVersion()}
      */
-    public static final int current_version = VERSION_20;
+    public static final int current_version = VERSION_21;
 
-    private final int version;
+    // [version, id, checksum]
+    static final int HEADER_SIZE = 4 + 8 + 4;
+
+    final int version;
     public final long id;
 
     public CommitLogDescriptor(int version, long id)
@@ -55,6 +67,43 @@
         this(current_version, id);
     }
 
+    static void writeHeader(ByteBuffer out, CommitLogDescriptor descriptor)
+    {
+        out.putInt(0, descriptor.version);
+        out.putLong(4, descriptor.id);
+        PureJavaCrc32 crc = new PureJavaCrc32();
+        crc.updateInt(descriptor.version);
+        crc.updateInt((int) (descriptor.id & 0xFFFFFFFFL));
+        crc.updateInt((int) (descriptor.id >>> 32));
+        out.putInt(12, crc.getCrc());
+    }
+
+    public static CommitLogDescriptor fromHeader(File file)
+    {
+        try (RandomAccessFile raf = new RandomAccessFile(file, "r"))
+        {
+            assert raf.getFilePointer() == 0;
+            int version = raf.readInt();
+            long id = raf.readLong();
+            int crc = raf.readInt();
+            PureJavaCrc32 checkcrc = new PureJavaCrc32();
+            checkcrc.updateInt(version);
+            checkcrc.updateInt((int) (id & 0xFFFFFFFFL));
+            checkcrc.updateInt((int) (id >>> 32));
+            if (crc == checkcrc.getCrc())
+                return new CommitLogDescriptor(version, id);
+            return null;
+        }
+        catch (EOFException e)
+        {
+            throw new RuntimeException(e);
+        }
+        catch (IOException e)
+        {
+            throw new FSReadError(e, file);
+        }
+    }
+
     public static CommitLogDescriptor fromFileName(String name)
     {
         Matcher matcher;
@@ -76,16 +125,13 @@
                 return MessagingService.VERSION_12;
             case VERSION_20:
                 return MessagingService.VERSION_20;
+            case VERSION_21:
+                return MessagingService.VERSION_21;
             default:
                 throw new IllegalStateException("Unknown commitlog version " + version);
         }
     }
 
-    public int getVersion()
-    {
-        return version;
-    }
-
     public String fileName()
     {
         return FILENAME_PREFIX + version + SEPARATOR + id + FILENAME_EXTENSION;
@@ -99,4 +145,20 @@
     {
         return COMMIT_LOG_FILE_PATTERN.matcher(filename).matches();
     }
+
+    public String toString()
+    {
+        return "(" + version + "," + id + ")";
+    }
+
+    public boolean equals(Object that)
+    {
+        return that instanceof CommitLogDescriptor && equals((CommitLogDescriptor) that);
+    }
+
+    public boolean equals(CommitLogDescriptor that)
+    {
+        return this.version == that.version && this.id == that.id;
+    }
+
 }
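
For reference, the header introduced above is a fixed 16-byte prefix, [version:int][id:long][crc:int], with the CRC computed over the version and the two 32-bit halves of the id (low half first). Below is a minimal standalone sketch of writing and re-validating such a header, assuming PureJavaCrc32.updateInt(v) is equivalent to feeding v's four big-endian bytes to a standard CRC-32; java.util.zip.CRC32 is used here only for illustration.

    import java.nio.ByteBuffer;
    import java.util.zip.CRC32;

    public final class CommitLogHeaderSketch
    {
        static int crcOf(int version, long id)
        {
            ByteBuffer b = ByteBuffer.allocate(12);
            b.putInt(version);
            b.putInt((int) (id & 0xFFFFFFFFL));   // low half first, mirroring writeHeader()
            b.putInt((int) (id >>> 32));
            CRC32 crc = new CRC32();
            crc.update(b.array());
            return (int) crc.getValue();
        }

        public static void main(String[] args)
        {
            int version = 4;                      // VERSION_21
            long id = System.currentTimeMillis();

            ByteBuffer header = ByteBuffer.allocate(16);   // HEADER_SIZE = 4 + 8 + 4
            header.putInt(0, version);
            header.putLong(4, id);
            header.putInt(12, crcOf(version, id));

            // a reader accepts the descriptor only if the stored CRC matches a recomputation
            boolean valid = header.getInt(12) == crcOf(header.getInt(0), header.getLong(4));
            System.out.println(valid ? "header CRC ok" : "header CRC mismatch");
        }
    }
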
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
index 579b6ee..e89338a 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
@@ -22,7 +22,6 @@
 import java.util.*;
 import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.zip.Checksum;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.HashMultimap;
@@ -48,6 +47,7 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(CommitLogReplayer.class);
     private static final int MAX_OUTSTANDING_REPLAY_COUNT = 1024;
+    private static final int LEGACY_END_OF_SEGMENT_MARKER = 0;
 
     private final Set<Keyspace> keyspacesRecovered;
     private final List<Future<?>> futures;
@@ -55,7 +55,7 @@
     private final AtomicInteger replayedCount;
     private final Map<UUID, ReplayPosition> cfPositions;
     private final ReplayPosition globalPosition;
-    private final Checksum checksum;
+    private final PureJavaCrc32 checksum;
     private byte[] buffer;
 
     public CommitLogReplayer()
@@ -112,9 +112,60 @@
         return replayedCount.get();
     }
 
+    private int readSyncMarker(CommitLogDescriptor descriptor, int offset, RandomAccessReader reader) throws IOException
+    {
+        if (offset > reader.length() - CommitLogSegment.SYNC_MARKER_SIZE)
+        {
+            if (offset != reader.length() && offset != Integer.MAX_VALUE)
+                logger.warn("Encountered bad header at position {} of commit log {}; not enough room for a header", offset, reader.getPath());
+            // cannot possibly be a header here. if we're == length(), assume it's a correctly written final segment
+            return -1;
+        }
+        reader.seek(offset);
+        PureJavaCrc32 crc = new PureJavaCrc32();
+        crc.updateInt((int) (descriptor.id & 0xFFFFFFFFL));
+        crc.updateInt((int) (descriptor.id >>> 32));
+        crc.updateInt((int) reader.getPosition());
+        int end = reader.readInt();
+        long filecrc;
+        if (descriptor.version < CommitLogDescriptor.VERSION_21)
+            filecrc = reader.readLong();
+        else
+            filecrc = reader.readInt() & 0xffffffffL;
+        if (crc.getValue() != filecrc)
+        {
+            if (end != 0 || filecrc != 0)
+            {
+                logger.warn("Encountered bad header at position {} of commit log {}, with invalid CRC. The end of segment marker should be zero.", offset, reader.getPath());
+            }
+            return -1;
+        }
+        else if (end < offset || end > reader.length())
+        {
+            logger.warn("Encountered bad header at position {} of commit log {}, with bad position but valid CRC", offset, reader.getPath());
+            return -1;
+        }
+        return end;
+    }
+
+    private int getStartOffset(long segmentId, int version)
+    {
+        if (globalPosition.segment < segmentId)
+        {
+            if (version >= CommitLogDescriptor.VERSION_21)
+                return CommitLogDescriptor.HEADER_SIZE + CommitLogSegment.SYNC_MARKER_SIZE;
+            else
+                return 0;
+        }
+        else if (globalPosition.segment == segmentId)
+            return globalPosition.position;
+        else
+            return -1;
+    }
+
     private abstract static class ReplayFilter
     {
-        public abstract Iterable<ColumnFamily> filter(RowMutation rm);
+        public abstract Iterable<ColumnFamily> filter(Mutation mutation);
 
         public static ReplayFilter create()
         {
@@ -143,9 +194,9 @@
 
     private static class AlwaysReplayFilter extends ReplayFilter
     {
-        public Iterable<ColumnFamily> filter(RowMutation rm)
+        public Iterable<ColumnFamily> filter(Mutation mutation)
         {
-            return rm.getColumnFamilies();
+            return mutation.getColumnFamilies();
         }
     }
 
@@ -158,13 +209,13 @@
             this.toReplay = toReplay;
         }
 
-        public Iterable<ColumnFamily> filter(RowMutation rm)
+        public Iterable<ColumnFamily> filter(Mutation mutation)
         {
-            final Collection<String> cfNames = toReplay.get(rm.getKeyspaceName());
+            final Collection<String> cfNames = toReplay.get(mutation.getKeyspaceName());
             if (cfNames == null)
                 return Collections.emptySet();
 
-            return Iterables.filter(rm.getColumnFamilies(), new Predicate<ColumnFamily>()
+            return Iterables.filter(mutation.getColumnFamilies(), new Predicate<ColumnFamily>()
             {
                 public boolean apply(ColumnFamily cf)
                 {
@@ -177,201 +228,226 @@
     public void recover(File file) throws IOException
     {
         final ReplayFilter replayFilter = ReplayFilter.create();
+        logger.info("Replaying {}", file.getPath());
         CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(file.getName());
-        final long segment = desc.id;
+        final long segmentId = desc.id;
         logger.info("Replaying {} (CL version {}, messaging version {})",
                     file.getPath(),
-                    desc.getVersion(),
+                    desc.version,
                     desc.getMessagingVersion());
         RandomAccessReader reader = RandomAccessReader.open(new File(file.getAbsolutePath()));
+
         try
         {
             assert reader.length() <= Integer.MAX_VALUE;
-            int replayPosition;
-            if (globalPosition.segment < segment)
-            {
-                replayPosition = 0;
-            }
-            else if (globalPosition.segment == segment)
-            {
-                replayPosition = globalPosition.position;
-            }
-            else
+            int offset = getStartOffset(segmentId, desc.version);
+            if (offset < 0)
             {
                 logger.debug("skipping replay of fully-flushed {}", file);
                 return;
             }
 
-            if (logger.isDebugEnabled())
-                logger.debug("Replaying " + file + " starting at " + replayPosition);
-            reader.seek(replayPosition);
-
-            /* read the logs populate RowMutation and apply */
-            while (!reader.isEOF())
+            int prevEnd = CommitLogDescriptor.HEADER_SIZE;
+            main: while (true)
             {
+
+                int end = prevEnd;
+                if (desc.version < CommitLogDescriptor.VERSION_21)
+                    end = Integer.MAX_VALUE;
+                else
+                {
+                    do { end = readSyncMarker(desc, end, reader); }
+                    while (end < offset && end > prevEnd);
+                }
+
+                if (end < prevEnd)
+                    break;
+
                 if (logger.isDebugEnabled())
-                    logger.debug("Reading mutation at " + reader.getFilePointer());
+                    logger.debug("Replaying {} between {} and {}", file, offset, end);
 
-                long claimedCRC32;
-                int serializedSize;
-                try
+                reader.seek(offset);
+
+                /* read the log entries, deserialize the Mutations and apply them */
+                while (reader.getPosition() < end && !reader.isEOF())
                 {
-                    // any of the reads may hit EOF
-                    serializedSize = reader.readInt();
-                    if (serializedSize == CommitLog.END_OF_SEGMENT_MARKER)
-                    {
-                        logger.debug("Encountered end of segment marker at " + reader.getFilePointer());
-                        break;
-                    }
+                    if (logger.isDebugEnabled())
+                        logger.debug("Reading mutation at {}", reader.getFilePointer());
 
-                    // RowMutation must be at LEAST 10 bytes:
-                    // 3 each for a non-empty Keyspace and Key (including the
-                    // 2-byte length from writeUTF/writeWithShortLength) and 4 bytes for column count.
-                    // This prevents CRC by being fooled by special-case garbage in the file; see CASSANDRA-2128
-                    if (serializedSize < 10)
-                        break;
-
-                    long claimedSizeChecksum = reader.readLong();
-                    checksum.reset();
-                    if (desc.getVersion() < CommitLogDescriptor.VERSION_20)
-                        checksum.update(serializedSize);
-                    else
-                        FBUtilities.updateChecksumInt(checksum, serializedSize);
-
-                    if (checksum.getValue() != claimedSizeChecksum)
-                        break; // entry wasn't synced correctly/fully. that's
-                               // ok.
-
-                    if (serializedSize > buffer.length)
-                        buffer = new byte[(int) (1.2 * serializedSize)];
-                    reader.readFully(buffer, 0, serializedSize);
-                    claimedCRC32 = reader.readLong();
-                }
-                catch (EOFException eof)
-                {
-                    break; // last CL entry didn't get completely written. that's ok.
-                }
-
-                checksum.update(buffer, 0, serializedSize);
-                if (claimedCRC32 != checksum.getValue())
-                {
-                    // this entry must not have been fsynced. probably the rest is bad too,
-                    // but just in case there is no harm in trying them (since we still read on an entry boundary)
-                    continue;
-                }
-
-                /* deserialize the commit log entry */
-                FastByteArrayInputStream bufIn = new FastByteArrayInputStream(buffer, 0, serializedSize);
-                final RowMutation rm;
-                try
-                {
-                    rm = RowMutation.serializer.deserialize(new DataInputStream(bufIn),
-                                                            desc.getMessagingVersion(),
-                                                            ColumnSerializer.Flag.LOCAL);
-                    // doublecheck that what we read is [still] valid for the current schema
-                    for (ColumnFamily cf : rm.getColumnFamilies())
-                        for (Column cell : cf)
-                            cf.getComparator().validate(cell.name());
-                }
-                catch (UnknownColumnFamilyException ex)
-                {
-                    if (ex.cfId == null)
-                        continue;
-                    AtomicInteger i = invalidMutations.get(ex.cfId);
-                    if (i == null)
-                    {
-                        i = new AtomicInteger(1);
-                        invalidMutations.put(ex.cfId, i);
-                    }
-                    else
-                        i.incrementAndGet();
-                    continue;
-                }
-                catch (Throwable t)
-                {
-                    File f = File.createTempFile("mutation", "dat");
-                    DataOutputStream out = new DataOutputStream(new FileOutputStream(f));
+                    long claimedCRC32;
+                    int serializedSize;
                     try
                     {
-                        out.write(buffer, 0, serializedSize);
-                    }
-                    finally
-                    {
-                        out.close();
-                    }
-                    String st = String.format("Unexpected error deserializing mutation; saved to %s and ignored.  This may be caused by replaying a mutation against a table with the same name but incompatible schema.  Exception follows: ",
-                                              f.getAbsolutePath());
-                    logger.error(st, t);
-                    continue;
-                }
-
-                if (logger.isDebugEnabled())
-                    logger.debug(String.format("replaying mutation for %s.%s: %s", rm.getKeyspaceName(), ByteBufferUtil.bytesToHex(rm.key()), "{" + StringUtils.join(rm.getColumnFamilies().iterator(), ", ")
-                            + "}"));
-
-                final long entryLocation = reader.getFilePointer();
-                Runnable runnable = new WrappedRunnable()
-                {
-                    public void runMayThrow() throws IOException
-                    {
-                        if (Schema.instance.getKSMetaData(rm.getKeyspaceName()) == null)
-                            return;
-                        if (pointInTimeExceeded(rm))
-                            return;
-
-                        final Keyspace keyspace = Keyspace.open(rm.getKeyspaceName());
-
-                        // Rebuild the row mutation, omitting column families that
-                        //    a) the user has requested that we ignore,
-                        //    b) have already been flushed,
-                        // or c) are part of a cf that was dropped.
-                        // Keep in mind that the cf.name() is suspect. do every thing based on the cfid instead.
-                        RowMutation newRm = null;
-                        for (ColumnFamily columnFamily : replayFilter.filter(rm))
+                        // any of the reads may hit EOF
+                        serializedSize = reader.readInt();
+                        if (serializedSize == LEGACY_END_OF_SEGMENT_MARKER)
                         {
-                            if (Schema.instance.getCF(columnFamily.id()) == null)
-                                continue; // dropped
+                            logger.debug("Encountered end of segment marker at {}", reader.getFilePointer());
+                            break main;
+                        }
 
-                            ReplayPosition rp = cfPositions.get(columnFamily.id());
+                        // Mutation must be at LEAST 10 bytes:
+                        // 3 each for a non-empty Keyspace and Key (including the
+                        // 2-byte length from writeUTF/writeWithShortLength) and 4 bytes for column count.
+                        // This prevents the CRC from being fooled by special-case garbage in the file; see CASSANDRA-2128
+                        if (serializedSize < 10)
+                            break main;
 
-                            // replay if current segment is newer than last flushed one or,
-                            // if it is the last known segment, if we are after the replay position
-                            if (segment > rp.segment || (segment == rp.segment && entryLocation > rp.position))
+                        long claimedSizeChecksum;
+                        if (desc.version < CommitLogDescriptor.VERSION_21)
+                            claimedSizeChecksum = reader.readLong();
+                        else
+                            claimedSizeChecksum = reader.readInt() & 0xffffffffL;
+                        checksum.reset();
+                        if (desc.version < CommitLogDescriptor.VERSION_20)
+                            checksum.update(serializedSize);
+                        else
+                            checksum.updateInt(serializedSize);
+
+                        if (checksum.getValue() != claimedSizeChecksum)
+                            break main; // entry wasn't synced correctly/fully. that's
+                                        // ok.
+
+                        if (serializedSize > buffer.length)
+                            buffer = new byte[(int) (1.2 * serializedSize)];
+                        reader.readFully(buffer, 0, serializedSize);
+                        if (desc.version < CommitLogDescriptor.VERSION_21)
+                            claimedCRC32 = reader.readLong();
+                        else
+                            claimedCRC32 = reader.readInt() & 0xffffffffL;
+                    }
+                    catch (EOFException eof)
+                    {
+                        break main; // last CL entry didn't get completely written. that's ok.
+                    }
+
+                    checksum.update(buffer, 0, serializedSize);
+                    if (claimedCRC32 != checksum.getValue())
+                    {
+                        // this entry must not have been fsynced. probably the rest is bad too,
+                        // but just in case there is no harm in trying them (since we still read on an entry boundary)
+                        continue;
+                    }
+
+                    /* deserialize the commit log entry */
+                    FastByteArrayInputStream bufIn = new FastByteArrayInputStream(buffer, 0, serializedSize);
+                    final Mutation mutation;
+                    try
+                    {
+                        mutation = Mutation.serializer.deserialize(new DataInputStream(bufIn),
+                                                                   desc.getMessagingVersion(),
+                                                                   ColumnSerializer.Flag.LOCAL);
+                        // doublecheck that what we read is [still] valid for the current schema
+                        for (ColumnFamily cf : mutation.getColumnFamilies())
+                            for (Cell cell : cf)
+                                cf.getComparator().validate(cell.name());
+                    }
+                    catch (UnknownColumnFamilyException ex)
+                    {
+                        if (ex.cfId == null)
+                            continue;
+                        AtomicInteger i = invalidMutations.get(ex.cfId);
+                        if (i == null)
+                        {
+                            i = new AtomicInteger(1);
+                            invalidMutations.put(ex.cfId, i);
+                        }
+                        else
+                            i.incrementAndGet();
+                        continue;
+                    }
+                    catch (Throwable t)
+                    {
+                        JVMStabilityInspector.inspectThrowable(t);
+                        File f = File.createTempFile("mutation", "dat");
+                        DataOutputStream out = new DataOutputStream(new FileOutputStream(f));
+                        try
+                        {
+                            out.write(buffer, 0, serializedSize);
+                        }
+                        finally
+                        {
+                            out.close();
+                        }
+                        String st = String.format("Unexpected error deserializing mutation; saved to %s and ignored.  This may be caused by replaying a mutation against a table with the same name but incompatible schema.  Exception follows: ",
+                                                  f.getAbsolutePath());
+                        logger.error(st, t);
+                        continue;
+                    }
+
+                    if (logger.isDebugEnabled())
+                        logger.debug("replaying mutation for {}.{}: {}", mutation.getKeyspaceName(), ByteBufferUtil.bytesToHex(mutation.key()), "{" + StringUtils.join(mutation.getColumnFamilies().iterator(), ", ") + "}");
+
+                    final long entryLocation = reader.getFilePointer();
+                    Runnable runnable = new WrappedRunnable()
+                    {
+                        public void runMayThrow() throws IOException
+                        {
+                            if (Schema.instance.getKSMetaData(mutation.getKeyspaceName()) == null)
+                                return;
+                            if (pointInTimeExceeded(mutation))
+                                return;
+
+                            final Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());
+
+                            // Rebuild the mutation, omitting column families that
+                            //    a) the user has requested that we ignore,
+                            //    b) have already been flushed,
+                            // or c) are part of a cf that was dropped.
+                            // Keep in mind that the cf.name() is suspect. do every thing based on the cfid instead.
+                            Mutation newMutation = null;
+                            for (ColumnFamily columnFamily : replayFilter.filter(mutation))
                             {
-                                if (newRm == null)
-                                    newRm = new RowMutation(rm.getKeyspaceName(), rm.key());
-                                newRm.add(columnFamily);
-                                replayedCount.incrementAndGet();
+                                if (Schema.instance.getCF(columnFamily.id()) == null)
+                                    continue; // dropped
+
+                                ReplayPosition rp = cfPositions.get(columnFamily.id());
+
+                                // replay if current segment is newer than last flushed one or,
+                                // if it is the last known segment, if we are after the replay position
+                                if (segmentId > rp.segment || (segmentId == rp.segment && entryLocation > rp.position))
+                                {
+                                    if (newMutation == null)
+                                        newMutation = new Mutation(mutation.getKeyspaceName(), mutation.key());
+                                    newMutation.add(columnFamily);
+                                    replayedCount.incrementAndGet();
+                                }
+                            }
+                            if (newMutation != null)
+                            {
+                                assert !newMutation.isEmpty();
+                                Keyspace.open(newMutation.getKeyspaceName()).apply(newMutation, false);
+                                keyspacesRecovered.add(keyspace);
                             }
                         }
-                        if (newRm != null)
-                        {
-                            assert !newRm.isEmpty();
-                            Keyspace.open(newRm.getKeyspaceName()).apply(newRm, false);
-                            keyspacesRecovered.add(keyspace);
-                        }
+                    };
+                    futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
+                    if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
+                    {
+                        FBUtilities.waitOnFutures(futures);
+                        futures.clear();
                     }
-                };
-                futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
-                if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
-                {
-                    FBUtilities.waitOnFutures(futures);
-                    futures.clear();
                 }
+
+                if (desc.version < CommitLogDescriptor.VERSION_21)
+                    break;
+
+                offset = end + CommitLogSegment.SYNC_MARKER_SIZE;
+                prevEnd = end;
             }
         }
         finally
         {
             FileUtils.closeQuietly(reader);
-            logger.info("Finished reading " + file);
+            logger.info("Finished reading {}", file);
         }
     }
 
-    protected boolean pointInTimeExceeded(RowMutation frm)
+    protected boolean pointInTimeExceeded(Mutation fm)
     {
         long restoreTarget = CommitLog.instance.archiver.restorePointInTime;
 
-        for (ColumnFamily families : frm.getColumnFamilies())
+        for (ColumnFamily families : fm.getColumnFamilies())
         {
             if (CommitLog.instance.archiver.precision.toMillis(families.maxTimestamp()) > restoreTarget)
                 return true;
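
As a reading aid, the per-entry framing validated above for VERSION_21 segments is [size:int][crc(size):int][payload: size bytes][crc(size, payload):int]; older segments store the two checksums as longs, which is why the replayer branches on the descriptor version. A small standalone sketch of writing and re-checking one such entry, again substituting java.util.zip.CRC32 for PureJavaCrc32 on the assumption that both produce identical values over the same big-endian int bytes:

    import java.nio.ByteBuffer;
    import java.util.zip.CRC32;

    public final class EntryFramingSketch
    {
        // fold in an int's four big-endian bytes, mirroring updateChecksumInt / PureJavaCrc32.updateInt
        static void updateInt(CRC32 crc, int v)
        {
            crc.update(new byte[]{ (byte) (v >>> 24), (byte) (v >>> 16), (byte) (v >>> 8), (byte) v });
        }

        public static void main(String[] args)
        {
            byte[] payload = "example serialized mutation".getBytes();

            // writer side: the head CRC covers only the size, the tail CRC chains size + payload
            CRC32 crc = new CRC32();
            updateInt(crc, payload.length);
            int sizeCrc = (int) crc.getValue();
            crc.update(payload);
            int tailCrc = (int) crc.getValue();

            ByteBuffer entry = ByteBuffer.allocate(4 + 4 + payload.length + 4);   // overhead = 12 bytes
            entry.putInt(payload.length).putInt(sizeCrc).put(payload).putInt(tailCrc);
            entry.flip();

            // replay side, mirroring the VERSION_21 branch above
            int size = entry.getInt();
            long claimedSizeCrc = entry.getInt() & 0xffffffffL;
            byte[] read = new byte[size];
            entry.get(read);
            long claimedTailCrc = entry.getInt() & 0xffffffffL;

            CRC32 check = new CRC32();
            updateInt(check, size);
            boolean sizeOk = check.getValue() == claimedSizeCrc;
            check.update(read);
            boolean tailOk = check.getValue() == claimedTailCrc;
            System.out.println("size crc ok: " + sizeOk + ", entry crc ok: " + tailOk);
        }
    }
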
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java
index 45cc77d..a70327b 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java
@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.db.commitlog;
 
-import java.io.DataOutputStream;
 import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
@@ -28,13 +26,15 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.zip.Checksum;
 
+import org.cliffc.high_scale_lib.NonBlockingHashMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -42,16 +42,16 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.io.FSWriteError;
-import org.apache.cassandra.io.util.ByteBufferOutputStream;
-import org.apache.cassandra.io.util.ChecksummedOutputStream;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.CLibrary;
 import org.apache.cassandra.utils.PureJavaCrc32;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
 
 /*
- * A single commit log file on disk. Manages creation of the file and writing row mutations to disk,
+ * A single commit log file on disk. Manages creation of the file and writing mutations to disk,
  * as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
  * files are initially allocated to a fixed size and can grow to accommodate a larger value if necessary.
  */
@@ -59,41 +59,71 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(CommitLogSegment.class);
 
-    private final static long idBase = System.currentTimeMillis();
+    private final static long idBase;
     private final static AtomicInteger nextId = new AtomicInteger(1);
+    static
+    {
+        long maxId = Long.MIN_VALUE;
+        for (File file : new File(DatabaseDescriptor.getCommitLogLocation()).listFiles())
+        {
+            if (CommitLogDescriptor.isValid(file.getName()))
+                maxId = Math.max(CommitLogDescriptor.fromFileName(file.getName()).id, maxId);
+        }
+        idBase = Math.max(System.currentTimeMillis(), maxId + 1);
+    }
 
-    // The commit log entry overhead in bytes (int: length + long: head checksum + long: tail checksum)
-    static final int ENTRY_OVERHEAD_SIZE = 4 + 8 + 8;
+    // The commit log entry overhead in bytes (int: length + int: head checksum + int: tail checksum)
+    public static final int ENTRY_OVERHEAD_SIZE = 4 + 4 + 4;
 
-    // cache which cf is dirty in this segment to avoid having to lookup all ReplayPositions to decide if we can delete this segment
-    private final Map<UUID, Integer> cfLastWrite = new HashMap<>();
+    // The commit log (chained) sync marker/header size in bytes (int: length + int: checksum [segmentId, position])
+    static final int SYNC_MARKER_SIZE = 4 + 4;
+
+    // The OpOrder used to order appends wrt sync
+    private final OpOrder appendOrder = new OpOrder();
+
+    private final AtomicInteger allocatePosition = new AtomicInteger();
+
+    // Everything before this offset has been synced and written.  The SYNC_MARKER_SIZE bytes after
+    // each sync are reserved, and point forwards to the next such offset.  The final
+    // sync marker in a segment will be zeroed out, or point to EOF.
+    private volatile int lastSyncedOffset;
+
+    // the amount of the tail of the file we have allocated but not used - this is used when we discard a log segment
+    // to ensure nobody writes to it after we've decided we're done with it
+    private int discardedTailFrom;
+
+    // a signal for writers to wait on to confirm the log message they provided has been written to disk
+    private final WaitQueue syncComplete = new WaitQueue();
+
+    // a map of Cf->dirty position; this is used to permit marking Cfs clean whilst the log is still in use
+    private final NonBlockingHashMap<UUID, AtomicInteger> cfDirty = new NonBlockingHashMap<>(1024);
+
+    // a map of Cf->clean position; this is used to permit marking Cfs clean whilst the log is still in use
+    private final ConcurrentHashMap<UUID, AtomicInteger> cfClean = new ConcurrentHashMap<>();
 
     public final long id;
 
     private final File logFile;
     private final RandomAccessFile logFileAccessor;
-
-    private boolean needsSync = false;
+    private final int fd;
 
     private final MappedByteBuffer buffer;
-    private final Checksum checksum;
-    private final DataOutputStream bufferStream;
-    private boolean closed;
 
     public final CommitLogDescriptor descriptor;
 
     /**
      * @return a newly minted segment file
      */
-    public static CommitLogSegment freshSegment()
+    static CommitLogSegment freshSegment()
     {
         return new CommitLogSegment(null);
     }
 
-    public static long getNextId()
+    static long getNextId()
     {
         return idBase + nextId.getAndIncrement();
     }
+
     /**
      * Constructs a new segment file.
      *
@@ -127,16 +157,20 @@
             if (isCreating)
                 logger.debug("Creating new commit log segment {}", logFile.getPath());
 
-            // Map the segment, extending or truncating it to the standard segment size
+            // Map the segment, extending or truncating it to the standard segment size.
+            // (We may have restarted after a segment size configuration change, leaving "incorrectly"
+            // sized segments on disk.)
             logFileAccessor.setLength(DatabaseDescriptor.getCommitLogSegmentSize());
+            fd = CLibrary.getfd(logFileAccessor.getFD());
 
             buffer = logFileAccessor.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, DatabaseDescriptor.getCommitLogSegmentSize());
-            checksum = new PureJavaCrc32();
-            bufferStream = new DataOutputStream(new ChecksummedOutputStream(new ByteBufferOutputStream(buffer), checksum));
-            buffer.putInt(CommitLog.END_OF_SEGMENT_MARKER);
-            buffer.position(0);
-            needsSync = true;
-            sync();
+            // write the header
+            CommitLogDescriptor.writeHeader(buffer, descriptor);
+            // mark the initial sync marker as uninitialised
+            buffer.putInt(CommitLogDescriptor.HEADER_SIZE, 0);
+            buffer.putLong(CommitLogDescriptor.HEADER_SIZE + 4, 0);
+            allocatePosition.set(CommitLogDescriptor.HEADER_SIZE + SYNC_MARKER_SIZE);
+            lastSyncedOffset = CommitLogDescriptor.HEADER_SIZE;
         }
         catch (IOException e)
         {
@@ -145,60 +179,168 @@
     }
 
     /**
+     * Allocate space in this buffer for the provided mutation, and return the allocated Allocation object.
+     * Returns null if there is not enough space in this segment, and a new segment is needed.
+     */
+    Allocation allocate(Mutation mutation, int size)
+    {
+        final OpOrder.Group opGroup = appendOrder.start();
+        try
+        {
+            int position = allocate(size);
+            if (position < 0)
+            {
+                opGroup.close();
+                return null;
+            }
+            markDirty(mutation, position);
+            return new Allocation(this, opGroup, position, (ByteBuffer) buffer.duplicate().position(position).limit(position + size));
+        }
+        catch (Throwable t)
+        {
+            opGroup.close();
+            throw t;
+        }
+    }
+
+    // allocate bytes in the segment, or return -1 if not enough space
+    private int allocate(int size)
+    {
+        while (true)
+        {
+            int prev = allocatePosition.get();
+            int next = prev + size;
+            if (next >= buffer.capacity())
+                return -1;
+            if (allocatePosition.compareAndSet(prev, next))
+                return prev;
+        }
+    }
+
+    // ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
+    void discardUnusedTail()
+    {
+        // we guard this with the OpOrdering instead of synchronised due to potential dead-lock with CLSM.advanceAllocatingFrom()
+        // this actually isn't strictly necessary, as currently all calls to discardUnusedTail occur within a block
+        // already protected by this OpOrdering, but to prevent future potential mistakes, we duplicate the protection here
+        // so that the contract between discardUnusedTail() and sync() is more explicit.
+        try (OpOrder.Group group = appendOrder.start())
+        {
+            while (true)
+            {
+                int prev = allocatePosition.get();
+                // we set allocatePosition past buffer.capacity() to make sure we always set discardedTailFrom
+                int next = buffer.capacity() + 1;
+                if (prev == next)
+                    return;
+                if (allocatePosition.compareAndSet(prev, next))
+                {
+                    discardedTailFrom = prev;
+                    return;
+                }
+            }
+        }
+    }
+
+    /**
+     * Wait for any appends or discardUnusedTail() operations started before this method was called
+     */
+    void waitForModifications()
+    {
+        // issue a barrier and wait for it
+        appendOrder.awaitNewBarrier();
+    }
+
+    /**
+     * Forces a disk flush for this segment file.
+     */
+    synchronized void sync()
+    {
+        try
+        {
+            // check we have more work to do
+            if (allocatePosition.get() <= lastSyncedOffset + SYNC_MARKER_SIZE)
+                return;
+
+            // allocate a new sync marker; this is necessary in itself, and also serves to demarcate
+            // the point up to which we can safely consider records to have been completely written
+            int nextMarker;
+            nextMarker = allocate(SYNC_MARKER_SIZE);
+            boolean close = false;
+            if (nextMarker < 0)
+            {
+                // ensure no more of this CLS is writeable, and mark ourselves for closing
+                discardUnusedTail();
+                close = true;
+
+                // wait for modifications guards both discardedTailFrom, and any outstanding appends
+                waitForModifications();
+
+                if (discardedTailFrom < buffer.capacity() - SYNC_MARKER_SIZE)
+                {
+                    // if there's room in the discard section to write an empty header, use that as the nextMarker
+                    nextMarker = discardedTailFrom;
+                }
+                else
+                {
+                    // not enough space left in the buffer, so mark the next sync marker as the EOF position
+                    nextMarker = buffer.capacity();
+                }
+            }
+            else
+            {
+                waitForModifications();
+            }
+
+            assert nextMarker > lastSyncedOffset;
+
+            // write previous sync marker to point to next sync marker
+            // we don't chain the crcs here to ensure this method is idempotent if it fails
+            int offset = lastSyncedOffset;
+            final PureJavaCrc32 crc = new PureJavaCrc32();
+            crc.updateInt((int) (id & 0xFFFFFFFFL));
+            crc.updateInt((int) (id >>> 32));
+            crc.updateInt(offset);
+            buffer.putInt(offset, nextMarker);
+            buffer.putInt(offset + 4, crc.getCrc());
+
+            // zero out the next sync marker so replayer can cleanly exit
+            if (nextMarker < buffer.capacity())
+            {
+                buffer.putInt(nextMarker, 0);
+                buffer.putInt(nextMarker + 4, 0);
+            }
+
+            // actually perform the sync and signal those waiting for it
+            buffer.force();
+
+            if (close)
+                nextMarker = buffer.capacity();
+
+            lastSyncedOffset = nextMarker;
+            syncComplete.signalAll();
+
+            CLibrary.trySkipCache(fd, offset, nextMarker);
+            if (close)
+                close();
+        }
+        catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it
+        {
+            throw new FSWriteError(e, getPath());
+        }
+    }
+
+    public boolean isStillAllocating()
+    {
+        return allocatePosition.get() < buffer.capacity();
+    }
+
+    /**
      * Completely discards a segment file by deleting it. (Potentially blocking operation)
      */
-    public void discard(boolean deleteFile)
+    void delete()
     {
-        // TODO shouldn't we close the file when we're done writing to it, which comes (potentially) much earlier than it's eligible for recyling?
-        close();
-        // it's safe to simply try (and maybe fail) to delete the log file because we should only ever close()/discard() once
-        // the global ReplayPosition is past the current log file position, so we will never replay it; however to be on the
-        // safe side we attempt to rename/zero it if delete fails
-        if (deleteFile)
-        {
-            try
-            {
-                FileUtils.deleteWithConfirm(logFile);
-            }
-            catch (FSWriteError e)
-            {
-                // attempt to rename the file and zero its start, if possible, before throwing the error
-                File file = logFile;
-                try
-                {
-                    File newFile = new File(file.getPath() + ".discarded");
-                    FileUtils.renameWithConfirm(file, newFile);
-                    file = newFile;
-                }
-                catch (Throwable t)
-                {
-                }
-
-                try
-                {
-                    RandomAccessFile raf = new RandomAccessFile(file, "rw");
-                    ByteBuffer write = ByteBuffer.allocate(8);
-                    write.putInt(CommitLog.END_OF_SEGMENT_MARKER);
-                    write.position(0);
-                    raf.getChannel().write(write);
-                    raf.close();
-                    logger.error("{} {}, as we failed to delete it.", file == logFile ? "Zeroed" : "Renamed and zeroed", file);
-                }
-                catch (Throwable t)
-                {
-                    if (logFile == file)
-                    {
-                        logger.error("Could not rename or zero {}, which we also failed to delete. In the face of other issues this could result in unnecessary log replay.", t, file);
-                    }
-                    else
-                    {
-                        logger.error("Renamed {} to {}, as we failed to delete it, however we failed to zero its header.", t, logFile, file);
-                    }
-                }
-                throw e;
-            }
-
-        }
+       FileUtils.deleteWithConfirm(logFile);
     }
 
     /**
@@ -206,100 +348,29 @@
      *
      * @return a new CommitLogSegment representing the newly reusable segment.
      */
-    public CommitLogSegment recycle()
+    CommitLogSegment recycle()
     {
+        try
+        {
+            sync();
+        }
+        catch (FSWriteError e)
+        {
+            logger.error("I/O error flushing {} {}", this, e.getMessage());
+            throw e;
+        }
+
         close();
+
         return new CommitLogSegment(getPath());
     }
 
     /**
-     * @return true if there is room to write() @param size to this segment
-     */
-    public boolean hasCapacityFor(long size)
-    {
-        return size <= buffer.remaining();
-    }
-
-    /**
-     * mark all of the column families we're modifying as dirty at this position
-     */
-    private void markDirty(RowMutation rowMutation, ReplayPosition repPos)
-    {
-        for (ColumnFamily columnFamily : rowMutation.getColumnFamilies())
-        {
-            // check for null cfm in case a cl write goes through after the cf is
-            // defined but before a new segment is created.
-            CFMetaData cfm = Schema.instance.getCFMetaData(columnFamily.id());
-            if (cfm == null)
-            {
-                logger.error("Attempted to write commit log entry for unrecognized column family: " + columnFamily.id());
-            }
-            else
-            {
-                markCFDirty(cfm.cfId, repPos.position);
-            }
-        }
-    }
-
-   /**
-     * Appends a row mutation onto the commit log.  Requres that hasCapacityFor has already been checked.
-     *
-     * @param   mutation   the mutation to append to the commit log.
-     * @return  the position of the appended mutation
-     */
-    public ReplayPosition write(RowMutation mutation) throws IOException
-    {
-        assert !closed;
-        ReplayPosition repPos = getContext();
-        markDirty(mutation, repPos);
-
-        checksum.reset();
-
-        // checksummed length
-        int length = (int) RowMutation.serializer.serializedSize(mutation, MessagingService.current_version);
-        bufferStream.writeInt(length);
-        buffer.putLong(checksum.getValue());
-
-        // checksummed mutation
-        RowMutation.serializer.serialize(mutation, bufferStream, MessagingService.current_version);
-        buffer.putLong(checksum.getValue());
-
-        if (buffer.remaining() >= 4)
-        {
-            // writes end of segment marker and rewinds back to position where it starts
-            buffer.putInt(CommitLog.END_OF_SEGMENT_MARKER);
-            buffer.position(buffer.position() - CommitLog.END_OF_SEGMENT_MARKER_SIZE);
-        }
-
-        needsSync = true;
-        return repPos;
-    }
-
-    /**
-     * Forces a disk flush for this segment file.
-     */
-    public synchronized void sync()
-    {
-        if (needsSync)
-        {
-            try
-            {
-                buffer.force();
-            }
-            catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it
-            {
-                throw new FSWriteError(e, getPath());
-            }
-            needsSync = false;
-        }
-    }
-
-    /**
      * @return the current ReplayPosition for this log segment
      */
     public ReplayPosition getContext()
     {
-        return new ReplayPosition(id, buffer.position());
+        return new ReplayPosition(id, allocatePosition.get());
     }
 
     /**
@@ -318,20 +389,33 @@
         return logFile.getName();
     }
 
+    void waitForFinalSync()
+    {
+        while (true)
+        {
+            WaitQueue.Signal signal = syncComplete.register();
+            if (lastSyncedOffset < buffer.capacity())
+            {
+                signal.awaitUninterruptibly();
+            }
+            else
+            {
+                signal.cancel();
+                break;
+            }
+        }
+    }
+
     /**
      * Close the segment file.
      */
-    public synchronized void close()
+    void close()
     {
-        if (closed)
-            return;
-
-        needsSync = false;
         try
         {
-            FileUtils.clean(buffer);
+            if (FileUtils.isCleanerAvailable())
+                FileUtils.clean(buffer);
             logFileAccessor.close();
-            closed = true;
         }
         catch (IOException e)
         {
@@ -339,15 +423,17 @@
         }
     }
 
-    /**
-     * Records the CF as dirty at a certain position.
-     *
-     * @param cfId      the column family ID that is now dirty
-     * @param position  the position the last write for this CF was written at
-     */
-    private void markCFDirty(UUID cfId, Integer position)
+    void markDirty(Mutation mutation, int allocatedPosition)
     {
-        cfLastWrite.put(cfId, position);
+        for (ColumnFamily columnFamily : mutation.getColumnFamilies())
+        {
+            // check for deleted CFs
+            CFMetaData cfm = columnFamily.metadata();
+            if (cfm.isPurged())
+                logger.error("Attempted to write commit log entry for unrecognized column family: {}", columnFamily.id());
+            else
+                ensureAtleast(cfDirty, cfm.cfId, allocatedPosition);
+        }
     }
 
     /**
@@ -360,11 +446,57 @@
      */
     public synchronized void markClean(UUID cfId, ReplayPosition context)
     {
-        Integer lastWritten = cfLastWrite.get(cfId);
+        if (!cfDirty.containsKey(cfId))
+            return;
+        if (context.segment == id)
+            markClean(cfId, context.position);
+        else if (context.segment > id)
+            markClean(cfId, Integer.MAX_VALUE);
+    }
 
-        if (lastWritten != null && (!contains(context) || lastWritten < context.position))
+    private void markClean(UUID cfId, int position)
+    {
+        ensureAtleast(cfClean, cfId, position);
+        removeCleanFromDirty();
+    }
+
+    private static void ensureAtleast(ConcurrentMap<UUID, AtomicInteger> map, UUID cfId, int value)
+    {
+        AtomicInteger i = map.get(cfId);
+        if (i == null)
         {
-            cfLastWrite.remove(cfId);
+            AtomicInteger i2 = map.putIfAbsent(cfId, i = new AtomicInteger());
+            if (i2 != null)
+                i = i2;
+        }
+        while (true)
+        {
+            int cur = i.get();
+            if (cur > value)
+                break;
+            if (i.compareAndSet(cur, value))
+                break;
+        }
+    }
+
+    private void removeCleanFromDirty()
+    {
+        // if we're still allocating from this segment, don't touch anything since it can't be done thread-safely
+        if (isStillAllocating())
+            return;
+
+        Iterator<Map.Entry<UUID, AtomicInteger>> iter = cfClean.entrySet().iterator();
+        while (iter.hasNext())
+        {
+            Map.Entry<UUID, AtomicInteger> clean = iter.next();
+            UUID cfId = clean.getKey();
+            AtomicInteger cleanPos = clean.getValue();
+            AtomicInteger dirtyPos = cfDirty.get(cfId);
+            if (dirtyPos != null && dirtyPos.intValue() <= cleanPos.intValue())
+            {
+                cfDirty.remove(cfId);
+                iter.remove();
+            }
         }
     }
 
@@ -373,7 +505,19 @@
      */
     public synchronized Collection<UUID> getDirtyCFIDs()
     {
-        return new ArrayList<>(cfLastWrite.keySet());
+        if (cfClean.isEmpty() || cfDirty.isEmpty())
+            return cfDirty.keySet();
+
+        List<UUID> r = new ArrayList<>(cfDirty.size());
+        for (Map.Entry<UUID, AtomicInteger> dirty : cfDirty.entrySet())
+        {
+            UUID cfId = dirty.getKey();
+            AtomicInteger dirtyPos = dirty.getValue();
+            AtomicInteger cleanPos = cfClean.get(cfId);
+            if (cleanPos == null || cleanPos.intValue() < dirtyPos.intValue())
+                r.add(dirty.getKey());
+        }
+        return r;
     }
 
     /**
@@ -381,7 +525,13 @@
      */
     public synchronized boolean isUnused()
     {
-        return cfLastWrite.isEmpty();
+        // if room to allocate, we're still in use as the active allocatingFrom,
+        // so we don't want to race with updates to cfClean with removeCleanFromDirty
+        if (isStillAllocating())
+            return false;
+
+        removeCleanFromDirty();
+        return cfDirty.isEmpty();
     }
 
     /**
@@ -413,11 +563,6 @@
         return "CommitLogSegment(" + getPath() + ')';
     }
 
-    public int position()
-    {
-        return buffer.position();
-    }
-
     public static class CommitLogSegmentFileComparator implements Comparator<File>
     {
         public int compare(File f, File f2)
@@ -427,4 +572,61 @@
             return Long.compare(desc.id, desc2.id);
         }
     }
+
+    /**
+     * A simple class for tracking information about the portion of a segment that has been allocated to a log write.
+     * The constructor leaves the fields uninitialized for population by CommitlogManager, so that it can be
+     * stack-allocated by escape analysis in CommitLog.add.
+     */
+    static class Allocation
+    {
+
+        private final CommitLogSegment segment;
+        private final OpOrder.Group appendOp;
+        private final int position;
+        private final ByteBuffer buffer;
+
+        Allocation(CommitLogSegment segment, OpOrder.Group appendOp, int position, ByteBuffer buffer)
+        {
+            this.segment = segment;
+            this.appendOp = appendOp;
+            this.position = position;
+            this.buffer = buffer;
+        }
+
+        CommitLogSegment getSegment()
+        {
+            return segment;
+        }
+
+        ByteBuffer getBuffer()
+        {
+            return buffer;
+        }
+
+        // markWritten() MUST be called once we are done with the segment or the CL will never flush
+        // but must not be called more than once
+        void markWritten()
+        {
+            appendOp.close();
+        }
+
+        void awaitDiskSync()
+        {
+            while (segment.lastSyncedOffset < position)
+            {
+                WaitQueue.Signal signal = segment.syncComplete.register(CommitLog.instance.metrics.waitingOnCommit.time());
+                if (segment.lastSyncedOffset < position)
+                    signal.awaitUninterruptibly();
+                else
+                    signal.cancel();
+            }
+        }
+
+        public ReplayPosition getReplayPosition()
+        {
+            return new ReplayPosition(segment.id, buffer.limit());
+        }
+
+    }
 }
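
The awaitDiskSync() loop above relies on a register-then-recheck idiom against the segment's WaitQueue: the condition is tested again after registering the signal so a sync that completes in between cannot be missed. The following is a minimal, self-contained sketch of the same blocking contract using only standard java.util.concurrent primitives (a lock/Condition stand-in, not Cassandra's WaitQueue; class and field names are illustrative only):

import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

// Hypothetical stand-in for segment.lastSyncedOffset / syncComplete; not part of Cassandra.
final class SyncTracker
{
    private final ReentrantLock lock = new ReentrantLock();
    private final Condition syncComplete = lock.newCondition();
    private long lastSyncedOffset = 0;

    // Block until everything up to 'position' has been synced, like Allocation.awaitDiskSync().
    void awaitSyncedTo(long position) throws InterruptedException
    {
        lock.lock();
        try
        {
            while (lastSyncedOffset < position)   // re-check guards against spurious wake-ups
                syncComplete.await();
        }
        finally
        {
            lock.unlock();
        }
    }

    // Called by the sync thread after an fsync covering bytes up to 'offset'.
    void markSynced(long offset)
    {
        lock.lock();
        try
        {
            if (offset > lastSyncedOffset)
                lastSyncedOffset = offset;
            syncComplete.signalAll();
        }
        finally
        {
            lock.unlock();
        }
    }
}
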
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java
new file mode 100644
index 0000000..0771b7a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java
@@ -0,0 +1,558 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.commitlog;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.Future;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.WrappedRunnable;
+
+import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
+
+/**
+ * Performs eager-creation of commit log segments in a background thread. All the
+ * public methods are thread safe.
+ */
+public class CommitLogSegmentManager
+{
+    static final Logger logger = LoggerFactory.getLogger(CommitLogSegmentManager.class);
+
+    /**
+     * Queue of work to be done by the manager thread.  This is usually a recycle operation, which returns
+     * a CommitLogSegment, or a delete operation, which returns null.
+     */
+    private final BlockingQueue<Callable<CommitLogSegment>> segmentManagementTasks = new LinkedBlockingQueue<>();
+
+    /** Segments that are ready to be used. Head of the queue is the one we allocate writes to */
+    private final ConcurrentLinkedQueue<CommitLogSegment> availableSegments = new ConcurrentLinkedQueue<>();
+
+    /** Active segments, containing unflushed data */
+    private final ConcurrentLinkedQueue<CommitLogSegment> activeSegments = new ConcurrentLinkedQueue<>();
+
+    /** The segment we are currently allocating commit log records to */
+    private volatile CommitLogSegment allocatingFrom = null;
+
+    private final WaitQueue hasAvailableSegments = new WaitQueue();
+
+    /**
+     * Tracks commitlog size, in multiples of the segment size.  We need to do this so we can "promise" size
+     * adjustments ahead of actually adding/freeing segments on disk, so that the "evict oldest segment" logic
+     * can see the effect of recycling segments immediately (even though they're really happening asynchronously
+     * on the manager thread, which will take a ms or two).
+     */
+    private final AtomicLong size = new AtomicLong();
+
+    /**
+     * New segment creation is initially disabled because we'll typically get some "free" segments
+     * recycled after log replay.
+     */
+    private volatile boolean createReserveSegments = false;
+
+    private final Thread managerThread;
+    private volatile boolean run = true;
+
+    public CommitLogSegmentManager()
+    {
+        // The run loop for the manager thread
+        Runnable runnable = new WrappedRunnable()
+        {
+            public void runMayThrow() throws Exception
+            {
+                while (run)
+                {
+                    try
+                    {
+                        Callable<CommitLogSegment> task = segmentManagementTasks.poll();
+                        if (task == null)
+                        {
+                            // if we have no more work to do, check if we should create a new segment
+                            if (availableSegments.isEmpty() && (activeSegments.isEmpty() || createReserveSegments))
+                            {
+                                logger.debug("No segments in reserve; creating a fresh one");
+                                size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize());
+                                // TODO : some error handling in case we fail to create a new segment
+                                availableSegments.add(CommitLogSegment.freshSegment());
+                                hasAvailableSegments.signalAll();
+                            }
+
+                            // flush old Cfs if we're full
+                            long unused = unusedCapacity();
+                            if (unused < 0)
+                            {
+                                List<CommitLogSegment> segmentsToRecycle = new ArrayList<>();
+                                long spaceToReclaim = 0;
+                                for (CommitLogSegment segment : activeSegments)
+                                {
+                                    if (segment == allocatingFrom)
+                                        break;
+                                    segmentsToRecycle.add(segment);
+                                    spaceToReclaim += DatabaseDescriptor.getCommitLogSegmentSize();
+                                    if (spaceToReclaim + unused >= 0)
+                                        break;
+                                }
+                                flushDataFrom(segmentsToRecycle, false);
+                            }
+
+                            try
+                            {
+                                // wait for new work to be provided
+                                task = segmentManagementTasks.take();
+                            }
+                            catch (InterruptedException e)
+                            {
+                                // shutdown signal; exit cleanly
+                                continue;
+                            }
+                        }
+
+                        CommitLogSegment recycled = task.call();
+                        if (recycled != null)
+                        {
+                            // if the work resulted in a segment to recycle, publish it
+                            availableSegments.add(recycled);
+                            hasAvailableSegments.signalAll();
+                        }
+                    }
+                    catch (Throwable t)
+                    {
+                        JVMStabilityInspector.inspectThrowable(t);
+                        if (!CommitLog.handleCommitError("Failed managing commit log segments", t))
+                            return;
+                        // sleep some arbitrary period to avoid spamming CL
+                        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
+                    }
+                }
+            }
+        };
+
+        managerThread = new Thread(runnable, "COMMIT-LOG-ALLOCATOR");
+        managerThread.start();
+    }
+
+    /**
+     * Reserve space in the current segment for the provided mutation or, if there isn't space available,
+     * create a new segment.
+     *
+     * @return an Allocation describing the segment and buffer slice reserved for the mutation
+     */
+    public Allocation allocate(Mutation mutation, int size)
+    {
+        CommitLogSegment segment = allocatingFrom();
+
+        Allocation alloc;
+        while ( null == (alloc = segment.allocate(mutation, size)) )
+        {
+            // failed to allocate, so move to a new segment with enough room
+            advanceAllocatingFrom(segment);
+            segment = allocatingFrom;
+        }
+
+        return alloc;
+    }
+
+    // simple wrapper to ensure non-null value for allocatingFrom; only necessary on first call
+    CommitLogSegment allocatingFrom()
+    {
+        CommitLogSegment r = allocatingFrom;
+        if (r == null)
+        {
+            advanceAllocatingFrom(null);
+            r = allocatingFrom;
+        }
+        return r;
+    }
+
+    /**
+     * Fetches a new segment from the queue, creating a new one if necessary, and activates it
+     */
+    private void advanceAllocatingFrom(CommitLogSegment old)
+    {
+        while (true)
+        {
+            CommitLogSegment next;
+            synchronized (this)
+            {
+                // do this in a critical section so we can atomically remove from availableSegments and add to allocatingFrom/activeSegments
+                // see https://issues.apache.org/jira/browse/CASSANDRA-6557?focusedCommentId=13874432&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13874432
+                if (allocatingFrom != old)
+                    return;
+                next = availableSegments.poll();
+                if (next != null)
+                {
+                    allocatingFrom = next;
+                    activeSegments.add(next);
+                }
+            }
+
+            if (next != null)
+            {
+                if (old != null)
+                {
+                    // Now we can run the user defined command just after switching to the new commit log.
+                    // (Do this here instead of in the recycle call so we can get a head start on the archive.)
+                    CommitLog.instance.archiver.maybeArchive(old);
+
+                    // ensure we don't continue to use the old file; not strictly necessary, but cleaner to enforce it
+                    old.discardUnusedTail();
+                }
+
+                // request that the CL be synced out-of-band, as we've finished a segment
+                CommitLog.instance.requestExtraSync();
+                return;
+            }
+
+            // no more segments, so register to receive a signal when not empty
+            WaitQueue.Signal signal = hasAvailableSegments.register(CommitLog.instance.metrics.waitingOnSegmentAllocation.time());
+
+            // trigger the management thread; this must occur after registering
+            // the signal to ensure we are woken by any new segment creation
+            wakeManager();
+
+            // check if the queue has already been added to before waiting on the signal, to catch modifications
+            // that happened prior to registering the signal; *then* check to see if we've been beaten to making the change
+            if (!availableSegments.isEmpty() || allocatingFrom != old)
+            {
+                signal.cancel();
+                // if we've been beaten, just stop immediately
+                if (allocatingFrom != old)
+                    return;
+                // otherwise try again, as there should be an available segment
+                continue;
+            }
+
+            // can only reach here if the queue hasn't been inserted into
+            // before we registered the signal, as we only remove items from the queue
+            // after updating allocatingFrom. Can safely block until we are signalled
+            // by the allocator that new segments have been published
+            signal.awaitUninterruptibly();
+        }
+    }
+
+    private void wakeManager()
+    {
+        // put a NO-OP on the queue, to trigger management thread (and create a new segment if necessary)
+        segmentManagementTasks.add(new Callable<CommitLogSegment>()
+        {
+            public CommitLogSegment call()
+            {
+                return null;
+            }
+        });
+    }
+
+    /**
+     * Switch to a new segment, regardless of how much is left in the current one.
+     *
+     * Flushes any dirty CFs for this segment and any older segments, and then recycles
+     * the segments
+     */
+    void forceRecycleAll(Iterable<UUID> droppedCfs)
+    {
+        List<CommitLogSegment> segmentsToRecycle = new ArrayList<>(activeSegments);
+        CommitLogSegment last = segmentsToRecycle.get(segmentsToRecycle.size() - 1);
+        advanceAllocatingFrom(last);
+
+        // wait for the commit log modifications
+        last.waitForModifications();
+
+        // make sure the writes have materialized inside of the memtables by waiting for all outstanding writes
+        // on the relevant keyspaces to complete
+        Set<Keyspace> keyspaces = new HashSet<>();
+        for (UUID cfId : last.getDirtyCFIDs())
+        {
+            ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(cfId);
+            if (cfs != null)
+                keyspaces.add(cfs.keyspace);
+        }
+        for (Keyspace keyspace : keyspaces)
+            keyspace.writeOrder.awaitNewBarrier();
+
+        // flush and wait for all CFs that are dirty in segments up-to and including 'last'
+        Future<?> future = flushDataFrom(segmentsToRecycle, true);
+        try
+        {
+            future.get();
+
+            for (CommitLogSegment segment : activeSegments)
+                for (UUID cfId : droppedCfs)
+                    segment.markClean(cfId, segment.getContext());
+
+            // now recycle segments that are unused, as we may not have triggered a discardCompletedSegments()
+            // if the previous active segment was the only one to recycle (since an active segment isn't
+            // necessarily dirty, and we only call dCS after a flush).
+            for (CommitLogSegment segment : activeSegments)
+                if (segment.isUnused())
+                    recycleSegment(segment);
+
+            CommitLogSegment first;
+            if ((first = activeSegments.peek()) != null && first.id <= last.id)
+                logger.error("Failed to force-recycle all segments; at least one segment is still in use with dirty CFs.");
+        }
+        catch (Throwable t)
+        {
+            // for now, just log the error; the forced recycle simply did not complete
+            logger.error("Failed waiting for a forced recycle of in-use commit log segments", t);
+        }
+    }
+
+    /**
+     * Indicates that a segment is no longer in use and that it should be recycled.
+     *
+     * @param segment segment that is no longer in use
+     */
+    void recycleSegment(final CommitLogSegment segment)
+    {
+        boolean archiveSuccess = CommitLog.instance.archiver.maybeWaitForArchiving(segment.getName());
+        activeSegments.remove(segment);
+        if (!archiveSuccess)
+        {
+            // if archiving (command) was not successful then leave the file alone. don't delete or recycle.
+            discardSegment(segment, false);
+            return;
+        }
+        if (isCapExceeded())
+        {
+            discardSegment(segment, true);
+            return;
+        }
+
+        logger.debug("Recycling {}", segment);
+        segmentManagementTasks.add(new Callable<CommitLogSegment>()
+        {
+            public CommitLogSegment call()
+            {
+                return segment.recycle();
+            }
+        });
+    }
+
+    /**
+     * Differs from the above because it can work on any file instead of just existing
+     * commit log segments managed by this manager.
+     *
+     * @param file segment file that is no longer in use.
+     */
+    void recycleSegment(final File file)
+    {
+        if (isCapExceeded()
+            || CommitLogDescriptor.fromFileName(file.getName()).getMessagingVersion() != MessagingService.current_version)
+        {
+            // (don't decrease managed size, since this was never a "live" segment)
+            logger.debug("(Unopened) segment {} is no longer needed and will be deleted now", file);
+            FileUtils.deleteWithConfirm(file);
+            return;
+        }
+
+        logger.debug("Recycling {}", file);
+        // this wasn't previously a live segment, so add it to the managed size when we make it live
+        size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize());
+        segmentManagementTasks.add(new Callable<CommitLogSegment>()
+        {
+            public CommitLogSegment call()
+            {
+                return new CommitLogSegment(file.getPath());
+            }
+        });
+    }
+
+    /**
+     * Indicates that a segment file should be deleted.
+     *
+     * @param segment segment to be discarded
+     */
+    private void discardSegment(final CommitLogSegment segment, final boolean deleteFile)
+    {
+        logger.debug("Segment {} is no longer active and will be deleted {}", segment, deleteFile ? "now" : "by the archive script");
+        size.addAndGet(-DatabaseDescriptor.getCommitLogSegmentSize());
+
+        segmentManagementTasks.add(new Callable<CommitLogSegment>()
+        {
+            public CommitLogSegment call()
+            {
+                segment.close();
+                if (deleteFile)
+                    segment.delete();
+                return null;
+            }
+        });
+    }
+
+    /**
+     * @return the space (in bytes) used by all segment files.
+     */
+    public long bytesUsed()
+    {
+        return size.get();
+    }
+
+    /**
+     * @param name the filename to check
+     * @return true if file is managed by this manager.
+     */
+    public boolean manages(String name)
+    {
+        for (CommitLogSegment segment : Iterables.concat(activeSegments, availableSegments))
+            if (segment.getName().equals(name))
+                return true;
+        return false;
+    }
+
+    /**
+     * Check to see if the speculative current size exceeds the cap.
+     *
+     * @return true if cap is exceeded
+     */
+    private boolean isCapExceeded()
+    {
+        return unusedCapacity() < 0;
+    }
+
+    private long unusedCapacity()
+    {
+        long currentSize = size.get();
+        logger.debug("Total active commitlog segment space used is {}", currentSize);
+        return DatabaseDescriptor.getTotalCommitlogSpaceInMB() * 1024 * 1024 - currentSize;
+    }
+
+    /**
+     * Sets a flag that enables keeping at least one spare segment
+     * available at all times.
+     */
+    public void enableReserveSegmentCreation()
+    {
+        createReserveSegments = true;
+        wakeManager();
+    }
+
+    /**
+     * Force a flush on all CFs that are still dirty in @param segments.
+     *
+     * @return a Future that will finish when all the flushes are complete.
+     */
+    private Future<?> flushDataFrom(List<CommitLogSegment> segments, boolean force)
+    {
+        if (segments.isEmpty())
+            return Futures.immediateFuture(null);
+        final ReplayPosition maxReplayPosition = segments.get(segments.size() - 1).getContext();
+
+        // a map of CfId -> forceFlush() to ensure we only queue one flush per cf
+        final Map<UUID, ListenableFuture<?>> flushes = new LinkedHashMap<>();
+
+        for (CommitLogSegment segment : segments)
+        {
+            for (UUID dirtyCFId : segment.getDirtyCFIDs())
+            {
+                Pair<String,String> pair = Schema.instance.getCF(dirtyCFId);
+                if (pair == null)
+                {
+                    // even though we remove the schema entry before a final flush when dropping a CF,
+                    // it's still possible for a writer to race and finish its append after the flush.
+                    logger.debug("Marking clean CF {} that doesn't exist anymore", dirtyCFId);
+                    segment.markClean(dirtyCFId, segment.getContext());
+                }
+                else if (!flushes.containsKey(dirtyCFId))
+                {
+                    String keyspace = pair.left;
+                    final ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(dirtyCFId);
+                    // can safely call forceFlush here as we will only ever block (briefly) for other attempts to flush,
+                    // no deadlock possibility since switchLock removal
+                    flushes.put(dirtyCFId, force ? cfs.forceFlush() : cfs.forceFlush(maxReplayPosition));
+                }
+            }
+        }
+
+        return Futures.allAsList(flushes.values());
+    }
+
+    /**
+     * Resets all the segments, for testing purposes. DO NOT USE THIS OUTSIDE OF TESTS.
+     */
+    public void resetUnsafe()
+    {
+        logger.debug("Closing and clearing existing commit log segments...");
+
+        while (!segmentManagementTasks.isEmpty())
+            Thread.yield();
+
+        for (CommitLogSegment segment : activeSegments)
+            segment.close();
+        activeSegments.clear();
+
+        for (CommitLogSegment segment : availableSegments)
+            segment.close();
+        availableSegments.clear();
+
+        allocatingFrom = null;
+    }
+
+    /**
+     * Initiates the shutdown process for the management thread.
+     */
+    public void shutdown()
+    {
+        run = false;
+        managerThread.interrupt();
+    }
+
+    /**
+     * Returns when the management thread terminates.
+     */
+    public void awaitTermination() throws InterruptedException
+    {
+        managerThread.join();
+    }
+
+    /**
+     * @return a read-only collection of the active commit log segments
+     */
+    Collection<CommitLogSegment> getActiveSegments()
+    {
+        return Collections.unmodifiableCollection(activeSegments);
+    }
+}
+
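
The manager above multiplexes all segment lifecycle work onto a single thread fed by a BlockingQueue of Callables; wakeManager() enqueues a no-op task purely to unblock take() so the run loop re-evaluates whether a fresh segment is needed. A minimal sketch of that pattern, with hypothetical names (this is not the Cassandra class itself, and the reserve check is simplified):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Illustrative sketch of the "single manager thread + wake-up task" pattern.
final class SegmentManagerSketch
{
    private final BlockingQueue<Callable<String>> tasks = new LinkedBlockingQueue<>();
    private final ConcurrentLinkedQueue<String> available = new ConcurrentLinkedQueue<>();
    private volatile boolean run = true;
    private final Thread managerThread = new Thread(this::runLoop, "manager");

    void start()
    {
        managerThread.start();
    }

    private void runLoop()
    {
        while (run)
        {
            try
            {
                Callable<String> task = tasks.take();   // blocks until real work or a wake-up arrives
                if (available.isEmpty())
                    available.add("segment-" + System.nanoTime());  // stands in for creating a fresh segment
                String recycled = task.call();          // a recycle task returns a segment, a no-op returns null
                if (recycled != null)
                    available.add(recycled);
            }
            catch (InterruptedException e)
            {
                // shutdown signal: loop re-checks 'run'
            }
            catch (Exception e)
            {
                e.printStackTrace();                    // real code would inspect the error and maybe stop
            }
        }
    }

    // Equivalent of wakeManager(): enqueue a no-op so the loop wakes and re-checks its reserve.
    void wake()
    {
        tasks.add(() -> null);
    }

    void shutdown() throws InterruptedException
    {
        run = false;
        managerThread.interrupt();
        managerThread.join();
    }
}
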
diff --git a/src/java/org/apache/cassandra/db/commitlog/ICommitLogExecutorService.java b/src/java/org/apache/cassandra/db/commitlog/ICommitLogExecutorService.java
deleted file mode 100644
index e2d0b0f..0000000
--- a/src/java/org/apache/cassandra/db/commitlog/ICommitLogExecutorService.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.commitlog;
-
-import java.util.concurrent.Callable;
-import java.util.concurrent.Future;
-
-/**
- * Like ExecutorService, but customized for batch and periodic commitlog execution.
- */
-public interface ICommitLogExecutorService
-{
-    /**
-     * Get the number of completed tasks
-     */
-    public long getCompletedTasks();
-
-    /**
-     * Get the number of tasks waiting to be executed
-     */
-    public long getPendingTasks();
-
-
-    public <T> Future<T> submit(Callable<T> task);
-
-    /**
-     * submits the adder for execution and blocks for it to be synced, if necessary
-     */
-    public void add(CommitLog.LogRecordAdder adder);
-
-    /** shuts down the CommitLogExecutor in an orderly fashion */
-    public void shutdown();
-
-    /** Blocks until shutdown is complete. */
-    public void awaitTermination() throws InterruptedException;
-}
diff --git a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogExecutorService.java b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogExecutorService.java
deleted file mode 100644
index 00507c2..0000000
--- a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogExecutorService.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.commitlog;
-
-import java.io.IOException;
-import java.util.concurrent.*;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.WrappedRunnable;
-
-import com.google.common.util.concurrent.Uninterruptibles;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-class PeriodicCommitLogExecutorService implements ICommitLogExecutorService
-{
-
-    private final BlockingQueue<Runnable> queue;
-    protected volatile long completedTaskCount = 0;
-    private final Thread appendingThread;
-    private volatile boolean run = true;
-
-    public PeriodicCommitLogExecutorService(final CommitLog commitLog)
-    {
-        queue = new LinkedBlockingQueue<Runnable>(DatabaseDescriptor.getCommitLogPeriodicQueueSize());
-        Runnable runnable = new WrappedRunnable()
-        {
-            public void runMayThrow() throws Exception
-            {
-                while (run)
-                {
-                    Runnable r = queue.poll(100, TimeUnit.MILLISECONDS);
-                    if (r == null)
-                        continue;
-                    r.run();
-                    completedTaskCount++;
-                }
-                commitLog.sync();
-            }
-        };
-        appendingThread = new Thread(runnable, "COMMIT-LOG-WRITER");
-        appendingThread.start();
-
-        final Callable syncer = new Callable()
-        {
-            public Object call() throws Exception
-            {
-                commitLog.sync();
-                return null;
-            }
-        };
-
-        new Thread(new Runnable()
-        {
-            public void run()
-            {
-                while (run)
-                {
-                    try
-                    {
-                        FBUtilities.waitOnFuture(submit(syncer));
-                        Uninterruptibles.sleepUninterruptibly(DatabaseDescriptor.getCommitLogSyncPeriod(), TimeUnit.MILLISECONDS);
-                    }
-                    catch (Throwable t)
-                    {
-                        if (!CommitLog.handleCommitError("Failed to persist commits to disk", t))
-                        {
-                            PeriodicCommitLogExecutorService.this.run = false;
-                            try
-                            {
-                                appendingThread.join();
-                            }
-                            catch (InterruptedException e)
-                            {
-                                throw new IllegalStateException();
-                            }
-                            return;
-                        }
-                    }
-                }
-            }
-        }, "PERIODIC-COMMIT-LOG-SYNCER").start();
-
-    }
-
-    public void add(CommitLog.LogRecordAdder adder)
-    {
-        try
-        {
-            queue.put(adder);
-        }
-        catch (InterruptedException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public <T> Future<T> submit(Callable<T> task)
-    {
-        FutureTask<T> ft = new FutureTask<T>(task);
-        try
-        {
-            queue.put(ft);
-        }
-        catch (InterruptedException e)
-        {
-            throw new RuntimeException(e);
-        }
-        return ft;
-    }
-
-    public void shutdown()
-    {
-        new Thread(new WrappedRunnable()
-        {
-            public void runMayThrow() throws InterruptedException, IOException
-            {
-                while (!queue.isEmpty())
-                    Thread.sleep(100);
-                run = false;
-                appendingThread.join();
-            }
-        }, "Commitlog Shutdown").start();
-    }
-
-    public void awaitTermination() throws InterruptedException
-    {
-        appendingThread.join();
-    }
-
-    public long getPendingTasks()
-    {
-        return queue.size();
-    }
-
-    public long getCompletedTasks()
-    {
-        return completedTaskCount;
-    }
-
-}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
new file mode 100644
index 0000000..14bb367
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.commitlog;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+
+class PeriodicCommitLogService extends AbstractCommitLogService
+{
+
+    private static final int blockWhenSyncLagsMillis = (int) (DatabaseDescriptor.getCommitLogSyncPeriod() * 1.5);
+
+    public PeriodicCommitLogService(final CommitLog commitLog)
+    {
+        super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod());
+    }
+
+    protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
+    {
+        if (waitForSyncToCatchUp(Long.MAX_VALUE))
+        {
+            // wait until periodic sync() catches up with its schedule
+            long started = System.currentTimeMillis();
+            pending.incrementAndGet();
+            while (waitForSyncToCatchUp(started))
+            {
+                WaitQueue.Signal signal = syncComplete.register(CommitLog.instance.metrics.waitingOnCommit.time());
+                if (waitForSyncToCatchUp(started))
+                    signal.awaitUninterruptibly();
+                else
+                    signal.cancel();
+            }
+            pending.decrementAndGet();
+        }
+    }
+
+    /**
+     * @return true if sync is currently lagging behind inserts
+     */
+    private boolean waitForSyncToCatchUp(long started)
+    {
+        return started > lastSyncedAt + blockWhenSyncLagsMillis;
+    }
+}
\ No newline at end of file
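
PeriodicCommitLogService only blocks a write when the last successful sync is more than 1.5x the configured sync period in the past. A tiny sketch of that lag check (field and method names are illustrative, not the actual service):

// Illustrative only: mirrors the blockWhenSyncLagsMillis check, not the real service.
final class SyncLagPolicy
{
    private final long blockWhenSyncLagsMillis;
    private volatile long lastSyncedAt;

    SyncLagPolicy(long syncPeriodMillis)
    {
        this.blockWhenSyncLagsMillis = (long) (syncPeriodMillis * 1.5);
        this.lastSyncedAt = System.currentTimeMillis();
    }

    // True if a write that started at 'startedMillis' should wait for sync to catch up.
    boolean mustWaitForSync(long startedMillis)
    {
        return startedMillis > lastSyncedAt + blockWhenSyncLagsMillis;
    }

    // Called by the periodic syncer after each successful sync.
    void onSyncCompleted(long syncedAtMillis)
    {
        lastSyncedAt = syncedAtMillis;
    }
}
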
diff --git a/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java b/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java
index 354444b..31fc28e 100644
--- a/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java
+++ b/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.db.commitlog;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Comparator;
 
@@ -29,6 +28,7 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class ReplayPosition implements Comparable<ReplayPosition>
 {
@@ -117,9 +117,14 @@
                ')';
     }
 
+    public ReplayPosition clone()
+    {
+        return new ReplayPosition(segment, position);
+    }
+
     public static class ReplayPositionSerializer implements ISerializer<ReplayPosition>
     {
-        public void serialize(ReplayPosition rp, DataOutput out) throws IOException
+        public void serialize(ReplayPosition rp, DataOutputPlus out) throws IOException
         {
             out.writeLong(rp.segment);
             out.writeInt(rp.position);
@@ -130,9 +135,9 @@
             return new ReplayPosition(in.readLong(), in.readInt());
         }
 
-        public long serializedSize(ReplayPosition object, TypeSizes typeSizes)
+        public long serializedSize(ReplayPosition rp, TypeSizes typeSizes)
         {
-            throw new UnsupportedOperationException();
+            return typeSizes.sizeof(rp.segment) + typeSizes.sizeof(rp.position);
         }
     }
 }
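
The serializer change above writes the segment id as a long and the offset as an int, so serializedSize can report a fixed 8 + 4 = 12 bytes instead of throwing. A standalone round-trip sketch with plain java.io streams (standing in for DataOutputPlus/TypeSizes, which are Cassandra-internal):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Illustrative round-trip of the (segment, position) pair; not the Cassandra serializer itself.
final class ReplayPositionRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        long segment = 42L;
        int position = 1337;

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(bytes))
        {
            out.writeLong(segment);   // 8 bytes
            out.writeInt(position);   // 4 bytes
        }
        System.out.println("serialized size = " + bytes.size());   // 12

        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())))
        {
            System.out.println("segment = " + in.readLong() + ", position = " + in.readInt());
        }
    }
}
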
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java
index 734155e..c49bee5 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.io.sstable.ColumnStats;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
  * a CompactedRow is an object that takes a bunch of rows (keys + columnfamilies)
@@ -47,7 +48,7 @@
      *
      * @return index information for the written row, or null if the compaction resulted in only expired tombstones.
      */
-    public abstract RowIndexEntry write(long currentPosition, DataOutput out) throws IOException;
+    public abstract RowIndexEntry write(long currentPosition, DataOutputPlus out) throws IOException;
 
     /**
      * update @param digest with the data bytes of the row (not including row key or row size).
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index dc7e43a..97696a8 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
@@ -19,6 +19,7 @@
 
 import java.util.*;
 
+import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.base.Predicate;
 import com.google.common.collect.ImmutableMap;
@@ -159,7 +160,7 @@
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
-    public abstract AbstractCompactionTask getMaximalTask(final int gcBefore);
+    public abstract Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore);
 
     /**
      * @param sstables SSTables to compact. Must be marked as compacting.
@@ -174,7 +175,7 @@
 
     public AbstractCompactionTask getCompactionTask(Collection<SSTableReader> sstables, final int gcBefore, long maxSSTableBytes)
     {
-        return new CompactionTask(cfs, sstables, gcBefore);
+        return new CompactionTask(cfs, sstables, gcBefore, false);
     }
 
     /**
@@ -264,16 +265,61 @@
      * allow for a more memory efficient solution if we know the sstable don't overlap (see
      * LeveledCompactionStrategy for instance).
      */
-    public List<ICompactionScanner> getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
     {
         RateLimiter limiter = CompactionManager.instance.getRateLimiter();
         ArrayList<ICompactionScanner> scanners = new ArrayList<ICompactionScanner>();
-        for (SSTableReader sstable : sstables)
-            scanners.add(sstable.getScanner(range, limiter));
-        return scanners;
+        try
+        {
+            for (SSTableReader sstable : sstables)
+                scanners.add(sstable.getScanner(range, limiter));
+        }
+        catch (Throwable t)
+        {
+            try
+            {
+                new ScannerList(scanners).close();
+            }
+            catch (Throwable t2)
+            {
+                t.addSuppressed(t2);
+            }
+            throw t;
+        }
+        return new ScannerList(scanners);
     }
 
-    public List<ICompactionScanner> getScanners(Collection<SSTableReader> toCompact)
+    public static class ScannerList implements AutoCloseable
+    {
+        public final List<ICompactionScanner> scanners;
+        public ScannerList(List<ICompactionScanner> scanners)
+        {
+            this.scanners = scanners;
+        }
+
+        public void close()
+        {
+            Throwable t = null;
+            for (ICompactionScanner scanner : scanners)
+            {
+                try
+                {
+                    scanner.close();
+                }
+                catch (Throwable t2)
+                {
+                    if (t == null)
+                        t = t2;
+                    else
+                        t.addSuppressed(t2);
+                }
+            }
+            if (t != null)
+                throw Throwables.propagate(t);
+        }
+    }
+
+    public ScannerList getScanners(Collection<SSTableReader> toCompact)
     {
         return getScanners(toCompact, null);
     }
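
ScannerList, introduced above, closes every scanner even when one close() fails, keeping the first failure as the primary exception and attaching later ones as suppressed. The same close-everything pattern in a generic, self-contained form (assumed names; Cassandra wraps the result with Guava's Throwables.propagate, where this sketch rethrows directly):

import java.util.List;

// Generic sketch of the ScannerList close pattern; names here are illustrative.
final class CloseAll implements AutoCloseable
{
    private final List<AutoCloseable> resources;

    CloseAll(List<AutoCloseable> resources)
    {
        this.resources = resources;
    }

    public void close()
    {
        RuntimeException first = null;
        for (AutoCloseable r : resources)
        {
            try
            {
                r.close();
            }
            catch (Exception e)
            {
                if (first == null)
                    first = new RuntimeException("error closing resources", e);
                else
                    first.addSuppressed(e);   // later failures ride along as suppressed
            }
        }
        if (first != null)
            throw first;                      // first failure wins
    }
}
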
@@ -315,7 +361,7 @@
         else
         {
             // what percentage of columns do we expect to compact outside of overlap?
-            if (sstable.getKeySampleSize() < 2)
+            if (sstable.getIndexSummarySize() < 2)
             {
                 // we have too few samples to estimate correct percentage
                 return false;
@@ -324,7 +370,7 @@
             long keys = sstable.estimatedKeys();
             Set<Range<Token>> ranges = new HashSet<Range<Token>>(overlaps.size());
             for (SSTableReader overlap : overlaps)
-                ranges.add(new Range<Token>(overlap.first.token, overlap.last.token, overlap.partitioner));
+                ranges.add(new Range<Token>(overlap.first.getToken(), overlap.last.getToken(), overlap.partitioner));
             long remainingKeys = keys - sstable.estimatedKeysForRanges(ranges);
             // next, calculate what percentage of columns we have within those keys
             long columns = sstable.getEstimatedColumnCount().mean() * remainingKeys;
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
index a4c101f..59338f4 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
@@ -28,7 +28,7 @@
 public abstract class AbstractCompactionTask extends DiskAwareRunnable
 {
     protected final ColumnFamilyStore cfs;
-    protected Iterable<SSTableReader> sstables;
+    protected Set<SSTableReader> sstables;
     protected boolean isUserDefined;
     protected OperationType compactionType;
 
@@ -36,7 +36,7 @@
      * @param cfs
      * @param sstables must be marked compacting
      */
-    public AbstractCompactionTask(ColumnFamilyStore cfs, Iterable<SSTableReader> sstables)
+    public AbstractCompactionTask(ColumnFamilyStore cfs, Set<SSTableReader> sstables)
     {
         this.cfs = cfs;
         this.sstables = sstables;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index fba659d..ef27805 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
@@ -27,32 +27,26 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DataTracker;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.utils.AlwaysPresentFilter;
 
 /**
  * Manage compaction options.
  */
-public class CompactionController
+public class CompactionController implements AutoCloseable
 {
     private static final Logger logger = LoggerFactory.getLogger(CompactionController.class);
 
     public final ColumnFamilyStore cfs;
-    private final DataTracker.SSTableIntervalTree overlappingTree;
-    private final Set<SSTableReader> overlappingSSTables;
+    private DataTracker.SSTableIntervalTree overlappingTree;
+    private Set<SSTableReader> overlappingSSTables;
     private final Set<SSTableReader> compacting;
 
     public final int gcBefore;
 
-    /**
-     * Constructor that subclasses may use when overriding shouldPurge to not need overlappingTree
-     */
     protected CompactionController(ColumnFamilyStore cfs, int maxValue)
     {
         this(cfs, null, maxValue);
@@ -64,6 +58,26 @@
         this.cfs = cfs;
         this.gcBefore = gcBefore;
         this.compacting = compacting;
+        refreshOverlaps();
+    }
+
+    void maybeRefreshOverlaps()
+    {
+        for (SSTableReader reader : overlappingSSTables)
+        {
+            if (reader.isMarkedCompacted())
+            {
+                refreshOverlaps();
+                return;
+            }
+        }
+    }
+
+    private void refreshOverlaps()
+    {
+        if (this.overlappingSSTables != null)
+            SSTableReader.releaseReferences(overlappingSSTables);
+
         Set<SSTableReader> overlapping = compacting == null ? null : cfs.getAndReferenceOverlappingSSTables(compacting);
         this.overlappingSSTables = overlapping == null ? Collections.<SSTableReader>emptySet() : overlapping;
         this.overlappingTree = overlapping == null ? null : DataTracker.buildIntervalTree(overlapping);
@@ -116,7 +130,7 @@
         // we still need to keep candidates that might shadow something in a
         // non-candidate sstable. And if we remove a sstable from the candidates, we
         // must take it's timestamp into account (hence the sorting below).
-        Collections.sort(candidates, SSTable.maxTimestampComparator);
+        Collections.sort(candidates, SSTableReader.maxTimestampComparator);
 
         Iterator<SSTableReader> iterator = candidates.iterator();
         while (iterator.hasNext())
@@ -129,11 +143,11 @@
             }
             else
             {
-                logger.debug("Dropping expired SSTable {} (maxLocalDeletionTime={}, gcBefore={})",
-                             candidate, candidate.getSSTableMetadata().maxLocalDeletionTime, gcBefore);
+               logger.debug("Dropping expired SSTable {} (maxLocalDeletionTime={}, gcBefore={})",
+                        candidate, candidate.getSSTableMetadata().maxLocalDeletionTime, gcBefore);
             }
         }
-        return new HashSet<SSTableReader>(candidates);
+        return new HashSet<>(candidates);
     }
 
     public String getKeyspace()
@@ -147,25 +161,25 @@
     }
 
     /**
-     * @return true if it's okay to drop tombstones for the given row, i.e., if we know all the verisons of the row
-     * older than @param maxDeletionTimestamp are included in the compaction set
+     * @return the largest timestamp before which it's okay to drop tombstones for the given partition;
+     * i.e., after the maxPurgeableTimestamp there may exist newer data that still needs to be suppressed
+     * in other sstables.  This returns the minimum timestamp for any SSTable that contains this partition and is not
+     * participating in this compaction, or Long.MAX_VALUE if no such SSTable exists.
      */
-    public boolean shouldPurge(DecoratedKey key, long maxDeletionTimestamp)
+    public long maxPurgeableTimestamp(DecoratedKey key)
     {
         List<SSTableReader> filteredSSTables = overlappingTree.search(key);
+        long min = Long.MAX_VALUE;
         for (SSTableReader sstable : filteredSSTables)
         {
-            if (sstable.getMinTimestamp() <= maxDeletionTimestamp)
-            {
-                // if we don't have bloom filter(bf_fp_chance=1.0 or filter file is missing),
-                // we check index file instead.
-                if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.getPosition(key, SSTableReader.Operator.EQ, false) != null)
-                    return false;
-                else if (sstable.getBloomFilter().isPresent(key.key))
-                    return false;
-            }
+            // if we don't have bloom filter(bf_fp_chance=1.0 or filter file is missing),
+            // we check index file instead.
+            if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.getPosition(key, SSTableReader.Operator.EQ, false) != null)
+                min = Math.min(min, sstable.getMinTimestamp());
+            else if (sstable.getBloomFilter().isPresent(key.getKey()))
+                min = Math.min(min, sstable.getMinTimestamp());
         }
-        return true;
+        return min;
     }
 
     public void invalidateCachedRow(DecoratedKey key)
@@ -173,35 +187,6 @@
         cfs.invalidateCachedRow(key);
     }
 
-    /**
-     * @return an AbstractCompactedRow implementation to write the merged rows in question.
-     *
-     * If there is a single source row, the data is from a current-version sstable, we don't
-     * need to purge and we aren't forcing deserialization for scrub, write it unchanged.
-     * Otherwise, we deserialize, purge tombstones, and reserialize in the latest version.
-     */
-    public AbstractCompactedRow getCompactedRow(List<SSTableIdentityIterator> rows)
-    {
-        long rowSize = 0;
-        for (SSTableIdentityIterator row : rows)
-            rowSize += row.dataSize;
-
-        if (rowSize > DatabaseDescriptor.getInMemoryCompactionLimit())
-        {
-            String keyString = cfs.metadata.getKeyValidator().getString(rows.get(0).getKey().key);
-            logger.info(String.format("Compacting large row %s/%s:%s (%d bytes) incrementally",
-                                      cfs.keyspace.getName(), cfs.name, keyString, rowSize));
-            return new LazilyCompactedRow(this, rows);
-        }
-        return new PrecompactedRow(this, rows);
-    }
-
-    /** convenience method for single-sstable compactions */
-    public AbstractCompactedRow getCompactedRow(SSTableIdentityIterator row)
-    {
-        return getCompactedRow(Collections.singletonList(row));
-    }
-
     public void close()
     {
         SSTableReader.releaseReferences(overlappingSSTables);
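
maxPurgeableTimestamp() above replaces the old boolean shouldPurge(): instead of a yes/no answer it returns the smallest minimum timestamp among sstables outside the compaction that might still contain the partition, and only tombstones older than that value are safe to drop. A minimal numeric sketch of that rule (assumed names, with the per-sstable bloom-filter/index check reduced to a precomputed list of timestamps):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Illustrative helper only; the real check also consults bloom filters / index files per sstable.
final class PurgeRule
{
    // minTimestamps: min timestamp of each non-compacting sstable that may still hold the partition.
    static long maxPurgeableTimestamp(List<Long> minTimestamps)
    {
        long min = Long.MAX_VALUE;                 // nothing overlaps -> everything is purgeable
        for (long ts : minTimestamps)
            min = Math.min(min, ts);
        return min;
    }

    public static void main(String[] args)
    {
        // Two overlapping sstables with min timestamps 100 and 250: only tombstones with
        // timestamp < 100 can be dropped safely during this compaction.
        System.out.println(maxPurgeableTimestamp(Arrays.asList(100L, 250L)));        // 100
        System.out.println(maxPurgeableTimestamp(Collections.<Long>emptyList()));    // Long.MAX_VALUE
    }
}
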
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java
index 866907b..0c9b52a 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java
@@ -21,14 +21,14 @@
 import java.util.Comparator;
 import java.util.List;
 
+import com.google.common.collect.ImmutableList;
+
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.utils.CloseableIterator;
 import org.apache.cassandra.utils.MergeIterator;
 
 public class CompactionIterable extends AbstractCompactionIterable
 {
-
     private static final Comparator<OnDiskAtomIterator> comparator = new Comparator<OnDiskAtomIterator>()
     {
         public int compare(OnDiskAtomIterator i1, OnDiskAtomIterator i2)
@@ -54,11 +54,11 @@
 
     protected class Reducer extends MergeIterator.Reducer<OnDiskAtomIterator, AbstractCompactedRow>
     {
-        protected final List<SSTableIdentityIterator> rows = new ArrayList<SSTableIdentityIterator>();
+        protected final List<OnDiskAtomIterator> rows = new ArrayList<>();
 
         public void reduce(OnDiskAtomIterator current)
         {
-            rows.add((SSTableIdentityIterator) current);
+            rows.add(current);
         }
 
         protected AbstractCompactedRow getReduced()
@@ -71,7 +71,7 @@
                 // create a new container for rows, since we're going to clear ours for the next one,
                 // and the AbstractCompactionRow code should be able to assume that the collection it receives
                 // won't be pulled out from under it.
-                return controller.getCompactedRow(new ArrayList<SSTableIdentityIterator>(rows));
+                return new LazilyCompactedRow(controller, ImmutableList.copyOf(rows));
             }
             finally
             {
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index 5a13e34..51f45b8 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -20,15 +20,37 @@
 import java.io.File;
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.TimeUnit;
 import javax.management.MBeanServer;
 import javax.management.ObjectName;
 import javax.management.openmbean.OpenDataException;
 import javax.management.openmbean.TabularData;
 
+import com.google.common.base.Predicate;
 import com.google.common.base.Throwables;
-import com.google.common.collect.*;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ConcurrentHashMultiset;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.RateLimiter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -40,17 +62,29 @@
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionInfo.Holder;
 import org.apache.cassandra.db.index.SecondaryIndexBuilder;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.SSTableRewriter;
+import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.metrics.CompactionMetrics;
 import org.apache.cassandra.repair.Validator;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.*;
 
 /**
@@ -205,76 +239,209 @@
         }
     }
 
-    private static interface AllSSTablesOperation
+    private AllSSTableOpStatus parallelAllSSTableOperation(final ColumnFamilyStore cfs, final OneSSTableOperation operation) throws ExecutionException, InterruptedException
     {
-        public void perform(ColumnFamilyStore store, Iterable<SSTableReader> sstables) throws IOException;
+        Iterable<SSTableReader> compactingSSTables = cfs.markAllCompacting();
+        if (compactingSSTables == null)
+        {
+            logger.info("Aborting operation on {}.{} after failing to interrupt other compaction operations", cfs.keyspace.getName(), cfs.name);
+            return AllSSTableOpStatus.ABORTED;
+        }
+        if (Iterables.isEmpty(compactingSSTables))
+        {
+            logger.info("No sstables for {}.{}", cfs.keyspace.getName(), cfs.name);
+            return AllSSTableOpStatus.SUCCESSFUL;
+        }
+        try
+        {
+            Iterable<SSTableReader> sstables = operation.filterSSTables(compactingSSTables);
+            List<Future<Object>> futures = new ArrayList<>();
+
+            for (final SSTableReader sstable : sstables)
+            {
+                futures.add(executor.submit(new Callable<Object>()
+                {
+                    @Override
+                    public Object call() throws Exception
+                    {
+                        operation.execute(sstable);
+                        return this;
+                    }
+                }));
+            }
+
+            for (Future<Object> f : futures)
+                f.get();
+        }
+        finally
+        {
+            cfs.getDataTracker().unmarkCompacting(compactingSSTables);
+        }
+        return AllSSTableOpStatus.SUCCESSFUL;
     }
 
-    private void performAllSSTableOperation(final ColumnFamilyStore cfs, final AllSSTablesOperation operation) throws InterruptedException, ExecutionException
+    private static interface OneSSTableOperation
     {
-        final Iterable<SSTableReader> sstables = cfs.markAllCompacting();
-        if (sstables == null)
-            return;
+        Iterable<SSTableReader> filterSSTables(Iterable<SSTableReader> input);
+        void execute(SSTableReader input) throws IOException;
+    }
 
-        Callable<Object> runnable = new Callable<Object>()
+    public enum AllSSTableOpStatus { ABORTED(1), SUCCESSFUL(0);
+        public final int statusCode;
+
+        AllSSTableOpStatus(int statusCode)
         {
-            public Object call() throws IOException
+            this.statusCode = statusCode;
+        }
+    }
+
+    public AllSSTableOpStatus performScrub(final ColumnFamilyStore cfs, final boolean skipCorrupted) throws InterruptedException, ExecutionException
+    {
+        assert !cfs.isIndex();
+        return parallelAllSSTableOperation(cfs, new OneSSTableOperation()
+        {
+            @Override
+            public Iterable<SSTableReader> filterSSTables(Iterable<SSTableReader> input)
             {
-                operation.perform(cfs, sstables);
-                cfs.getDataTracker().unmarkCompacting(sstables);
-                return this;
+                return input;
+            }
+
+            @Override
+            public void execute(SSTableReader input) throws IOException
+            {
+                scrubOne(cfs, input, skipCorrupted);
+            }
+        });
+    }
+
+    public AllSSTableOpStatus performSSTableRewrite(final ColumnFamilyStore cfs, final boolean excludeCurrentVersion) throws InterruptedException, ExecutionException
+    {
+        return parallelAllSSTableOperation(cfs, new OneSSTableOperation()
+        {
+            @Override
+            public Iterable<SSTableReader> filterSSTables(Iterable<SSTableReader> input)
+            {
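+                // when excludeCurrentVersion is set, skip sstables that are already on the current sstable format version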
+                return Iterables.filter(input, new Predicate<SSTableReader>()
+                {
+                    @Override
+                    public boolean apply(SSTableReader sstable)
+                    {
+                        return !(excludeCurrentVersion && sstable.descriptor.version.equals(Descriptor.Version.CURRENT));
+                    }
+                });
+            }
+
+            @Override
+            public void execute(SSTableReader input) throws IOException
+            {
+                AbstractCompactionTask task = cfs.getCompactionStrategy().getCompactionTask(Collections.singleton(input), NO_GC, Long.MAX_VALUE);
+                task.setUserDefined(true);
+                task.setCompactionType(OperationType.UPGRADE_SSTABLES);
+                task.execute(metrics);
+            }
+        });
+    }
+
+    public AllSSTableOpStatus performCleanup(final ColumnFamilyStore cfStore) throws InterruptedException, ExecutionException
+    {
+        assert !cfStore.isIndex();
+        Keyspace keyspace = cfStore.keyspace;
+        final Collection<Range<Token>> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
+        if (ranges.isEmpty())
+        {
+            logger.info("Cleanup cannot run before a node has joined the ring");
+            return AllSSTableOpStatus.ABORTED;
+        }
+        final boolean hasIndexes = cfStore.indexManager.hasIndexes();
+        final CleanupStrategy cleanupStrategy = CleanupStrategy.get(cfStore, ranges);
+        return parallelAllSSTableOperation(cfStore, new OneSSTableOperation()
+        {
+            @Override
+            public Iterable<SSTableReader> filterSSTables(Iterable<SSTableReader> input)
+            {
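+                // sort by size so cleanup of smaller sstables can free up space for the larger ones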
+                List<SSTableReader> sortedSSTables = Lists.newArrayList(input);
+                Collections.sort(sortedSSTables, new SSTableReader.SizeComparator());
+                return sortedSSTables;
+            }
+
+            @Override
+            public void execute(SSTableReader input) throws IOException
+            {
+                doCleanupOne(cfStore, input, cleanupStrategy, ranges, hasIndexes);
+            }
+        });
+    }
+
+    public Future<?> submitAntiCompaction(final ColumnFamilyStore cfs,
+                                          final Collection<Range<Token>> ranges,
+                                          final Collection<SSTableReader> validatedForRepair,
+                                          final long repairedAt)
+    {
+        Runnable runnable = new WrappedRunnable() {
+
+            @Override
+            public void runMayThrow() throws Exception
+            {
+                performAnticompaction(cfs, ranges, validatedForRepair, repairedAt);
             }
         };
-        executor.submit(runnable).get();
+        return executor.submit(runnable);
     }
 
-    public void performScrub(ColumnFamilyStore cfStore, final boolean skipCorrupted) throws InterruptedException, ExecutionException
+    /**
+     * Make sure the {@code validatedForRepair} sstables are marked as compacting before calling this.
+     *
+     * @param cfs the column family store the sstables belong to
+     * @param ranges ranges that the repair was carried out on
+     * @param validatedForRepair sstables containing the repaired ranges; the caller should hold references to them before passing them in
+     * @throws InterruptedException, ExecutionException, IOException
+     */
+    public void performAnticompaction(ColumnFamilyStore cfs,
+                                      Collection<Range<Token>> ranges,
+                                      Collection<SSTableReader> validatedForRepair,
+                                      long repairedAt) throws InterruptedException, ExecutionException, IOException
     {
-        performAllSSTableOperation(cfStore, new AllSSTablesOperation()
+        logger.info("Starting anticompaction");
+        logger.debug("Starting anticompaction for ranges {}", ranges);
+        Set<SSTableReader> sstables = new HashSet<>(validatedForRepair);
+        Set<SSTableReader> mutatedRepairStatuses = new HashSet<>();
+        Set<SSTableReader> nonAnticompacting = new HashSet<>();
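+        // Classify each sstable: fully contained in a repaired range -> only mutate repairedAt;
+        // not intersecting any repaired range -> leave untouched; otherwise -> anticompact it below.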
+        Iterator<SSTableReader> sstableIterator = sstables.iterator();
+        while (sstableIterator.hasNext())
         {
-            public void perform(ColumnFamilyStore store, Iterable<SSTableReader> sstables) throws IOException
+            SSTableReader sstable = sstableIterator.next();
+            for (Range<Token> r : Range.normalize(ranges))
             {
-                doScrub(store, sstables, skipCorrupted);
-            }
-        });
-    }
-
-    public void performSSTableRewrite(ColumnFamilyStore cfStore, final boolean excludeCurrentVersion) throws InterruptedException, ExecutionException
-    {
-        performAllSSTableOperation(cfStore, new AllSSTablesOperation()
-        {
-            public void perform(ColumnFamilyStore cfs, Iterable<SSTableReader> sstables)
-            {
-                for (final SSTableReader sstable : sstables)
+                Range<Token> sstableRange = new Range<>(sstable.first.getToken(), sstable.last.getToken(), sstable.partitioner);
+                if (r.contains(sstableRange))
                 {
-                    if (excludeCurrentVersion && sstable.descriptor.version.equals(Descriptor.Version.CURRENT))
-                        continue;
-
-                    // SSTables are marked by the caller
-                    // NOTE: it is important that the task create one and only one sstable, even for Leveled compaction (see LeveledManifest.replace())
-                    AbstractCompactionTask task = cfs.getCompactionStrategy().getCompactionTask(Collections.singleton(sstable), NO_GC, Long.MAX_VALUE);
-                    task.setUserDefined(true);
-                    task.setCompactionType(OperationType.UPGRADE_SSTABLES);
-                    task.execute(metrics);
+                    logger.info("SSTable {} fully contained in range {}, mutating repairedAt instead of anticompacting", sstable, r);
+                    sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, repairedAt);
+                    sstable.reloadSSTableMetadata();
+                    mutatedRepairStatuses.add(sstable);
+                    sstableIterator.remove();
+                    break;
+                }
+                else if (!sstableRange.intersects(r))
+                {
+                    logger.info("SSTable {} ({}) does not intersect repaired range {}, not touching repairedAt.", sstable, sstableRange, r);
+                    nonAnticompacting.add(sstable);
+                    sstableIterator.remove();
+                    break;
+                }
+                else
+                {
+                    logger.info("SSTable {} ({}) will be anticompacted on range {}", sstable, sstableRange, r);
                 }
             }
-        });
-    }
-
-    public void performCleanup(ColumnFamilyStore cfStore, final CounterId.OneShotRenewer renewer) throws InterruptedException, ExecutionException
-    {
-        performAllSSTableOperation(cfStore, new AllSSTablesOperation()
-        {
-            public void perform(ColumnFamilyStore store, Iterable<SSTableReader> sstables) throws IOException
-            {
-                // Sort the column families in order of SSTable size, so cleanup of smaller CFs
-                // can free up space for larger ones
-                List<SSTableReader> sortedSSTables = Lists.newArrayList(sstables);
-                Collections.sort(sortedSSTables, new SSTableReader.SizeComparator());
-
-                doCleanupCompaction(store, sortedSSTables, renewer);
-            }
-        });
+        }
+        cfs.getDataTracker().notifySSTableRepairedStatusChanged(mutatedRepairStatuses);
+        cfs.getDataTracker().unmarkCompacting(Sets.union(nonAnticompacting, mutatedRepairStatuses));
+        if (!sstables.isEmpty())
+            doAntiCompaction(cfs, ranges, sstables, repairedAt);
+        SSTableReader.releaseReferences(sstables);
+        cfs.getDataTracker().unmarkCompacting(sstables);
+        logger.info("Completed anticompaction successfully");
     }
 
     public void performMaximal(final ColumnFamilyStore cfStore) throws InterruptedException, ExecutionException
@@ -287,14 +454,15 @@
         // here we compute the task off the compaction executor, so having that present doesn't
         // confuse runWithCompactionsDisabled -- i.e., we don't want to deadlock ourselves, waiting
         // for ourselves to finish/acknowledge cancellation before continuing.
-        final AbstractCompactionTask task = cfStore.getCompactionStrategy().getMaximalTask(gcBefore);
+        final Collection<AbstractCompactionTask> tasks = cfStore.getCompactionStrategy().getMaximalTask(gcBefore);
         Runnable runnable = new WrappedRunnable()
         {
             protected void runMayThrow() throws IOException
             {
-                if (task == null)
+                if (tasks == null)
                     return;
-                task.execute(metrics);
+                for (AbstractCompactionTask task : tasks)
+                    task.execute(metrics);
             }
         };
         return executor.submit(runnable);
@@ -303,7 +471,7 @@
     public void forceUserDefinedCompaction(String dataFiles)
     {
         String[] filenames = dataFiles.split(",");
-        Multimap<Pair<String, String>, Descriptor> descriptors = ArrayListMultimap.create();
+        Multimap<ColumnFamilyStore, Descriptor> descriptors = ArrayListMultimap.create();
 
         for (String filename : filenames)
         {
@@ -314,19 +482,14 @@
                 logger.warn("Schema does not exist for file {}. Skipping.", filename);
                 continue;
             }
-            File directory = new File(desc.ksname + File.separator + desc.cfname);
             // group by keyspace/columnfamily
-            Pair<Descriptor, String> p = Descriptor.fromFilename(directory, filename.trim());
-            Pair<String, String> key = Pair.create(p.left.ksname, p.left.cfname);
-            descriptors.put(key, p.left);
+            ColumnFamilyStore cfs = Keyspace.open(desc.ksname).getColumnFamilyStore(desc.cfname);
+            descriptors.put(cfs, cfs.directories.find(filename.trim()));
         }
 
         List<Future<?>> futures = new ArrayList<>();
-        for (Pair<String, String> key : descriptors.keySet())
-        {
-            ColumnFamilyStore cfs = Keyspace.open(key.left).getColumnFamilyStore(key.right);
-            futures.add(submitUserDefined(cfs, descriptors.get(key), getDefaultGcBefore(cfs)));
-        }
+        for (ColumnFamilyStore cfs : descriptors.keySet())
+            futures.add(submitUserDefined(cfs, descriptors.get(cfs), getDefaultGcBefore(cfs)));
         FBUtilities.waitOnFutures(futures);
     }
 
@@ -369,16 +532,12 @@
     }
 
     // This acquires a reference on the sstable
-    // This is not efficent, do not use in any critical path
+    // This is not efficient, do not use in any critical path
     private SSTableReader lookupSSTable(final ColumnFamilyStore cfs, Descriptor descriptor)
     {
         for (SSTableReader sstable : cfs.getSSTables())
         {
-            // .equals() with no other changes won't work because in sstable.descriptor, the directory is an absolute path.
-            // We could construct descriptor with an absolute path too but I haven't found any satisfying way to do that
-            // (DB.getDataFileLocationForTable() may not return the right path if you have multiple volumes). Hence the
-            // endsWith.
-            if (sstable.descriptor.toString().endsWith(descriptor.toString()))
+            if (sstable.descriptor.equals(descriptor))
                 return sstable;
         }
         return null;
@@ -419,23 +578,9 @@
         }
     }
 
-    /**
-     * Deserialize everything in the CFS and re-serialize w/ the newest version.  Also attempts to recover
-     * from bogus row keys / sizes using data from the index, and skips rows with garbage columns that resulted
-     * from early ByteBuffer bugs.
-     *
-     * @throws IOException
-     */
-    private void doScrub(ColumnFamilyStore cfs, Iterable<SSTableReader> sstables, boolean skipCorrupted) throws IOException
-    {
-        assert !cfs.isIndex();
-        for (final SSTableReader sstable : sstables)
-            scrubOne(cfs, sstable, skipCorrupted);
-    }
-
     private void scrubOne(ColumnFamilyStore cfs, SSTableReader sstable, boolean skipCorrupted) throws IOException
     {
-        Scrubber scrubber = new Scrubber(cfs, sstable, skipCorrupted);
+        Scrubber scrubber = new Scrubber(cfs, sstable, skipCorrupted, false);
 
         CompactionInfo.Holder scrubInfo = scrubber.getScrubInfo();
         metrics.beginCompaction(scrubInfo);
@@ -448,14 +593,6 @@
             scrubber.close();
             metrics.finishCompaction(scrubInfo);
         }
-
-        if (scrubber.getNewInOrderSSTable() != null)
-            cfs.addSSTable(scrubber.getNewInOrderSSTable());
-
-        if (scrubber.getNewSSTable() == null)
-            cfs.markObsolete(Collections.singletonList(sstable), OperationType.SCRUB);
-        else
-            cfs.replaceCompactedSSTables(Collections.singletonList(sstable), Collections.singletonList(scrubber.getNewSSTable()), OperationType.SCRUB);
     }
 
     /**
@@ -472,7 +609,7 @@
         // see if there are any keys LTE the token for the start of the first range
         // (token range ownership is exclusive on the LHS.)
         Range<Token> firstRange = sortedRanges.get(0);
-        if (sstable.first.token.compareTo(firstRange.left) <= 0)
+        if (sstable.first.getToken().compareTo(firstRange.left) <= 0)
             return true;
 
         // then, iterate over all owned ranges and see if the next key beyond the end of the owned
@@ -501,7 +638,7 @@
             }
 
             Range<Token> nextRange = sortedRanges.get(i + 1);
-            if (!nextRange.contains(firstBeyondRange.token))
+            if (!nextRange.contains(firstBeyondRange.getToken()))
             {
                 // we found a key in between the owned ranges
                 return true;
@@ -512,123 +649,103 @@
     }
 
     /**
-     * This function goes over each file and removes the keys that the node is not responsible for
+     * This function goes over a file and removes the keys that the node is not responsible for
      * and only keeps keys that this node is responsible for.
      *
      * @throws IOException
      */
-    private void doCleanupCompaction(final ColumnFamilyStore cfs, Collection<SSTableReader> sstables, CounterId.OneShotRenewer renewer) throws IOException
+    private void doCleanupOne(final ColumnFamilyStore cfs, SSTableReader sstable, CleanupStrategy cleanupStrategy, Collection<Range<Token>> ranges, boolean hasIndexes) throws IOException
     {
         assert !cfs.isIndex();
-        Keyspace keyspace = cfs.keyspace;
-        Collection<Range<Token>> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
-        if (ranges.isEmpty())
+
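+        // If the sstable lies entirely outside the locally owned ranges (and there are no indexes to clean), just drop it.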
+        if (!hasIndexes && !new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(ranges))
         {
-            logger.info("Cleanup cannot run before a node has joined the ring");
+            cfs.getDataTracker().markCompactedSSTablesReplaced(Arrays.asList(sstable), Collections.<SSTableReader>emptyList(), OperationType.CLEANUP);
+            return;
+        }
+        if (!needsCleanup(sstable, ranges))
+        {
+            logger.debug("Skipping {} for cleanup; all rows should be kept", sstable);
             return;
         }
 
-        boolean hasIndexes = cfs.indexManager.hasIndexes();
-        CleanupStrategy cleanupStrategy = CleanupStrategy.get(cfs, ranges, renewer);
+        long start = System.nanoTime();
 
-        for (SSTableReader sstable : sstables)
+        long totalkeysWritten = 0;
+
+        int expectedBloomFilterSize = Math.max(cfs.metadata.getMinIndexInterval(),
+                                               (int) (SSTableReader.getApproximateKeyCount(Arrays.asList(sstable))));
+        if (logger.isDebugEnabled())
+            logger.debug("Expected bloom filter size : {}", expectedBloomFilterSize);
+
+        logger.info("Cleaning up {}", sstable);
+
+        File compactionFileLocation = cfs.directories.getDirectoryForNewSSTables();
+        if (compactionFileLocation == null)
+            throw new IOException("disk full");
+
+        ICompactionScanner scanner = cleanupStrategy.getScanner(sstable, getRateLimiter());
+        CleanupInfo ci = new CleanupInfo(sstable, scanner);
+
+        metrics.beginCompaction(ci);
+        SSTableRewriter writer = new SSTableRewriter(cfs, new HashSet<>(ImmutableSet.of(sstable)), sstable.maxDataAge, OperationType.CLEANUP, false);
+
+        try (CompactionController controller = new CompactionController(cfs, Collections.singleton(sstable), getDefaultGcBefore(cfs)))
         {
-            if (!hasIndexes && !new Bounds<Token>(sstable.first.token, sstable.last.token).intersects(ranges))
+            writer.switchWriter(createWriter(cfs, compactionFileLocation, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable));
+
+            while (scanner.hasNext())
             {
-                cfs.replaceCompactedSSTables(Arrays.asList(sstable), Collections.<SSTableReader>emptyList(), OperationType.CLEANUP);
-                continue;
-            }
-            if (!needsCleanup(sstable, ranges))
-            {
-                logger.debug("Skipping {} for cleanup; all rows should be kept", sstable);
-                continue;
-            }
+                if (ci.isStopRequested())
+                    throw new CompactionInterruptedException(ci.getCompactionInfo());
 
-            CompactionController controller = new CompactionController(cfs, Collections.singleton(sstable), getDefaultGcBefore(cfs));
-            long start = System.nanoTime();
-
-            long totalkeysWritten = 0;
-
-            int expectedBloomFilterSize = Math.max(cfs.metadata.getIndexInterval(),
-                                                   (int) (SSTableReader.getApproximateKeyCount(Arrays.asList(sstable), cfs.metadata)));
-            if (logger.isDebugEnabled())
-                logger.debug("Expected bloom filter size : " + expectedBloomFilterSize);
-
-            logger.info("Cleaning up " + sstable);
-
-            File compactionFileLocation = cfs.directories.getDirectoryForNewSSTables();
-            if (compactionFileLocation == null)
-                throw new IOException("disk full");
-
-            ICompactionScanner scanner = cleanupStrategy.getScanner(sstable, getRateLimiter());
-            CleanupInfo ci = new CleanupInfo(sstable, scanner);
-
-            metrics.beginCompaction(ci);
-            SSTableWriter writer = createWriter(cfs,
-                                                compactionFileLocation,
-                                                expectedBloomFilterSize,
-                                                sstable);
-            SSTableReader newSstable = null;
-            try
-            {
-                while (scanner.hasNext())
-                {
-                    if (ci.isStopRequested())
-                        throw new CompactionInterruptedException(ci.getCompactionInfo());
-                    SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
-
-                    row = cleanupStrategy.cleanup(row);
-                    if (row == null)
-                        continue;
-                    AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
-                    if (writer.append(compactedRow) != null)
-                        totalkeysWritten++;
-                }
-                if (totalkeysWritten > 0)
-                    newSstable = writer.closeAndOpenReader(sstable.maxDataAge);
-                else
-                    writer.abort();
-            }
-            catch (Throwable e)
-            {
-                writer.abort();
-                throw Throwables.propagate(e);
-            }
-            finally
-            {
-                controller.close();
-                scanner.close();
-                metrics.finishCompaction(ci);
-            }
-
-            List<SSTableReader> results = new ArrayList<SSTableReader>(1);
-            if (newSstable != null)
-            {
-                results.add(newSstable);
-
-                String format = "Cleaned up to %s.  %,d to %,d (~%d%% of original) bytes for %,d keys.  Time: %,dms.";
-                long dTime = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
-                long startsize = sstable.onDiskLength();
-                long endsize = newSstable.onDiskLength();
-                double ratio = (double) endsize / (double) startsize;
-                logger.info(String.format(format, writer.getFilename(), startsize, endsize, (int) (ratio * 100), totalkeysWritten, dTime));
+                SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
+                row = cleanupStrategy.cleanup(row);
+                if (row == null)
+                    continue;
+                AbstractCompactedRow compactedRow = new LazilyCompactedRow(controller, Collections.singletonList(row));
+                if (writer.append(compactedRow) != null)
+                    totalkeysWritten++;
             }
 
             // flush to ensure we don't lose the tombstones on a restart, since they are not commitlog'd
             cfs.indexManager.flushIndexesBlocking();
 
-            cfs.replaceCompactedSSTables(Arrays.asList(sstable), results, OperationType.CLEANUP);
+            writer.finish();
         }
+        catch (Throwable e)
+        {
+            writer.abort();
+            throw Throwables.propagate(e);
+        }
+        finally
+        {
+            scanner.close();
+            metrics.finishCompaction(ci);
+        }
+
+        List<SSTableReader> results = writer.finished();
+        if (!results.isEmpty())
+        {
+            String format = "Cleaned up to %s.  %,d to %,d (~%d%% of original) bytes for %,d keys.  Time: %,dms.";
+            long dTime = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
+            long startsize = sstable.onDiskLength();
+            long endsize = 0;
+            for (SSTableReader newSstable : results)
+                endsize += newSstable.onDiskLength();
+            double ratio = (double) endsize / (double) startsize;
+            logger.info(String.format(format, results.get(0).getFilename(), startsize, endsize, (int) (ratio * 100), totalkeysWritten, dTime));
+        }
+
     }
 
     private static abstract class CleanupStrategy
     {
-        public static CleanupStrategy get(ColumnFamilyStore cfs, Collection<Range<Token>> ranges, CounterId.OneShotRenewer renewer)
+        public static CleanupStrategy get(ColumnFamilyStore cfs, Collection<Range<Token>> ranges)
         {
-            if (cfs.indexManager.hasIndexes() || cfs.metadata.getDefaultValidator().isCommutative())
-                return new Full(cfs, ranges, renewer);
-
-            return new Bounded(cfs, ranges);
+            return cfs.indexManager.hasIndexes()
+                 ? new Full(cfs, ranges)
+                 : new Bounded(cfs, ranges);
         }
 
         public abstract ICompactionScanner getScanner(SSTableReader sstable, RateLimiter limiter);
@@ -668,15 +785,13 @@
         {
             private final Collection<Range<Token>> ranges;
             private final ColumnFamilyStore cfs;
-            private List<Column> indexedColumnsInRow;
-            private final CounterId.OneShotRenewer renewer;
+            private List<Cell> indexedColumnsInRow;
 
-            public Full(ColumnFamilyStore cfs, Collection<Range<Token>> ranges, CounterId.OneShotRenewer renewer)
+            public Full(ColumnFamilyStore cfs, Collection<Range<Token>> ranges)
             {
                 this.cfs = cfs;
                 this.ranges = ranges;
                 this.indexedColumnsInRow = null;
-                this.renewer = renewer;
             }
 
             @Override
@@ -688,7 +803,7 @@
             @Override
             public SSTableIdentityIterator cleanup(SSTableIdentityIterator row)
             {
-                if (Range.isInRanges(row.getKey().token, ranges))
+                if (Range.isInRanges(row.getKey().getToken(), ranges))
                     return row;
 
                 cfs.invalidateCachedRow(row.getKey());
@@ -699,29 +814,22 @@
                 while (row.hasNext())
                 {
                     OnDiskAtom column = row.next();
-                    if (column instanceof CounterColumn)
-                        renewer.maybeRenew((CounterColumn) column);
 
-                    if (column instanceof Column && cfs.indexManager.indexes((Column) column))
+                    if (column instanceof Cell && cfs.indexManager.indexes((Cell) column))
                     {
                         if (indexedColumnsInRow == null)
                             indexedColumnsInRow = new ArrayList<>();
 
-                        indexedColumnsInRow.add((Column) column);
+                        indexedColumnsInRow.add((Cell) column);
                     }
                 }
 
                 if (indexedColumnsInRow != null && !indexedColumnsInRow.isEmpty())
                 {
                     // acquire memtable lock here because secondary index deletion may cause a race. See CASSANDRA-3712
-                    Keyspace.switchLock.readLock().lock();
-                    try
+                    try (OpOrder.Group opGroup = cfs.keyspace.writeOrder.start())
                     {
-                        cfs.indexManager.deleteFromIndexes(row.getKey(), indexedColumnsInRow);
-                    }
-                    finally
-                    {
-                        Keyspace.switchLock.readLock().unlock();
+                        cfs.indexManager.deleteFromIndexes(row.getKey(), indexedColumnsInRow, opGroup);
                     }
                 }
                 return null;
@@ -732,14 +840,16 @@
     public static SSTableWriter createWriter(ColumnFamilyStore cfs,
                                              File compactionFileLocation,
                                              int expectedBloomFilterSize,
+                                             long repairedAt,
                                              SSTableReader sstable)
     {
         FileUtils.createDirectory(compactionFileLocation);
         return new SSTableWriter(cfs.getTempSSTablePath(compactionFileLocation),
                                  expectedBloomFilterSize,
+                                 repairedAt,
                                  cfs.metadata,
                                  cfs.partitioner,
-                                 SSTableMetadata.createCollector(Collections.singleton(sstable), cfs.metadata.comparator, sstable.getSSTableLevel()));
+                                 new MetadataCollector(Collections.singleton(sstable), cfs.metadata.comparator, sstable.getSSTableLevel()));
     }
 
     /**
@@ -756,67 +866,184 @@
         if (!cfs.isValid())
             return;
 
-        Collection<SSTableReader> sstables;
-        String snapshotName = validator.desc.sessionId.toString();
-        int gcBefore;
-        boolean isSnapshotValidation = cfs.snapshotExists(snapshotName);
-        if (isSnapshotValidation)
-        {
-            // If there is a snapshot created for the session then read from there.
-            sstables = cfs.getSnapshotSSTableReader(snapshotName);
-
-            // Computing gcbefore based on the current time wouldn't be very good because we know each replica will execute
-            // this at a different time (that's the whole purpose of repair with snaphsot). So instead we take the creation
-            // time of the snapshot, which should give us roughtly the same time on each replica (roughtly being in that case
-            // 'as good as in the non-snapshot' case)
-            gcBefore = cfs.gcBefore(cfs.getSnapshotCreationTime(snapshotName));
-        }
-        else
-        {
-            // flush first so everyone is validating data that is as similar as possible
-            StorageService.instance.forceKeyspaceFlush(cfs.keyspace.getName(), cfs.name);
-
-            // we don't mark validating sstables as compacting in DataTracker, so we have to mark them referenced
-            // instead so they won't be cleaned up if they do get compacted during the validation
-            sstables = cfs.markCurrentSSTablesReferenced();
-            if (validator.gcBefore > 0)
-                gcBefore = validator.gcBefore;
-            else
-                gcBefore = getDefaultGcBefore(cfs);
-        }
-
-        CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.desc.range, gcBefore);
-        CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
-        metrics.beginCompaction(ci);
+        Collection<SSTableReader> sstables = null;
         try
         {
-            // validate the CF as we iterate over it
-            validator.prepare(cfs);
-            while (iter.hasNext())
+
+            String snapshotName = validator.desc.sessionId.toString();
+            int gcBefore;
+            boolean isSnapshotValidation = cfs.snapshotExists(snapshotName);
+            if (isSnapshotValidation)
             {
-                if (ci.isStopRequested())
-                    throw new CompactionInterruptedException(ci.getCompactionInfo());
-                AbstractCompactedRow row = iter.next();
-                validator.add(row);
+                // If there is a snapshot created for the session then read from there.
+                sstables = cfs.getSnapshotSSTableReader(snapshotName);
+
+                // Computing gcBefore based on the current time wouldn't be very good because we know each replica will execute
+                // this at a different time (that's the whole purpose of repair with snapshot). So instead we take the creation
+                // time of the snapshot, which should give us roughly the same time on each replica (roughly being, in that case,
+                // 'as good as in the non-snapshot case')
+                gcBefore = cfs.gcBefore(cfs.getSnapshotCreationTime(snapshotName));
             }
-            validator.complete();
+            else
+            {
+                // flush first so everyone is validating data that is as similar as possible
+                StorageService.instance.forceKeyspaceFlush(cfs.keyspace.getName(), cfs.name);
+                // we don't mark validating sstables as compacting in DataTracker, so we have to mark them referenced
+                // instead so they won't be cleaned up if they do get compacted during the validation
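+                // if this validation belongs to a parent repair session, use the sstables registered with that session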
+                if (validator.desc.parentSessionId == null || ActiveRepairService.instance.getParentRepairSession(validator.desc.parentSessionId) == null)
+                    sstables = cfs.markCurrentSSTablesReferenced();
+                else
+                    sstables = ActiveRepairService.instance.getParentRepairSession(validator.desc.parentSessionId).getAndReferenceSSTables(cfs.metadata.cfId);
+
+                if (validator.gcBefore > 0)
+                    gcBefore = validator.gcBefore;
+                else
+                    gcBefore = getDefaultGcBefore(cfs);
+            }
+
+            // Create a Merkle tree suitable to hold the estimated partitions for the given range.
+            // We blindly assume that partitions are evenly distributed across all sstables for now.
+            long numPartitions = 0;
+            for (SSTableReader sstable : sstables)
+            {
+                numPartitions += sstable.estimatedKeysForRanges(Collections.singleton(validator.desc.range));
+            }
+            // determine tree depth from number of partitions, but cap at 20 to prevent large tree.
+            int depth = numPartitions > 0 ? (int) Math.min(Math.floor(Math.log(numPartitions)), 20) : 0;
+            MerkleTree tree = new MerkleTree(cfs.partitioner, validator.desc.range, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, depth));
+
+            long start = System.nanoTime();
+            try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables, validator.desc.range))
+            {
+                CompactionIterable ci = new ValidationCompactionIterable(cfs, scanners.scanners, gcBefore);
+                Iterator<AbstractCompactedRow> iter = ci.iterator();
+                metrics.beginCompaction(ci);
+                try
+                {
+                    // validate the CF as we iterate over it
+                    validator.prepare(cfs, tree);
+                    while (iter.hasNext())
+                    {
+                        if (ci.isStopRequested())
+                            throw new CompactionInterruptedException(ci.getCompactionInfo());
+                        AbstractCompactedRow row = iter.next();
+                        validator.add(row);
+                    }
+                    validator.complete();
+                }
+                finally
+                {
+                    if (isSnapshotValidation)
+                    {
+                        cfs.clearSnapshot(snapshotName);
+                    }
+
+                    metrics.finishCompaction(ci);
+                }
+            }
+
+            if (logger.isDebugEnabled())
+            {
+                // MT serialize may take time
+                long duration = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
+                logger.debug("Validation finished in {} msec, depth {} for {} keys, serialized size {} bytes for {}",
+                             duration,
+                             depth,
+                             numPartitions,
+                             MerkleTree.serializer.serializedSize(tree, 0),
+                             validator.desc);
+            }
         }
         finally
         {
-            iter.close();
-            if (isSnapshotValidation)
-            {
-                for (SSTableReader sstable : sstables)
-                    FileUtils.closeQuietly(sstable);
-                cfs.clearSnapshot(snapshotName);
-            }
-            else
-            {
+            if (sstables != null)
                 SSTableReader.releaseReferences(sstables);
+        }
+    }
+
+    /**
+     * Splits up an sstable into two new sstables. The first of the new tables will store repaired ranges, the second
+     * will store the non-repaired ranges. Once anticompaction is completed, the original sstable is marked as compacted
+     * and subsequently deleted.
+     * @param cfs the column family store the sstables belong to
+     * @param repairedSSTables the sstables to anticompact
+     * @param ranges Repaired ranges to be placed into one of the new sstables. The repaired table will be tracked via
+     * the {@link org.apache.cassandra.io.sstable.metadata.StatsMetadata#repairedAt} field.
+     */
+    private Collection<SSTableReader> doAntiCompaction(ColumnFamilyStore cfs, Collection<Range<Token>> ranges, Collection<SSTableReader> repairedSSTables, long repairedAt)
+    {
+        List<SSTableReader> anticompactedSSTables = new ArrayList<>();
+        int repairedKeyCount = 0;
+        int unrepairedKeyCount = 0;
+        // TODO(5351): we can do better here:
+        int expectedBloomFilterSize = Math.max(cfs.metadata.getMinIndexInterval(), (int)(SSTableReader.getApproximateKeyCount(repairedSSTables)));
+        logger.info("Performing anticompaction on {} sstables", repairedSSTables.size());
+        // iterate over sstables to check if the repaired / unrepaired ranges intersect them.
+        for (SSTableReader sstable : repairedSSTables)
+        {
+            // check that compaction hasn't stolen any sstables used in previous repair sessions
+            // if we need to skip the anticompaction, it will be carried out by the next repair
+            if (!new File(sstable.getFilename()).exists())
+            {
+                logger.info("Skipping anticompaction for {}, required sstable was compacted and is no longer available.", sstable);
+                continue;
             }
 
-            metrics.finishCompaction(ci);
+            logger.info("Anticompacting {}", sstable);
+            Set<SSTableReader> sstableAsSet = new HashSet<>();
+            sstableAsSet.add(sstable);
+
+            File destination = cfs.directories.getDirectoryForNewSSTables();
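+            // one rewriter produces the sstable holding the repaired ranges, the other the unrepaired remainder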
+            SSTableRewriter repairedSSTableWriter = new SSTableRewriter(cfs, sstableAsSet, sstable.maxDataAge, OperationType.ANTICOMPACTION, false);
+            SSTableRewriter unRepairedSSTableWriter = new SSTableRewriter(cfs, sstableAsSet, sstable.maxDataAge, OperationType.ANTICOMPACTION, false);
+
+            AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
+            try (AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(new HashSet<>(Collections.singleton(sstable)));
+                 CompactionController controller = new CompactionController(cfs, sstableAsSet, CFMetaData.DEFAULT_GC_GRACE_SECONDS))
+            {
+                repairedSSTableWriter.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, repairedAt, sstable));
+                unRepairedSSTableWriter.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, ActiveRepairService.UNREPAIRED_SSTABLE, sstable));
+
+                CompactionIterable ci = new CompactionIterable(OperationType.ANTICOMPACTION, scanners.scanners, controller);
+                Iterator<AbstractCompactedRow> iter = ci.iterator();
+                while(iter.hasNext())
+                {
+                    AbstractCompactedRow row = iter.next();
+                    // if current range from sstable is repaired, save it into the new repaired sstable
+                    if (Range.isInRanges(row.key.getToken(), ranges))
+                    {
+                        repairedSSTableWriter.append(row);
+                        repairedKeyCount++;
+                    }
+                    // otherwise save into the new 'non-repaired' table
+                    else
+                    {
+                        unRepairedSSTableWriter.append(row);
+                        unrepairedKeyCount++;
+                    }
+                }
+                // we have the same readers being rewritten by both writers, so we ask the first one NOT to close them
+                // so that the second one can do so safely, without leaving us with references < 0 or any other ugliness
+                repairedSSTableWriter.finish(false, repairedAt);
+                unRepairedSSTableWriter.finish(ActiveRepairService.UNREPAIRED_SSTABLE);
+                // add repaired table with a non-null timestamp field to be saved in SSTableMetadata#repairedAt
+                anticompactedSSTables.addAll(repairedSSTableWriter.finished());
+                anticompactedSSTables.addAll(unRepairedSSTableWriter.finished());
+            }
+            catch (Throwable e)
+            {
+                JVMStabilityInspector.inspectThrowable(e);
+                logger.error("Error anticompacting " + sstable, e);
+                repairedSSTableWriter.abort();
+                unRepairedSSTableWriter.abort();
+            }
         }
+        String format = "Repaired {} keys of {} for {}/{}";
+        logger.debug(format, repairedKeyCount, (repairedKeyCount + unrepairedKeyCount), cfs.keyspace, cfs.getColumnFamilyName());
+        String format2 = "Anticompaction completed successfully, anticompacted from {} to {} sstable(s).";
+        logger.info(format2, repairedSSTables.size(), anticompactedSSTables.size());
+
+        return anticompactedSSTables;
     }
 
     /**
@@ -884,11 +1111,9 @@
 
     private static class ValidationCompactionIterable extends CompactionIterable
     {
-        public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, Range<Token> range, int gcBefore)
+        public ValidationCompactionIterable(ColumnFamilyStore cfs, List<ICompactionScanner> scanners, int gcBefore)
         {
-            super(OperationType.VALIDATION,
-                  cfs.getCompactionStrategy().getScanners(sstables, range),
-                  new ValidationCompactionController(cfs, gcBefore));
+            super(OperationType.VALIDATION, scanners, new ValidationCompactionController(cfs, gcBefore));
         }
     }
 
@@ -906,7 +1131,7 @@
         }
 
         @Override
-        public boolean shouldPurge(DecoratedKey key, long delTimestamp)
+        public long maxPurgeableTimestamp(DecoratedKey key)
         {
             /*
              * The main reason we always purge is that including gcable tombstone would mean that the
@@ -919,7 +1144,7 @@
              * a tombstone that could shadow a column in another sstable, but this is doubly not a concern
              * since validation compaction is read-only.
              */
-            return true;
+            return Long.MAX_VALUE;
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index 5ef4aad..d2ae04a 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -19,34 +19,49 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.*;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 
-import com.google.common.base.Throwables;
+import com.google.common.base.Predicate;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager.CompactionExecutorStatsCollector;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.SSTableRewriter;
+import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.CloseableIterator;
 
 public class CompactionTask extends AbstractCompactionTask
 {
     protected static final Logger logger = LoggerFactory.getLogger(CompactionTask.class);
     protected final int gcBefore;
+    private final boolean offline;
     protected static long totalBytesCompacted = 0;
-    private Set<SSTableReader> toCompact;
     private CompactionExecutorStatsCollector collector;
 
-    public CompactionTask(ColumnFamilyStore cfs, Iterable<SSTableReader> sstables, final int gcBefore)
+    public CompactionTask(ColumnFamilyStore cfs, Iterable<SSTableReader> sstables, int gcBefore, boolean offline)
     {
-        super(cfs, sstables);
+        super(cfs, Sets.newHashSet(sstables));
         this.gcBefore = gcBefore;
-        toCompact = Sets.newHashSet(sstables);
+        this.offline = offline;
     }
 
     public static synchronized long addToTotalBytesCompacted(long bytesCompacted)
@@ -58,23 +73,23 @@
     {
         this.collector = collector;
         run();
-        return toCompact.size();
+        return sstables.size();
     }
 
     public long getExpectedWriteSize()
     {
-        return cfs.getExpectedCompactedFileSize(toCompact, compactionType);
+        return cfs.getExpectedCompactedFileSize(sstables, compactionType);
     }
 
     public boolean reduceScopeForLimitedSpace()
     {
-        if (partialCompactionsAcceptable() && toCompact.size() > 1)
+        if (partialCompactionsAcceptable() && sstables.size() > 1)
         {
             // Try again w/o the largest one.
-            logger.warn("insufficient space to compact all requested files " + StringUtils.join(toCompact, ", "));
+            logger.warn("insufficient space to compact all requested files {}", StringUtils.join(sstables, ", "));
             // Note that we have removed files that are still marked as compacting.
             // This is suboptimal but ok since the caller will unmark all the sstables at the end.
-            return toCompact.remove(cfs.getMaxSizeFile(toCompact));
+            return sstables.remove(cfs.getMaxSizeFile(sstables));
         }
         else
         {
@@ -93,6 +108,9 @@
         // it is not empty, it may compact down to nothing if all rows are deleted.
         assert sstables != null && sstableDirectory != null;
 
+        if (sstables.size() == 0)
+            return;
+
         // Note that the current compaction strategy, is not necessarily the one this task was created under.
         // This should be harmless; see comments to CFS.maybeReloadCompactionStrategy.
         AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
@@ -101,201 +119,162 @@
             cfs.snapshotWithoutFlush(System.currentTimeMillis() + "-compact-" + cfs.name);
 
         // sanity check: all sstables must belong to the same cfs
-        for (SSTableReader sstable : toCompact)
-            assert sstable.descriptor.cfname.equals(cfs.name);
+        assert !Iterables.any(sstables, new Predicate<SSTableReader>()
+        {
+            @Override
+            public boolean apply(SSTableReader sstable)
+            {
+                return !sstable.descriptor.cfname.equals(cfs.name);
+            }
+        });
 
-        UUID taskId = SystemKeyspace.startCompaction(cfs, toCompact);
-
-        CompactionController controller = getCompactionController(toCompact);
-        Set<SSTableReader> actuallyCompact = Sets.difference(toCompact, controller.getFullyExpiredSSTables());
+        UUID taskId = SystemKeyspace.startCompaction(cfs, sstables);
 
         // new sstables from flush can be added during a compaction, but only the compaction can remove them,
         // so in our single-threaded compaction world this is a valid way of determining if we're compacting
         // all the sstables (that existed when we started)
-        logger.info("Compacting {}", toCompact);
+        logger.info("Compacting {}", sstables);
 
         long start = System.nanoTime();
-        long totalkeysWritten = 0;
+        long totalKeysWritten = 0;
 
-        long estimatedTotalKeys = Math.max(cfs.metadata.getIndexInterval(), SSTableReader.getApproximateKeyCount(actuallyCompact, cfs.metadata));
-        long estimatedSSTables = Math.max(1, SSTable.getTotalBytes(actuallyCompact) / strategy.getMaxSSTableBytes());
-        long keysPerSSTable = (long) Math.ceil((double) estimatedTotalKeys / estimatedSSTables);
-        if (logger.isDebugEnabled())
-            logger.debug("Expected bloom filter size : " + keysPerSSTable);
-
-        AbstractCompactionIterable ci = DatabaseDescriptor.isMultithreadedCompaction()
-                                      ? new ParallelCompactionIterable(compactionType, strategy.getScanners(actuallyCompact), controller)
-                                      : new CompactionIterable(compactionType, strategy.getScanners(actuallyCompact), controller);
-        CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
-        Map<DecoratedKey, RowIndexEntry> cachedKeys = new HashMap<DecoratedKey, RowIndexEntry>();
-
-        // we can't preheat until the tracker has been set. This doesn't happen until we tell the cfs to
-        // replace the old entries.  Track entries to preheat here until then.
-        Map<Descriptor, Map<DecoratedKey, RowIndexEntry>> cachedKeyMap =  new HashMap<Descriptor, Map<DecoratedKey, RowIndexEntry>>();
-
-        Collection<SSTableReader> sstables = new ArrayList<SSTableReader>();
-        Collection<SSTableWriter> writers = new ArrayList<SSTableWriter>();
-
-        if (collector != null)
-            collector.beginCompaction(ci);
-        try
+        try (CompactionController controller = getCompactionController(sstables);)
         {
-            if (!iter.hasNext())
+
+            Set<SSTableReader> actuallyCompact = Sets.difference(sstables, controller.getFullyExpiredSSTables());
+
+            long estimatedTotalKeys = Math.max(cfs.metadata.getMinIndexInterval(), SSTableReader.getApproximateKeyCount(actuallyCompact));
+            long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(actuallyCompact) / strategy.getMaxSSTableBytes());
+            long keysPerSSTable = (long) Math.ceil((double) estimatedTotalKeys / estimatedSSTables);
+            logger.debug("Expected bloom filter size : {}", keysPerSSTable);
+
+            try (AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(actuallyCompact))
             {
-                // don't mark compacted in the finally block, since if there _is_ nondeleted data,
-                // we need to sync it (via closeAndOpen) first, so there is no period during which
-                // a crash could cause data loss.
-                cfs.markObsolete(toCompact, compactionType);
-                return;
-            }
+                AbstractCompactionIterable ci = new CompactionIterable(compactionType, scanners.scanners, controller);
+                Iterator<AbstractCompactedRow> iter = ci.iterator();
 
-            SSTableWriter writer = createCompactionWriter(sstableDirectory, keysPerSSTable);
-            writers.add(writer);
-            while (iter.hasNext())
-            {
-                if (ci.isStopRequested())
-                    throw new CompactionInterruptedException(ci.getCompactionInfo());
-
-                AbstractCompactedRow row = iter.next();
-                RowIndexEntry indexEntry = writer.append(row);
-                if (indexEntry == null)
-                {
-                    controller.invalidateCachedRow(row.key);
-                    row.close();
-                    continue;
-                }
-
-                totalkeysWritten++;
-
-                if (DatabaseDescriptor.getPreheatKeyCache())
-                {
-                    for (SSTableReader sstable : actuallyCompact)
-                    {
-                        if (sstable.getCachedPosition(row.key, false) != null)
-                        {
-                            cachedKeys.put(row.key, indexEntry);
-                            break;
-                        }
-                    }
-                }
-
-                if (newSSTableSegmentThresholdReached(writer))
-                {
-                    // tmp = false because later we want to query it with descriptor from SSTableReader
-                    cachedKeyMap.put(writer.descriptor.asTemporary(false), cachedKeys);
-                    writer = createCompactionWriter(sstableDirectory, keysPerSSTable);
-                    writers.add(writer);
-                    cachedKeys = new HashMap<DecoratedKey, RowIndexEntry>();
-                }
-            }
-
-            if (writer.getFilePointer() > 0)
-            {
-                cachedKeyMap.put(writer.descriptor.asTemporary(false), cachedKeys);
-            }
-            else
-            {
-                writer.abort();
-                writers.remove(writer);
-            }
-
-            long maxAge = getMaxDataAge(toCompact);
-            for (SSTableWriter completedWriter : writers)
-                sstables.add(completedWriter.closeAndOpenReader(maxAge));
-        }
-        catch (Throwable t)
-        {
-            for (SSTableWriter writer : writers)
-                writer.abort();
-            // also remove already completed SSTables
-            for (SSTableReader sstable : sstables)
-            {
-                sstable.markObsolete();
-                sstable.releaseReference();
-            }
-            throw Throwables.propagate(t);
-        }
-        finally
-        {
-            controller.close();
-
-            // point of no return -- the new sstables are live on disk; next we'll start deleting the old ones
-            // (in replaceCompactedSSTables)
-            if (taskId != null)
-                SystemKeyspace.finishCompaction(taskId);
-
-            if (collector != null)
-                collector.finishCompaction(ci);
-
-            try
-            {
-                // We don't expect this to throw, but just in case, we do it after the cleanup above, to make sure
-                // we don't end up with compaction information hanging around indefinitely in limbo.
-                iter.close();
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-
-        replaceCompactedSSTables(toCompact, sstables);
-        // TODO: this doesn't belong here, it should be part of the reader to load when the tracker is wired up
-        for (SSTableReader sstable : sstables)
-        {
-            if (sstable.acquireReference())
-            {
+                // output sstables inherit the lowest repairedAt of the inputs
+                long minRepairedAt = getMinRepairedAt(actuallyCompact);
+                // we only need the age of the data that we're actually retaining
+                long maxAge = getMaxDataAge(actuallyCompact);
+                if (collector != null)
+                    collector.beginCompaction(ci);
+                long lastCheckObsoletion = start;
+                SSTableRewriter writer = new SSTableRewriter(cfs, sstables, maxAge, compactionType, offline);
                 try
                 {
-                    sstable.preheat(cachedKeyMap.get(sstable.descriptor));
+                    if (!iter.hasNext())
+                    {
+                        // don't mark compacted in the finally block, since if there _is_ nondeleted data,
+                        // we need to sync it (via closeAndOpen) first, so there is no period during which
+                        // a crash could cause data loss.
+                        cfs.markObsolete(sstables, compactionType);
+                        return;
+                    }
+
+                    writer.switchWriter(createCompactionWriter(sstableDirectory, keysPerSSTable, minRepairedAt));
+                    while (iter.hasNext())
+                    {
+                        if (ci.isStopRequested())
+                            throw new CompactionInterruptedException(ci.getCompactionInfo());
+
+                        AbstractCompactedRow row = iter.next();
+                        if (writer.append(row) != null)
+                        {
+                            totalKeysWritten++;
+                            if (newSSTableSegmentThresholdReached(writer.currentWriter()))
+                            {
+                                writer.switchWriter(createCompactionWriter(sstableDirectory, keysPerSSTable, minRepairedAt));
+                            }
+                        }
+
+                        if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
+                        {
+                            controller.maybeRefreshOverlaps();
+                            lastCheckObsoletion = System.nanoTime();
+                        }
+                    }
+
+                    // don't replace old sstables yet, as we need to mark the compaction finished in the system table
+                    writer.finish(false);
+                }
+                catch (Throwable t)
+                {
+                    writer.abort();
+                    throw t;
                 }
                 finally
                 {
-                    sstable.releaseReference();
+                    // point of no return -- the new sstables are live on disk; next we'll start deleting
+                    // the old ones (via markCompactedSSTablesReplaced below)
+                    if (taskId != null)
+                        SystemKeyspace.finishCompaction(taskId);
+
+                    if (collector != null)
+                        collector.finishCompaction(ci);
                 }
+
+                Collection<SSTableReader> oldSStables = this.sstables;
+                List<SSTableReader> newSStables = writer.finished();
+                if (!offline)
+                    cfs.getDataTracker().markCompactedSSTablesReplaced(oldSStables, newSStables, compactionType);
+
+                // log a bunch of statistics about the result and save to system table compaction_history
+                long dTime = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
+                long startsize = SSTableReader.getTotalBytes(oldSStables);
+                long endsize = SSTableReader.getTotalBytes(newSStables);
+                double ratio = (double) endsize / (double) startsize;
+
+                StringBuilder newSSTableNames = new StringBuilder();
+                for (SSTableReader reader : newSStables)
+                    newSSTableNames.append(reader.descriptor.baseFilename()).append(",");
+
+                double mbps = dTime > 0 ? (double) endsize / (1024 * 1024) / ((double) dTime / 1000) : 0;
+                long totalSourceRows = 0;
+                long[] counts = ci.getMergedRowCounts();
+                StringBuilder mergeSummary = new StringBuilder(counts.length * 10);
+                Map<Integer, Long> mergedRows = new HashMap<>();
+                for (int i = 0; i < counts.length; i++)
+                {
+                    long count = counts[i];
+                    if (count == 0)
+                        continue;
+
+                    int rows = i + 1;
+                    totalSourceRows += rows * count;
+                    mergeSummary.append(String.format("%d:%d, ", rows, count));
+                    mergedRows.put(rows, count);
+                }
+
+                SystemKeyspace.updateCompactionHistory(cfs.keyspace.getName(), cfs.name, System.currentTimeMillis(), startsize, endsize, mergedRows);
+                logger.info(String.format("Compacted %d sstables to [%s].  %,d bytes to %,d (~%d%% of original) in %,dms = %fMB/s.  %,d total partitions merged to %,d.  Partition merge counts were {%s}",
+                                          oldSStables.size(), newSSTableNames.toString(), startsize, endsize, (int) (ratio * 100), dTime, mbps, totalSourceRows, totalKeysWritten, mergeSummary.toString()));
+                logger.debug(String.format("CF Total Bytes Compacted: %,d", CompactionTask.addToTotalBytesCompacted(endsize)));
+                logger.debug("Actual #keys: {}, Estimated #keys: {}, Err%: {}", totalKeysWritten, estimatedTotalKeys, ((double)(totalKeysWritten - estimatedTotalKeys)/totalKeysWritten));
             }
         }
-
-        // log a bunch of statistics about the result and save to system table compaction_history
-        long dTime = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
-        long startsize = SSTable.getTotalBytes(toCompact);
-        long endsize = SSTable.getTotalBytes(sstables);
-        double ratio = (double) endsize / (double) startsize;
-
-        StringBuilder builder = new StringBuilder();
-        for (SSTableReader reader : sstables)
-            builder.append(reader.descriptor.baseFilename()).append(",");
-
-        double mbps = dTime > 0 ? (double) endsize / (1024 * 1024) / ((double) dTime / 1000) : 0;
-        long totalSourceRows = 0;
-        long[] counts = ci.getMergedRowCounts();
-        StringBuilder mergeSummary = new StringBuilder(counts.length * 10);
-        Map<Integer, Long> mergedRows = new HashMap<Integer, Long>();
-        for (int i = 0; i < counts.length; i++)
-        {
-            long count = counts[i];
-            if (count == 0)
-                continue;
-
-            int rows = i + 1;
-            totalSourceRows += rows * count;
-            mergeSummary.append(String.format("%d:%d, ", rows, count));
-            mergedRows.put(rows, count);
-        }
-
-        SystemKeyspace.updateCompactionHistory(cfs.keyspace.getName(), cfs.name, System.currentTimeMillis(), startsize, endsize, mergedRows);
-        logger.info(String.format("Compacted %d sstables to [%s].  %,d bytes to %,d (~%d%% of original) in %,dms = %fMB/s.  %,d total partitions merged to %,d.  Partition merge counts were {%s}",
-                                  toCompact.size(), builder.toString(), startsize, endsize, (int) (ratio * 100), dTime, mbps, totalSourceRows, totalkeysWritten, mergeSummary.toString()));
-        logger.debug(String.format("CF Total Bytes Compacted: %,d", CompactionTask.addToTotalBytesCompacted(endsize)));
     }
 
-    private SSTableWriter createCompactionWriter(File sstableDirectory, long keysPerSSTable)
+    private long getMinRepairedAt(Set<SSTableReader> actuallyCompact)
+    {
+        long minRepairedAt = Long.MAX_VALUE;
+        for (SSTableReader sstable : actuallyCompact)
+            minRepairedAt = Math.min(minRepairedAt, sstable.getSSTableMetadata().repairedAt);
+        if (minRepairedAt == Long.MAX_VALUE)
+            return ActiveRepairService.UNREPAIRED_SSTABLE;
+        return minRepairedAt;
+    }
+
+    private SSTableWriter createCompactionWriter(File sstableDirectory, long keysPerSSTable, long repairedAt)
     {
         return new SSTableWriter(cfs.getTempSSTablePath(sstableDirectory),
                                  keysPerSSTable,
+                                 repairedAt,
                                  cfs.metadata,
                                  cfs.partitioner,
-                                 SSTableMetadata.createCollector(toCompact, cfs.metadata.comparator, getLevel()));
+                                 new MetadataCollector(sstables, cfs.metadata.comparator, getLevel()));
     }
 
     protected int getLevel()
@@ -303,11 +282,6 @@
         return 0;
     }
 
-    protected void replaceCompactedSSTables(Collection<SSTableReader> compacted, Collection<SSTableReader> replacements)
-    {
-        cfs.replaceCompactedSSTables(compacted, replacements, compactionType);
-    }
-
     protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
     {
         return new CompactionController(cfs, toCompact, gcBefore);
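
Note on the CompactionTask hunk above: the per-compaction statistics are derived from ci.getMergedRowCounts(), where index i holds the number of output partitions that were merged from i + 1 input versions, and throughput is computed from the post-compaction size. A minimal, self-contained sketch of that arithmetic (plain Java with hypothetical sample values, not the actual Cassandra classes):

import java.util.HashMap;
import java.util.Map;

public class MergeSummarySketch
{
    public static void main(String[] args)
    {
        // counts[i] = number of output partitions merged from (i + 1) input sstables (hypothetical values)
        long[] counts = {120, 30, 5};
        long startsize = 600L * 1024 * 1024;  // bytes before compaction (hypothetical)
        long endsize = 450L * 1024 * 1024;    // bytes after compaction (hypothetical)
        long dTime = 12_000;                  // elapsed milliseconds (hypothetical)

        long totalSourceRows = 0;
        StringBuilder mergeSummary = new StringBuilder();
        Map<Integer, Long> mergedRows = new HashMap<>();
        for (int i = 0; i < counts.length; i++)
        {
            long count = counts[i];
            if (count == 0)
                continue;
            int rows = i + 1;
            totalSourceRows += rows * count;  // each such partition consumed 'rows' source versions
            mergeSummary.append(String.format("%d:%d, ", rows, count));
            mergedRows.put(rows, count);
        }

        double ratio = (double) endsize / (double) startsize;
        double mbps = dTime > 0 ? (double) endsize / (1024 * 1024) / ((double) dTime / 1000) : 0;
        System.out.printf("~%d%% of original, %f MB/s, %,d source partitions, merge counts {%s}%n",
                          (int) (ratio * 100), mbps, totalSourceRows, mergeSummary);
        System.out.println("merged rows histogram (saved to compaction_history in the real code): " + mergedRows);
    }
}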
diff --git a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
index 9c708db..8c997ed 100644
--- a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
@@ -59,7 +59,7 @@
                 return null;
 
             if (cfs.getDataTracker().markCompacting(latestBucket))
-                return new CompactionTask(cfs, latestBucket, gcBefore);
+                return new CompactionTask(cfs, latestBucket, gcBefore, false);
         }
     }
 
@@ -75,12 +75,37 @@
 
         int base = cfs.getMinimumCompactionThreshold();
         long now = getNow();
-
         Iterable<SSTableReader> candidates = filterSuspectSSTables(cfs.getUncompactingSSTables());
 
-        List<SSTableReader> mostInteresting = getCompactionCandidates(candidates, now, base);
-        if (mostInteresting != null)
-            return mostInteresting;
+        Set<SSTableReader> repairedCandidates = new HashSet<>();
+        Set<SSTableReader> unRepairedCandidates = new HashSet<>();
+        for (SSTableReader sstable : candidates)
+        {
+            if (sstable.isRepaired())
+            {
+                repairedCandidates.add(sstable);
+            }
+            else
+            {
+                unRepairedCandidates.add(sstable);
+            }
+        }
+
+
+        List<SSTableReader> mostInterestingRepaired = getCompactionCandidates(repairedCandidates, now, base);
+        List<SSTableReader> mostInterestingUnrepaired = getCompactionCandidates(unRepairedCandidates, now, base);
+        if (mostInterestingRepaired != null && mostInterestingUnrepaired != null)
+        {
+            return mostInterestingRepaired.size() > mostInterestingUnrepaired.size() ? mostInterestingRepaired : mostInterestingUnrepaired;
+        }
+        else if (mostInterestingRepaired != null)
+        {
+            return mostInterestingRepaired;
+        }
+        else if (mostInterestingUnrepaired != null)
+        {
+            return mostInterestingUnrepaired;
+        }
 
         // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
         // ratio is greater than threshold.
@@ -320,13 +345,13 @@
     }
 
     @Override
-    public synchronized AbstractCompactionTask getMaximalTask(int gcBefore)
+    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore)
     {
         Iterable<SSTableReader> sstables = cfs.markAllCompacting();
         if (sstables == null)
             return null;
 
-        return new CompactionTask(cfs, sstables, gcBefore);
+        return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, sstables, gcBefore, false));
     }
 
     @Override
@@ -340,7 +365,7 @@
             return null;
         }
 
-        return new CompactionTask(cfs, sstables, gcBefore).setUserDefined(true);
+        return new CompactionTask(cfs, sstables, gcBefore, false).setUserDefined(true);
     }
 
     public int getEstimatedRemainingTasks()
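
Illustration for the DateTieredCompactionStrategy hunks above: candidates are partitioned by repair status so a single task never mixes repaired and unrepaired sstables, and when both partitions yield something worth compacting the larger candidate list wins. A simplified, stand-alone model of that selection (a stand-in SSTable record and a hypothetical candidatesFor() placeholder, not the real strategy API):

import java.util.*;

public class RepairAwareSelectionSketch
{
    // stand-in for SSTableReader; only the repair flag matters for this sketch
    record SSTable(String name, boolean repaired) {}

    // hypothetical stand-in for getCompactionCandidates(): null when nothing is worth compacting
    static List<SSTable> candidatesFor(Set<SSTable> bucket)
    {
        return bucket.size() >= 2 ? new ArrayList<>(bucket) : null;
    }

    static List<SSTable> nextBackgroundCandidates(Collection<SSTable> uncompacting)
    {
        Set<SSTable> repaired = new HashSet<>();
        Set<SSTable> unrepaired = new HashSet<>();
        for (SSTable s : uncompacting)
            (s.repaired() ? repaired : unrepaired).add(s);  // never mix repair states in one task

        List<SSTable> mostInterestingRepaired = candidatesFor(repaired);
        List<SSTable> mostInterestingUnrepaired = candidatesFor(unrepaired);
        if (mostInterestingRepaired != null && mostInterestingUnrepaired != null)
            return mostInterestingRepaired.size() > mostInterestingUnrepaired.size()
                   ? mostInterestingRepaired
                   : mostInterestingUnrepaired;
        return mostInterestingRepaired != null ? mostInterestingRepaired : mostInterestingUnrepaired;
    }

    public static void main(String[] args)
    {
        List<SSTable> pool = List.of(new SSTable("a", true), new SSTable("b", true),
                                     new SSTable("c", false), new SSTable("d", false),
                                     new SSTable("e", false));
        System.out.println(nextBackgroundCandidates(pool));  // picks the larger (unrepaired) bucket
    }
}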
diff --git a/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java b/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java
index 2757411..fa59dba 100644
--- a/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java
+++ b/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.security.MessageDigest;
@@ -31,12 +30,12 @@
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.sstable.ColumnNameHelper;
 import org.apache.cassandra.io.sstable.ColumnStats;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableWriter;
 import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.MergeIterator;
 import org.apache.cassandra.utils.StreamingHistogram;
 
@@ -47,17 +46,18 @@
  * in memory at a time is the bloom filter, the index, and one column from each
  * pre-compaction row.
  */
-public class LazilyCompactedRow extends AbstractCompactedRow implements Iterable<OnDiskAtom>
+public class LazilyCompactedRow extends AbstractCompactedRow
 {
     private final List<? extends OnDiskAtomIterator> rows;
     private final CompactionController controller;
-    private final boolean shouldPurge;
+    private final long maxPurgeableTimestamp;
     private final ColumnFamily emptyColumnFamily;
-    private Reducer reducer;
     private ColumnStats columnStats;
     private boolean closed;
     private ColumnIndex.Builder indexBuilder;
     private final SecondaryIndexManager.Updater indexer;
+    private final Reducer reducer;
+    private final Iterator<OnDiskAtom> merger;
     private DeletionTime maxRowTombstone;
 
     public LazilyCompactedRow(CompactionController controller, List<? extends OnDiskAtomIterator> rows)
@@ -65,7 +65,7 @@
         super(rows.get(0).getKey());
         this.rows = rows;
         this.controller = controller;
-        indexer = controller.cfs.indexManager.updaterFor(key);
+        indexer = controller.cfs.indexManager.gcUpdaterFor(key);
 
         // Combine top-level tombstones, keeping the one with the highest markedForDeleteAt timestamp.  This may be
         // purged (depending on gcBefore), but we need to remember it to properly delete columns during the merge
@@ -77,28 +77,37 @@
                 maxRowTombstone = rowTombstone;
         }
 
+        // tombstones with a timestamp below this (and a localDeletionTime older than gcBefore) can be purged.
+        // This is the minimum timestamp of any sstable containing `key` outside of the set of sstables involved in this compaction.
+        maxPurgeableTimestamp = controller.maxPurgeableTimestamp(key);
 
-        // Don't pass maxTombstoneTimestamp to shouldPurge since we might well have cells with
-        // tombstones newer than the row-level tombstones we've seen -- but we won't know that
-        // until we iterate over them.  By passing MAX_VALUE we will only purge if there are
-        // no other versions of this row present.
-        this.shouldPurge = controller.shouldPurge(key, Long.MAX_VALUE);
-
-        emptyColumnFamily = EmptyColumns.factory.create(controller.cfs.metadata);
+        emptyColumnFamily = ArrayBackedSortedColumns.factory.create(controller.cfs.metadata);
         emptyColumnFamily.delete(maxRowTombstone);
-        if (shouldPurge)
+        if (maxRowTombstone.markedForDeleteAt < maxPurgeableTimestamp)
             emptyColumnFamily.purgeTombstones(controller.gcBefore);
+
+        reducer = new Reducer();
+        merger = Iterators.filter(MergeIterator.get(rows, emptyColumnFamily.getComparator().onDiskAtomComparator(), reducer), Predicates.notNull());
     }
 
-    public RowIndexEntry write(long currentPosition, DataOutput out) throws IOException
+    private static void removeDeleted(ColumnFamily cf, boolean shouldPurge, DecoratedKey key, CompactionController controller)
+    {
+        // We should only purge cell tombstones if shouldPurge is true, but regardless, it's still ok to remove cells that
+        // are shadowed by a row or range tombstone; removeDeletedColumnsOnly(cf, Integer.MIN_VALUE) will accomplish this
+        // without purging tombstones.
+        int overriddenGCBefore = shouldPurge ? controller.gcBefore : Integer.MIN_VALUE;
+        ColumnFamilyStore.removeDeletedColumnsOnly(cf, overriddenGCBefore, controller.cfs.indexManager.gcUpdaterFor(key));
+    }
+
+    public RowIndexEntry write(long currentPosition, DataOutputPlus out) throws IOException
     {
         assert !closed;
 
         ColumnIndex columnsIndex;
         try
         {
-            indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.key, out);
-            columnsIndex = indexBuilder.buildForCompaction(iterator());
+            indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.getKey(), out);
+            columnsIndex = indexBuilder.buildForCompaction(merger);
 
             // if there aren't any columns or tombstones, return null
             if (columnsIndex.columnsIndex.isEmpty() && !emptyColumnFamily.isMarkedForDelete())
@@ -109,16 +118,14 @@
             throw new RuntimeException(e);
         }
         // reach into the reducer (created during iteration) to get column count, size, max column timestamp
-        // (however, if there are zero columns, iterator() will not be called by ColumnIndexer and reducer will be null)
-        columnStats = new ColumnStats(reducer == null ? 0 : reducer.columns,
-                                      reducer == null ? Long.MAX_VALUE : reducer.minTimestampTracker.get(),
-                                      reducer == null ? emptyColumnFamily.maxTimestamp() : Math.max(emptyColumnFamily.maxTimestamp(), reducer.maxTimestampTracker.get()),
-                                      reducer == null ? Integer.MIN_VALUE : reducer.maxDeletionTimeTracker.get(),
-                                      reducer == null ? new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE) : reducer.tombstones,
-                                      reducer == null ? Collections.<ByteBuffer>emptyList() : reducer.minColumnNameSeen,
-                                      reducer == null ? Collections.<ByteBuffer>emptyList() : reducer.maxColumnNameSeen
-        );
-        reducer = null;
+        columnStats = new ColumnStats(reducer.columns,
+                                      reducer.minTimestampTracker.get(),
+                                      Math.max(emptyColumnFamily.deletionInfo().maxTimestamp(), reducer.maxTimestampTracker.get()),
+                                      reducer.maxDeletionTimeTracker.get(),
+                                      reducer.tombstones,
+                                      reducer.minColumnNameSeen,
+                                      reducer.maxColumnNameSeen,
+                                      reducer.hasLegacyCounterShards);
 
         // in case no columns were ever written, we may still need to write an empty header with a top-level tombstone
         indexBuilder.maybeWriteEmptyRowHeader();
@@ -149,25 +156,12 @@
         }
 
         // initialize indexBuilder for the benefit of its tombstoneTracker, used by our reducing iterator
-        indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.key, out);
-        Iterator<OnDiskAtom> iter = iterator();
-        while (iter.hasNext())
-            iter.next().updateDigest(digest);
+        indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.getKey(), out);
+        while (merger.hasNext())
+            merger.next().updateDigest(digest);
         close();
     }
 
-    public AbstractType<?> getComparator()
-    {
-        return emptyColumnFamily.getComparator();
-    }
-
-    public Iterator<OnDiskAtom> iterator()
-    {
-        reducer = new Reducer();
-        Iterator<OnDiskAtom> iter = MergeIterator.get(rows, getComparator().onDiskAtomComparator, reducer);
-        return Iterators.filter(iter, Predicates.notNull());
-    }
-
     public ColumnStats columnStats()
     {
         return columnStats;
@@ -213,6 +207,7 @@
         StreamingHistogram tombstones = new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
         List<ByteBuffer> minColumnNameSeen = Collections.emptyList();
         List<ByteBuffer> maxColumnNameSeen = Collections.emptyList();
+        boolean hasLegacyCounterShards = false;
 
         public Reducer()
         {
@@ -229,23 +224,20 @@
         {
             if (current instanceof RangeTombstone)
             {
-                if (tombstone == null || current.maxTimestamp() >= tombstone.maxTimestamp())
+                if (tombstone == null || current.timestamp() >= tombstone.timestamp())
                     tombstone = (RangeTombstone)current;
             }
             else
             {
-                Column column = (Column) current;
-                container.addColumn(column);
+                Cell cell = (Cell) current;
+                container.addColumn(cell);
 
                 // skip the index-update checks if there is no indexing needed since they are a bit expensive
                 if (indexer == SecondaryIndexManager.nullUpdater)
                     return;
 
-                if (!column.isMarkedForDelete(System.currentTimeMillis())
-                    && !container.getColumn(column.name()).equals(column))
-                {
-                    indexer.remove(column);
-                }
+                if (cell.isLive() && !container.getColumn(cell.name()).equals(cell))
+                    indexer.remove(cell);
             }
         }
 
@@ -259,7 +251,7 @@
                 RangeTombstone t = tombstone;
                 tombstone = null;
 
-                if (shouldPurge && t.data.isGcAble(controller.gcBefore))
+                if (t.timestamp() < maxPurgeableTimestamp && t.data.isGcAble(controller.gcBefore))
                 {
                     indexBuilder.tombstoneTracker().update(t, true);
                     return null;
@@ -267,51 +259,57 @@
                 else
                 {
                     tombstones.update(t.getLocalDeletionTime());
-                    minTimestampTracker.update(t.minTimestamp());
-                    maxTimestampTracker.update(t.maxTimestamp());
+                    minTimestampTracker.update(t.timestamp());
+                    maxTimestampTracker.update(t.timestamp());
                     maxDeletionTimeTracker.update(t.getLocalDeletionTime());
                     minColumnNameSeen = ColumnNameHelper.minComponents(minColumnNameSeen, t.min, controller.cfs.metadata.comparator);
                     maxColumnNameSeen = ColumnNameHelper.maxComponents(maxColumnNameSeen, t.max, controller.cfs.metadata.comparator);
-
                     return t;
                 }
             }
             else
             {
+                boolean shouldPurge = container.getSortedColumns().iterator().next().timestamp() < maxPurgeableTimestamp;
                 // when we clear() the container, it removes the deletion info, so this needs to be reset each time
                 container.delete(maxRowTombstone);
-                ColumnFamily purged = PrecompactedRow.removeDeleted(key, shouldPurge, controller, container);
-                if (purged == null || !purged.iterator().hasNext())
+                removeDeleted(container, shouldPurge, key, controller);
+                Iterator<Cell> iter = container.iterator();
+                if (!iter.hasNext())
                 {
                     // don't call clear() because that resets the deletion time. See CASSANDRA-7808.
-                    container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());;
+                    container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());
                     return null;
                 }
-                Column reduced = purged.iterator().next();
+
+                int localDeletionTime = container.deletionInfo().getTopLevelDeletion().localDeletionTime;
+                if (localDeletionTime < Integer.MAX_VALUE)
+                    tombstones.update(localDeletionTime);
+
+                Cell reduced = iter.next();
                 container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());
 
-                // PrecompactedRow.removeDeleted has only checked the top-level CF deletion times,
-                // not the range tombstones. For that we use the columnIndexer tombstone tracker.
+                // removeDeleted has only checked the top-level CF deletion times,
+                // not the range tombstones. For that we use the columnIndexer tombstone tracker.
                 if (indexBuilder.tombstoneTracker().isDeleted(reduced))
                 {
                     indexer.remove(reduced);
                     return null;
                 }
-                int localDeletionTime = purged.deletionInfo().getTopLevelDeletion().localDeletionTime;
-                if (localDeletionTime < Integer.MAX_VALUE)
-                    tombstones.update(localDeletionTime);
+
                 columns++;
-                minTimestampTracker.update(reduced.minTimestamp());
-                maxTimestampTracker.update(reduced.maxTimestamp());
+                minTimestampTracker.update(reduced.timestamp());
+                maxTimestampTracker.update(reduced.timestamp());
                 maxDeletionTimeTracker.update(reduced.getLocalDeletionTime());
                 minColumnNameSeen = ColumnNameHelper.minComponents(minColumnNameSeen, reduced.name(), controller.cfs.metadata.comparator);
                 maxColumnNameSeen = ColumnNameHelper.maxComponents(maxColumnNameSeen, reduced.name(), controller.cfs.metadata.comparator);
 
                 int deletionTime = reduced.getLocalDeletionTime();
                 if (deletionTime < Integer.MAX_VALUE)
-                {
                     tombstones.update(deletionTime);
-                }
+
+                if (reduced instanceof CounterCell)
+                    hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) reduced).hasLegacyShards();
+
                 return reduced;
             }
         }
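
Illustration for the LazilyCompactedRow hunks above: the single shouldPurge flag is replaced by a per-partition maxPurgeableTimestamp, and a tombstone is only dropped when its write timestamp is below that bound and it is also gcAble (its local deletion time is older than gcBefore). A self-contained sketch of that two-part check, with isGcAble modeled as a plain comparison and all values hypothetical:

public class TombstonePurgeSketch
{
    // stand-in for a tombstone: write timestamp (microseconds) and local deletion time (seconds)
    record Tombstone(long timestamp, int localDeletionTime) {}

    /**
     * maxPurgeableTimestamp: minimum timestamp of any sstable outside this compaction that may
     * still contain the partition; gcBefore: the gc_grace cutoff in seconds.
     */
    static boolean canPurge(Tombstone t, long maxPurgeableTimestamp, int gcBefore)
    {
        // both conditions must hold: nothing older exists elsewhere that the tombstone still shadows,
        // and gc_grace has expired for the tombstone itself
        return t.timestamp() < maxPurgeableTimestamp && t.localDeletionTime() < gcBefore;
    }

    public static void main(String[] args)
    {
        Tombstone t = new Tombstone(1_000L, 100);      // hypothetical values
        System.out.println(canPurge(t, 2_000L, 200));  // true: safe to drop
        System.out.println(canPurge(t, 500L, 200));    // false: an older sstable elsewhere may still hold shadowed data
    }
}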
diff --git a/src/java/org/apache/cassandra/db/compaction/LegacyLeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LegacyLeveledManifest.java
deleted file mode 100644
index e10be10..0000000
--- a/src/java/org/apache/cassandra/db/compaction/LegacyLeveledManifest.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.Pair;
-import org.codehaus.jackson.JsonNode;
-import org.codehaus.jackson.map.ObjectMapper;
-
-/**
- * This class was added to be able to migrate pre-CASSANDRA-4782 leveled manifests into the sstable metadata
- *
- * @deprecated since it can be removed in a future revision.
- */
-@Deprecated
-public class LegacyLeveledManifest
-{
-    private static final Logger logger = LoggerFactory.getLogger(LegacyLeveledManifest.class);
-
-    private Map<Integer, Integer> sstableLevels;
-
-    private LegacyLeveledManifest(File path) throws IOException
-    {
-        sstableLevels = new HashMap<Integer, Integer>();
-        ObjectMapper m = new ObjectMapper();
-        JsonNode rootNode = m.readValue(path, JsonNode.class);
-        JsonNode generations = rootNode.get("generations");
-        assert generations.isArray();
-        for (JsonNode generation : generations)
-        {
-            int level = generation.get("generation").getIntValue();
-            JsonNode generationValues = generation.get("members");
-            for (JsonNode generationValue : generationValues)
-            {
-                sstableLevels.put(generationValue.getIntValue(), level);
-            }
-        }
-    }
-
-    private int levelOf(int sstableGeneration)
-    {
-        return sstableLevels.containsKey(sstableGeneration) ? sstableLevels.get(sstableGeneration) : 0;
-    }
-
-    /**
-     * We need to migrate if there is a legacy leveledmanifest json-file
-     * <p/>
-     * If there is no jsonfile, we can just start as normally, sstable level will be at 0 for all sstables.
-     *
-     * @param keyspace
-     * @param columnFamily
-     * @return
-     */
-    public static boolean manifestNeedsMigration(String keyspace, String columnFamily)
-    {
-        return Directories.create(keyspace, columnFamily).tryGetLeveledManifest() != null;
-    }
-
-    public static void migrateManifests(String keyspace, String columnFamily) throws IOException
-    {
-        logger.info("Migrating manifest for {}/{}", keyspace, columnFamily);
-
-        snapshotWithoutCFS(keyspace, columnFamily);
-        Directories directories = Directories.create(keyspace, columnFamily);
-        File manifestFile = directories.tryGetLeveledManifest();
-        if (manifestFile == null)
-            return;
-
-        LegacyLeveledManifest legacyManifest = new LegacyLeveledManifest(manifestFile);
-        for (Map.Entry<Descriptor, Set<Component>> entry : directories.sstableLister().includeBackups(false).skipTemporary(true).list().entrySet())
-        {
-            Descriptor d = entry.getKey();
-            Pair<SSTableMetadata, Set<Integer>> oldMetadata = SSTableMetadata.serializer.deserialize(d, false);
-            String metadataFilename = d.filenameFor(Component.STATS);
-            LeveledManifest.mutateLevel(oldMetadata, d, metadataFilename, legacyManifest.levelOf(d.generation));
-        }
-        FileUtils.deleteWithConfirm(manifestFile);
-    }
-
-    /**
-     * Snapshot a CF without having to load the sstables in that directory
-     *
-     * @param keyspace
-     * @param columnFamily
-     * @throws IOException
-     */
-    public static void snapshotWithoutCFS(String keyspace, String columnFamily) throws IOException
-    {
-        Directories directories = Directories.create(keyspace, columnFamily);
-        String snapshotName = "pre-sstablemetamigration";
-        logger.info("Snapshotting {}, {} to {}", keyspace, columnFamily, snapshotName);
-
-        for (Map.Entry<Descriptor, Set<Component>> entry : directories.sstableLister().includeBackups(false).skipTemporary(true).list().entrySet())
-        {
-            Descriptor descriptor = entry.getKey();
-            File snapshotDirectoryPath = Directories.getSnapshotDirectory(descriptor, snapshotName);
-            for (Component component : entry.getValue())
-            {
-                File sourceFile = new File(descriptor.filenameFor(component));
-                File targetLink = new File(snapshotDirectoryPath, sourceFile.getName());
-                FileUtils.createHardLink(sourceFile, targetLink);
-            }
-        }
-
-        File manifestFile = directories.tryGetLeveledManifest();
-        if (manifestFile != null)
-        {
-            File snapshotDirectory = new File(new File(manifestFile.getParentFile(), Directories.SNAPSHOT_SUBDIR), snapshotName);
-            if (!snapshotDirectory.exists())
-                snapshotDirectory.mkdirs();
-            File target = new File(snapshotDirectory, manifestFile.getName());
-            FileUtils.createHardLink(manifestFile, target);
-        }
-    }
-}
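
Context for the deleted LegacyLeveledManifest above: the pre-CASSANDRA-4782 JSON manifest mapped sstable generation numbers to levels, and the migration wrote each level into the sstable's STATS metadata; with the level now carried in per-sstable metadata (mutated via descriptor.getMetadataSerializer().mutateLevel elsewhere in this change), the JSON file and its migration path go away. A tiny stand-alone model of the old generation-to-level lookup (plain Java map, no Jackson, hypothetical data):

import java.util.HashMap;
import java.util.Map;

public class LegacyManifestLookupSketch
{
    // generation number -> level, as the legacy JSON manifest encoded it
    private final Map<Integer, Integer> sstableLevels = new HashMap<>();

    void record(int level, int... memberGenerations)
    {
        for (int generation : memberGenerations)
            sstableLevels.put(generation, level);
    }

    int levelOf(int sstableGeneration)
    {
        // unknown generations default to L0, matching the removed levelOf()
        return sstableLevels.getOrDefault(sstableGeneration, 0);
    }

    public static void main(String[] args)
    {
        LegacyManifestLookupSketch manifest = new LegacyManifestLookupSketch();
        manifest.record(1, 12, 13);  // hypothetical: generations 12 and 13 were in L1
        manifest.record(2, 20);
        System.out.println(manifest.levelOf(13));  // 1
        System.out.println(manifest.levelOf(99));  // 0 (not listed in the manifest)
    }
}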
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
index 8637a2e..7f2d881 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
@@ -32,12 +32,12 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.notifications.INotification;
 import org.apache.cassandra.notifications.INotificationConsumer;
 import org.apache.cassandra.notifications.SSTableAddedNotification;
 import org.apache.cassandra.notifications.SSTableListChangedNotification;
+import org.apache.cassandra.notifications.SSTableRepairStatusChanged;
 
 public class LeveledCompactionStrategy extends AbstractCompactionStrategy implements INotificationConsumer
 {
@@ -109,11 +109,13 @@
     {
         if (!isEnabled())
             return null;
-
-        return getMaximalTask(gcBefore);
+        Collection<AbstractCompactionTask> tasks = getMaximalTask(gcBefore);
+        if (tasks == null || tasks.size() == 0)
+            return null;
+        return tasks.iterator().next();
     }
 
-    public AbstractCompactionTask getMaximalTask(int gcBefore)
+    public Collection<AbstractCompactionTask> getMaximalTask(int gcBefore)
     {
         while (true)
         {
@@ -142,7 +144,7 @@
             {
                 LeveledCompactionTask newTask = new LeveledCompactionTask(cfs, candidate.sstables, candidate.level, gcBefore, candidate.maxSSTableBytes);
                 newTask.setCompactionType(op);
-                return newTask;
+                return Arrays.<AbstractCompactionTask>asList(newTask);
             }
         }
     }
@@ -185,6 +187,10 @@
             SSTableListChangedNotification listChangedNotification = (SSTableListChangedNotification) notification;
             manifest.replace(listChangedNotification.removed, listChangedNotification.added);
         }
+        else if (notification instanceof SSTableRepairStatusChanged)
+        {
+            manifest.repairStatusChanged(((SSTableRepairStatusChanged) notification).sstable);
+        }
     }
 
     public long getMaxSSTableBytes()
@@ -192,33 +198,53 @@
         return maxSSTableSizeInMB * 1024L * 1024L;
     }
 
-    public List<ICompactionScanner> getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
     {
         Multimap<Integer, SSTableReader> byLevel = ArrayListMultimap.create();
         for (SSTableReader sstable : sstables)
-            byLevel.get(sstable.getSSTableLevel()).add(sstable);
-
-        List<ICompactionScanner> scanners = new ArrayList<ICompactionScanner>(sstables.size());
-        for (Integer level : byLevel.keySet())
         {
-            // level can be -1 when sstables are added to DataTracker but not to LeveledManifest
-            // since we don't know which level those sstable belong yet, we simply do the same as L0 sstables.
-            if (level <= 0)
-            {
-                // L0 makes no guarantees about overlapping-ness.  Just create a direct scanner for each
-                for (SSTableReader sstable : byLevel.get(level))
-                    scanners.add(sstable.getScanner(range, CompactionManager.instance.getRateLimiter()));
-            }
+            if (manifest.hasRepairedData() && !sstable.isRepaired())
+                byLevel.get(0).add(sstable);
             else
-            {
-                // Create a LeveledScanner that only opens one sstable at a time, in sorted order
-                List<SSTableReader> intersecting = LeveledScanner.intersecting(byLevel.get(level), range);
-                if (!intersecting.isEmpty())
-                    scanners.add(new LeveledScanner(intersecting, range));
-            }
+                byLevel.get(sstable.getSSTableLevel()).add(sstable);
         }
 
-        return scanners;
+        List<ICompactionScanner> scanners = new ArrayList<ICompactionScanner>(sstables.size());
+        try
+        {
+            for (Integer level : byLevel.keySet())
+            {
+                // level can be -1 when sstables are added to DataTracker but not to LeveledManifest
+                // since we don't know which level those sstables belong to yet, we simply treat them the same as L0 sstables.
+                if (level <= 0)
+                {
+                    // L0 makes no guarantees about overlapping-ness.  Just create a direct scanner for each
+                    for (SSTableReader sstable : byLevel.get(level))
+                        scanners.add(sstable.getScanner(range, CompactionManager.instance.getRateLimiter()));
+                }
+                else
+                {
+                    // Create a LeveledScanner that only opens one sstable at a time, in sorted order
+                    List<SSTableReader> intersecting = LeveledScanner.intersecting(byLevel.get(level), range);
+                    if (!intersecting.isEmpty())
+                        scanners.add(new LeveledScanner(intersecting, range));
+                }
+            }
+        }
+        catch (Throwable t)
+        {
+            try
+            {
+                new ScannerList(scanners).close();
+            }
+            catch (Throwable t2)
+            {
+                t.addSuppressed(t2);
+            }
+            throw t;
+        }
+
+        return new ScannerList(scanners);
     }
 
     // Lazily creates SSTableBoundedScanner for sstable that are assumed to be from the
@@ -253,7 +279,7 @@
             }
 
             totalLength = length;
-            Collections.sort(this.sstables, SSTable.sstableComparator);
+            Collections.sort(this.sstables, SSTableReader.sstableComparator);
             sstableIterator = this.sstables.iterator();
             assert sstableIterator.hasNext(); // caller should check intersecting first
             currentScanner = sstableIterator.next().getScanner(range, CompactionManager.instance.getRateLimiter());
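
Illustration for the getScanners change above: scanners are now opened inside a try/catch so that, if opening a later scanner throws, everything already opened is closed and any secondary failure is attached to the original via addSuppressed. A generic, self-contained sketch of that cleanup-on-failure pattern using plain AutoCloseable resources (not the Cassandra scanner or ScannerList types):

import java.util.ArrayList;
import java.util.List;

public class CleanupOnFailureSketch
{
    static List<AutoCloseable> openAll(int count, int failAt)
    {
        List<AutoCloseable> opened = new ArrayList<>();
        try
        {
            for (int i = 0; i < count; i++)
            {
                if (i == failAt)
                    throw new RuntimeException("failed opening resource " + i);  // simulated failure
                int id = i;
                opened.add(() -> System.out.println("closed " + id));
            }
            return opened;
        }
        catch (Throwable t)
        {
            // close whatever was already opened, keeping the original failure as the primary exception
            for (AutoCloseable resource : opened)
            {
                try { resource.close(); }
                catch (Throwable t2) { t.addSuppressed(t2); }
            }
            throw t;
        }
    }

    public static void main(String[] args)
    {
        try { openAll(5, 3); }
        catch (Exception e) { System.out.println("caught: " + e.getMessage()); }
    }
}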
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
index f64f633..2731b6d 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
@@ -30,7 +30,7 @@
 
     public LeveledCompactionTask(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, int level, final int gcBefore, long maxSSTableBytes)
     {
-        super(cfs, sstables, gcBefore);
+        super(cfs, sstables, gcBefore, false);
         this.level = level;
         this.maxSSTableBytes = maxSSTableBytes;
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index b704523..a4d2115 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.io.DataOutputStream;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.*;
 
@@ -42,16 +40,12 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 
 public class LeveledManifest
 {
     private static final Logger logger = LoggerFactory.getLogger(LeveledManifest.class);
 
-    public static final String EXTENSION = ".json";
-
     /**
      * limit the number of L0 sstables we do at once, because compaction bloom filter creation
      * uses a pessimistic estimate of how many keys overlap (none), so we risk wasting memory
@@ -66,15 +60,20 @@
     private static final int NO_COMPACTION_LIMIT = 25;
 
     private final ColumnFamilyStore cfs;
-    private final List<SSTableReader>[] generations;
+    @VisibleForTesting
+    protected final List<SSTableReader>[] generations;
+    @VisibleForTesting
+    protected final List<SSTableReader> unrepairedL0;
     private final RowPosition[] lastCompactedKeys;
     private final int maxSSTableSizeInBytes;
     private final SizeTieredCompactionStrategyOptions options;
+    private boolean hasRepairedData = false;
     private final int [] compactionCounter;
 
     private LeveledManifest(ColumnFamilyStore cfs, int maxSSTableSizeInMB, SizeTieredCompactionStrategyOptions options)
     {
         this.cfs = cfs;
+        this.hasRepairedData = cfs.getRepairedSSTables().size() > 0;
         this.maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024 * 1024;
         this.options = options;
 
@@ -86,9 +85,10 @@
         lastCompactedKeys = new RowPosition[n];
         for (int i = 0; i < generations.length; i++)
         {
-            generations[i] = new ArrayList<SSTableReader>();
+            generations[i] = new ArrayList<>();
             lastCompactedKeys[i] = cfs.partitioner.getMinimumToken().minKeyBound();
         }
+        unrepairedL0 = new ArrayList<>();
         compactionCounter = new int[n];
     }
 
@@ -115,12 +115,72 @@
 
     public synchronized void add(SSTableReader reader)
     {
-        int level = reader.getSSTableLevel();
-        assert level < generations.length : "Invalid level " + level + " out of " + (generations.length - 1);
-        logDistribution();
+        if (!hasRepairedData && reader.isRepaired())
+        {
+            // this is the first repaired sstable we get - we need to
+            // rebuild the entire manifest, unrepaired data should be
+            // in unrepairedL0. Note that we keep the sstable level in
+            // the sstable metadata since we are likely to be able to
+            // re-add it at a good level later (during anticompaction
+            // for example).
+            hasRepairedData = true;
+            rebuildManifestAfterFirstRepair();
+        }
 
-        logger.debug("Adding {} to L{}", reader, level);
-        generations[level].add(reader);
+        int level = reader.getSSTableLevel();
+        if (hasRepairedData && !reader.isRepaired())
+        {
+            logger.debug("Adding unrepaired {} to unrepaired L0", reader);
+            unrepairedL0.add(reader);
+        }
+        else
+        {
+            assert level < generations.length : "Invalid level " + level + " out of " + (generations.length - 1);
+            logDistribution();
+            if (canAddSSTable(reader))
+            {
+                // adding the sstable does not cause overlap in the level
+                logger.debug("Adding {} to L{}", reader, level);
+                generations[level].add(reader);
+            }
+            else
+            {
+                // this can happen if:
+                // * a compaction has promoted an overlapping sstable to the given level, or
+                // * we promote a non-repaired sstable to repaired at level > 0, but an ongoing compaction
+                //   was also supposed to add an sstable at the given level.
+                //
+                // In either case, the sstable being added is sent to level 0.
+                try
+                {
+                    reader.descriptor.getMetadataSerializer().mutateLevel(reader.descriptor, 0);
+                    reader.reloadSSTableMetadata();
+                }
+                catch (IOException e)
+                {
+                    logger.error("Could not change sstable level - adding it at level 0 anyway, we will find it at restart.", e);
+                }
+                generations[0].add(reader);
+            }
+        }
+
+    }
+
+
+    /**
+     * Since we run standard LCS while we have no repaired data,
+     * once the first repaired sstable arrives we need to move all
+     * unrepaired sstables out of the leveling and into unrepairedL0.
+     */
+    private void rebuildManifestAfterFirstRepair()
+    {
+        for (int i = 0; i < getAllLevelSize().length; i++)
+        {
+            List<SSTableReader> oldLevel = generations[i];
+            generations[i] = new ArrayList<>();
+            for (SSTableReader sstable : oldLevel)
+                add(sstable);
+        }
     }
 
     public synchronized void replace(Collection<SSTableReader> removed, Collection<SSTableReader> added)
@@ -128,7 +188,7 @@
         assert !removed.isEmpty(); // use add() instead of promote when adding new sstables
         logDistribution();
         if (logger.isDebugEnabled())
-            logger.debug("Replacing [" + toString(removed) + "]");
+            logger.debug("Replacing [{}]", toString(removed));
 
         // the level for the added sstables is the max of the removed ones,
         // plus one if the removed were all on the same level
@@ -149,13 +209,13 @@
 
         for (SSTableReader ssTableReader : added)
             add(ssTableReader);
-        lastCompactedKeys[minLevel] = SSTable.sstableOrdering.max(added).last;
+        lastCompactedKeys[minLevel] = SSTableReader.sstableOrdering.max(added).last;
     }
 
     public synchronized void repairOverlappingSSTables(int level)
     {
         SSTableReader previous = null;
-        Collections.sort(generations[level], SSTable.sstableComparator);
+        Collections.sort(generations[level], SSTableReader.sstableComparator);
         List<SSTableReader> outOfOrderSSTables = new ArrayList<SSTableReader>();
         for (SSTableReader current : generations[level])
         {
@@ -179,13 +239,37 @@
         }
     }
 
+    /**
+     * Checks if adding the sstable creates an overlap in the level
+     * @param sstable the sstable to add
+     * @return true if it is safe to add the sstable in the level.
+     */
+    private boolean canAddSSTable(SSTableReader sstable)
+    {
+        int level = sstable.getSSTableLevel();
+        if (level == 0)
+            return true;
+
+        List<SSTableReader> copyLevel = new ArrayList<>(generations[level]);
+        copyLevel.add(sstable);
+        Collections.sort(copyLevel, SSTableReader.sstableComparator);
+
+        SSTableReader previous = null;
+        for (SSTableReader current : copyLevel)
+        {
+            if (previous != null && current.first.compareTo(previous.last) <= 0)
+                return false;
+            previous = current;
+        }
+        return true;
+    }
+
     private synchronized void sendBackToL0(SSTableReader sstable)
     {
         remove(sstable);
-        String metaDataFile = sstable.descriptor.filenameFor(Component.STATS);
         try
         {
-            mutateLevel(Pair.create(sstable.getSSTableMetadata(), sstable.getAncestors()), sstable.descriptor, metaDataFile, 0);
+            sstable.descriptor.getMetadataSerializer().mutateLevel(sstable.descriptor, 0);
             sstable.reloadSSTableMetadata();
             add(sstable);
         }
@@ -195,6 +279,15 @@
         }
     }
 
+    public synchronized void repairStatusChanged(Collection<SSTableReader> sstables)
+    {
+        for (SSTableReader sstable : sstables)
+        {
+            remove(sstable);
+            add(sstable);
+        }
+    }
+
     private String toString(Collection<SSTableReader> sstables)
     {
         StringBuilder builder = new StringBuilder();
@@ -227,6 +320,18 @@
      */
     public synchronized CompactionCandidate getCompactionCandidates()
     {
+        // if we have repaired data, try size-tiering the unrepaired L0 sstables first; otherwise continue as usual
+        if (hasRepairedData)
+        {
+            Collection<SSTableReader> unrepairedMostInteresting = getSSTablesForSTCS(unrepairedL0);
+            if (!unrepairedMostInteresting.isEmpty())
+            {
+                logger.info("Unrepaired data is most interesting, compacting {} sstables with STCS", unrepairedMostInteresting.size());
+                for (SSTableReader reader : unrepairedMostInteresting)
+                    assert !reader.isRepaired();
+                return new CompactionCandidate(unrepairedMostInteresting, 0, Long.MAX_VALUE);
+            }
+        }
         // LevelDB gives each level a score of how much data it contains vs its ideal amount, and
         // compacts the level with the highest score. But this falls apart spectacularly once you
         // get behind.  Consider this set of levels:
@@ -256,27 +361,21 @@
         // it can help a lot.
         for (int i = generations.length - 1; i > 0; i--)
         {
-            List<SSTableReader> sstables = generations[i];
+            List<SSTableReader> sstables = getLevel(i);
             if (sstables.isEmpty())
                 continue; // mostly this just avoids polluting the debug log with zero scores
             // we want to calculate score excluding compacting ones
             Set<SSTableReader> sstablesInLevel = Sets.newHashSet(sstables);
             Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getDataTracker().getCompacting());
-            double score = (double)SSTableReader.getTotalBytes(remaining) / (double)maxBytesForLevel(i);
+            double score = (double) SSTableReader.getTotalBytes(remaining) / (double) maxBytesForLevel(i);
             logger.debug("Compaction score for level {} is {}", i, score);
 
             if (score > 1.001)
             {
                 // before proceeding with a higher level, let's see if L0 is far enough behind to warrant STCS
-                if (!DatabaseDescriptor.getDisableSTCSInL0() && generations[0].size() > MAX_COMPACTING_L0)
+                if (!DatabaseDescriptor.getDisableSTCSInL0() && getLevel(0).size() > MAX_COMPACTING_L0)
                 {
-                    Iterable<SSTableReader> candidates = cfs.getDataTracker().getUncompactingSSTables(generations[0]);
-                    List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(AbstractCompactionStrategy.filterSuspectSSTables(candidates));
-                    List<List<SSTableReader>> buckets = SizeTieredCompactionStrategy.getBuckets(pairs,
-                                                                                                options.bucketHigh,
-                                                                                                options.bucketLow,
-                                                                                                options.minSSTableSize);
-                    List<SSTableReader> mostInteresting = SizeTieredCompactionStrategy.mostInterestingBucket(buckets, 4, 32);
+                    List<SSTableReader> mostInteresting = getSSTablesForSTCS(getLevel(0));
                     if (!mostInteresting.isEmpty())
                     {
                         logger.debug("L0 is too far behind, performing size-tiering there first");
@@ -302,7 +401,7 @@
         }
 
         // Higher levels are happy, time for a standard, non-STCS L0 compaction
-        if (generations[0].isEmpty())
+        if (getLevel(0).isEmpty())
             return null;
         Collection<SSTableReader> candidates = getCandidatesFor(0);
         if (candidates.isEmpty())
@@ -310,6 +409,17 @@
         return new CompactionCandidate(candidates, getNextLevel(candidates), cfs.getCompactionStrategy().getMaxSSTableBytes());
     }
 
+    private List<SSTableReader> getSSTablesForSTCS(Collection<SSTableReader> sstables)
+    {
+        Iterable<SSTableReader> candidates = cfs.getDataTracker().getUncompactingSSTables(sstables);
+        List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(AbstractCompactionStrategy.filterSuspectSSTables(candidates));
+        List<List<SSTableReader>> buckets = SizeTieredCompactionStrategy.getBuckets(pairs,
+                                                                                    options.bucketHigh,
+                                                                                    options.bucketLow,
+                                                                                    options.minSSTableSize);
+        return SizeTieredCompactionStrategy.mostInterestingBucket(buckets, 4, 32);
+    }
+
     /**
      * If we do something that makes many levels contain too little data (cleanup, change sstable size) we will "never"
      * compact the high levels.
@@ -377,14 +487,14 @@
     {
         if (i >= generations.length)
             throw new ArrayIndexOutOfBoundsException("Maximum valid generation is " + (generations.length - 1));
-        return generations[i].size();
+        return getLevel(i).size();
     }
 
     public synchronized int[] getAllLevelSize()
     {
         int[] counts = new int[generations.length];
         for (int i = 0; i < counts.length; i++)
-            counts[i] = generations[i].size();
+            counts[i] = getLevel(i).size();
         return counts;
     }
 
@@ -394,10 +504,10 @@
         {
             for (int i = 0; i < generations.length; i++)
             {
-                if (!generations[i].isEmpty())
+                if (!getLevel(i).isEmpty())
                 {
                     logger.debug("L{} contains {} SSTables ({} bytes) in {}",
-                                 i, generations[i].size(), SSTableReader.getTotalBytes(generations[i]), this);
+                                 i, getLevel(i).size(), SSTableReader.getTotalBytes(getLevel(i)), this);
                 }
             }
         }
@@ -409,6 +519,7 @@
         int level = reader.getSSTableLevel();
         assert level >= 0 : reader + " not present in manifest: "+level;
         generations[level].remove(reader);
+        unrepairedL0.remove(reader);
         return level;
     }
 
@@ -428,13 +539,13 @@
          */
         Iterator<SSTableReader> iter = candidates.iterator();
         SSTableReader sstable = iter.next();
-        Token first = sstable.first.token;
-        Token last = sstable.last.token;
+        Token first = sstable.first.getToken();
+        Token last = sstable.last.getToken();
         while (iter.hasNext())
         {
             sstable = iter.next();
-            first = first.compareTo(sstable.first.token) <= 0 ? first : sstable.first.token;
-            last = last.compareTo(sstable.last.token) >= 0 ? last : sstable.last.token;
+            first = first.compareTo(sstable.first.getToken()) <= 0 ? first : sstable.first.getToken();
+            last = last.compareTo(sstable.last.getToken()) >= 0 ? last : sstable.last.getToken();
         }
         return overlapping(first, last, others);
     }
@@ -442,7 +553,7 @@
     @VisibleForTesting
     static Set<SSTableReader> overlapping(SSTableReader sstable, Iterable<SSTableReader> others)
     {
-        return overlapping(sstable.first.token, sstable.last.token, others);
+        return overlapping(sstable.first.getToken(), sstable.last.getToken(), others);
     }
 
     /**
@@ -455,7 +566,7 @@
         Bounds<Token> promotedBounds = new Bounds<Token>(start, end);
         for (SSTableReader candidate : sstables)
         {
-            Bounds<Token> candidateBounds = new Bounds<Token>(candidate.first.token, candidate.last.token);
+            Bounds<Token> candidateBounds = new Bounds<Token>(candidate.first.getToken(), candidate.last.getToken());
             if (candidateBounds.intersects(promotedBounds))
                 overlapped.add(candidate);
         }
@@ -477,14 +588,14 @@
      */
     private Collection<SSTableReader> getCandidatesFor(int level)
     {
-        assert !generations[level].isEmpty();
+        assert !getLevel(level).isEmpty();
         logger.debug("Choosing candidates for L{}", level);
 
         final Set<SSTableReader> compacting = cfs.getDataTracker().getCompacting();
 
         if (level == 0)
         {
-            Set<SSTableReader> compactingL0 = ImmutableSet.copyOf(Iterables.filter(generations[0], Predicates.in(compacting)));
+            Set<SSTableReader> compactingL0 = ImmutableSet.copyOf(Iterables.filter(getLevel(0), Predicates.in(compacting)));
 
             // L0 is the dumping ground for new sstables which thus may overlap each other.
             //
@@ -501,7 +612,7 @@
             // So if an L1 sstable is suspect we can't do much besides try anyway and hope for the best.
             Set<SSTableReader> candidates = new HashSet<SSTableReader>();
             Set<SSTableReader> remaining = new HashSet<SSTableReader>();
-            Iterables.addAll(remaining, Iterables.filter(generations[0], Predicates.not(suspectP)));
+            Iterables.addAll(remaining, Iterables.filter(getLevel(0), Predicates.not(suspectP)));
             for (SSTableReader sstable : ageSortedSSTables(remaining))
             {
                 if (candidates.contains(sstable))
@@ -523,18 +634,18 @@
                 if (candidates.size() > MAX_COMPACTING_L0)
                 {
                     // limit to only the MAX_COMPACTING_L0 oldest candidates
-                    candidates = new HashSet<SSTableReader>(ageSortedSSTables(candidates).subList(0, MAX_COMPACTING_L0));
+                    candidates = new HashSet<>(ageSortedSSTables(candidates).subList(0, MAX_COMPACTING_L0));
                     break;
                 }
             }
 
             // leave everything in L0 if we didn't end up with a full sstable's worth of data
-            if (SSTable.getTotalBytes(candidates) > maxSSTableSizeInBytes)
+            if (SSTableReader.getTotalBytes(candidates) > maxSSTableSizeInBytes)
             {
                 // add sstables from L1 that overlap candidates
                 // if the overlapping ones are already busy in a compaction, leave it out.
                 // TODO try to find a set of L0 sstables that only overlaps with non-busy L1 sstables
-                Set<SSTableReader> l1overlapping = overlapping(candidates, generations[1]);
+                Set<SSTableReader> l1overlapping = overlapping(candidates, getLevel(1));
                 if (Sets.intersection(l1overlapping, compacting).size() > 0)
                     return Collections.emptyList();
                 candidates = Sets.union(candidates, l1overlapping);
@@ -546,11 +657,11 @@
         }
 
         // for non-L0 compactions, pick up where we left off last time
-        Collections.sort(generations[level], SSTable.sstableComparator);
+        Collections.sort(getLevel(level), SSTableReader.sstableComparator);
         int start = 0; // handles case where the prior compaction touched the very last range
-        for (int i = 0; i < generations[level].size(); i++)
+        for (int i = 0; i < getLevel(level).size(); i++)
         {
-            SSTableReader sstable = generations[level].get(i);
+            SSTableReader sstable = getLevel(level).get(i);
             if (sstable.first.compareTo(lastCompactedKeys[level]) > 0)
             {
                 start = i;
@@ -560,10 +671,10 @@
 
         // look for a non-suspect sstable to compact with, starting with where we left off last time,
         // and wrapping back to the beginning of the generation if necessary
-        for (int i = 0; i < generations[level].size(); i++)
+        for (int i = 0; i < getLevel(level).size(); i++)
         {
-            SSTableReader sstable = generations[level].get((start + i) % generations[level].size());
-            Set<SSTableReader> candidates = Sets.union(Collections.singleton(sstable), overlapping(sstable, generations[level + 1]));
+            SSTableReader sstable = getLevel(level).get((start + i) % getLevel(level).size());
+            Set<SSTableReader> candidates = Sets.union(Collections.singleton(sstable), overlapping(sstable, getLevel(level + 1)));
             if (Iterables.any(candidates, suspectP))
                 continue;
             if (Sets.intersection(candidates, compacting).isEmpty())
@@ -577,7 +688,7 @@
     private List<SSTableReader> ageSortedSSTables(Collection<SSTableReader> candidates)
     {
         List<SSTableReader> ageSortedCandidates = new ArrayList<SSTableReader>(candidates);
-        Collections.sort(ageSortedCandidates, SSTable.maxTimestampComparator);
+        Collections.sort(ageSortedCandidates, SSTableReader.maxTimestampComparator);
         return ageSortedCandidates;
     }
 
@@ -591,7 +702,7 @@
     {
         for (int i = generations.length - 1; i >= 0; i--)
         {
-            if (generations[i].size() > 0)
+            if (getLevel(i).size() > 0)
                 return i;
         }
         return 0;
@@ -599,7 +710,7 @@
 
     public synchronized SortedSet<SSTableReader> getLevelSorted(int level, Comparator<SSTableReader> comparator)
     {
-        return ImmutableSortedSet.copyOf(comparator, generations[level]);
+        return ImmutableSortedSet.copyOf(comparator, getLevel(level));
     }
 
     public List<SSTableReader> getLevel(int i)
@@ -614,7 +725,7 @@
 
         for (int i = generations.length - 1; i >= 0; i--)
         {
-            List<SSTableReader> sstables = generations[i];
+            List<SSTableReader> sstables = getLevel(i);
             estimated[i] = Math.max(0L, SSTableReader.getTotalBytes(sstables) - maxBytesForLevel(i)) / maxSSTableSizeInBytes;
             tasks += estimated[i];
         }
@@ -635,7 +746,7 @@
         }
 
         int newLevel;
-        if (minimumLevel == 0 && minimumLevel == maximumLevel && SSTable.getTotalBytes(sstables) < maxSSTableSizeInBytes)
+        if (minimumLevel == 0 && minimumLevel == maximumLevel && SSTableReader.getTotalBytes(sstables) < maxSSTableSizeInBytes)
         {
             newLevel = 0;
         }
@@ -648,33 +759,9 @@
 
     }
 
-    /**
-     * Scary method mutating existing sstable component
-     *
-     * Tries to do it safely by moving the new file on top of the old one
-     *
-     * Caller needs to reload the sstable metadata (sstableReader.reloadSSTableMetadata())
-     *
-     * @see org.apache.cassandra.io.sstable.SSTableReader#reloadSSTableMetadata()
-     *
-     * @param oldMetadata
-     * @param descriptor
-     * @param filename
-     * @param level
-     * @throws IOException
-     */
-    public static synchronized void mutateLevel(Pair<SSTableMetadata, Set<Integer>> oldMetadata, Descriptor descriptor, String filename, int level) throws IOException
+    public boolean hasRepairedData()
     {
-        logger.debug("Mutating {} to level {}", descriptor.filenameFor(Component.STATS), level);
-        SSTableMetadata metadata = SSTableMetadata.copyWithNewSSTableLevel(oldMetadata.left, level);
-        DataOutputStream out = new DataOutputStream(new FileOutputStream(filename + "-tmp"));
-        SSTableMetadata.serializer.legacySerialize(metadata, oldMetadata.right, descriptor, out);
-        out.flush();
-        out.close();
-        // we cant move a file on top of another file in windows:
-        if (!FBUtilities.isUnix())
-            FileUtils.delete(filename);
-        FileUtils.renameWithConfirm(filename + "-tmp", filename);
+        return hasRepairedData;
     }
 
     public static class CompactionCandidate
@@ -690,16 +777,4 @@
             this.maxSSTableBytes = maxSSTableBytes;
         }
     }
-
-    public static void maybeMigrateManifests() throws IOException
-    {
-        for (String keyspaceName : Schema.instance.getKeyspaces())
-        {
-            for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(keyspaceName).values())
-            {
-                if (LegacyLeveledManifest.manifestNeedsMigration(keyspaceName,cfm.cfName))
-                    LegacyLeveledManifest.migrateManifests(keyspaceName, cfm.cfName);
-            }
-        }
-    }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/OperationType.java b/src/java/org/apache/cassandra/db/compaction/OperationType.java
index 2416ed1..15d18f6 100644
--- a/src/java/org/apache/cassandra/db/compaction/OperationType.java
+++ b/src/java/org/apache/cassandra/db/compaction/OperationType.java
@@ -23,13 +23,15 @@
     VALIDATION("Validation"),
     KEY_CACHE_SAVE("Key cache save"),
     ROW_CACHE_SAVE("Row cache save"),
+    COUNTER_CACHE_SAVE("Counter cache save"),
     CLEANUP("Cleanup"),
     SCRUB("Scrub"),
     UPGRADE_SSTABLES("Upgrade sstables"),
     INDEX_BUILD("Secondary index build"),
     /** Compaction for tombstone removal */
     TOMBSTONE_COMPACTION("Tombstone Compaction"),
-    UNKNOWN("Unknown compaction type");
+    UNKNOWN("Unknown compaction type"),
+    ANTICOMPACTION("Anticompaction after repair");
 
     private final String type;
 
diff --git a/src/java/org/apache/cassandra/db/compaction/ParallelCompactionIterable.java b/src/java/org/apache/cassandra/db/compaction/ParallelCompactionIterable.java
deleted file mode 100644
index f1790d8..0000000
--- a/src/java/org/apache/cassandra/db/compaction/ParallelCompactionIterable.java
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.concurrent.*;
-
-import com.google.common.collect.AbstractIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
-import org.apache.cassandra.concurrent.NamedThreadFactory;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
-import org.apache.cassandra.utils.*;
-
-/**
- * A class to run compaction taking advantage of multiple-core processes:
- *
- * One Deserializer thread per input sstable performs read + deserialize (a row at a time).
- * The resulting ColumnFamilies are added to a queue, which is fed to the merge Reducer.
- *
- * The merge Reducer creates MergeTasks on a thread-per-core Executor, and returns AsyncPrecompactedRow objects.
- *
- * The main complication is in handling larger-than-memory rows.  When one is encountered, no further deserialization
- * is done until that row is merged and written -- creating a pipeline stall, as it were.  Thus, this is intended
- * to be useful with mostly-in-memory row sizes, but preserves correctness in the face of occasional exceptions.
- */
-public class ParallelCompactionIterable extends AbstractCompactionIterable
-{
-    private static final Logger logger = LoggerFactory.getLogger(ParallelCompactionIterable.class);
-
-    private final int maxInMemorySize;
-
-    public ParallelCompactionIterable(OperationType type, List<ICompactionScanner> scanners, CompactionController controller)
-    {
-        this(type, scanners, controller, DatabaseDescriptor.getInMemoryCompactionLimit() / (scanners.isEmpty() ? 1 : scanners.size()));
-    }
-
-    public ParallelCompactionIterable(OperationType type, List<ICompactionScanner> scanners, CompactionController controller, int maxInMemorySize)
-    {
-        super(controller, type, scanners);
-        this.maxInMemorySize = maxInMemorySize;
-    }
-
-    public CloseableIterator<AbstractCompactedRow> iterator()
-    {
-        List<CloseableIterator<RowContainer>> sources = new ArrayList<CloseableIterator<RowContainer>>(scanners.size());
-        for (ICompactionScanner scanner : scanners)
-            sources.add(new Deserializer(scanner, maxInMemorySize));
-        return new Unwrapper(MergeIterator.get(sources, RowContainer.comparator, new Reducer()));
-    }
-
-    private static class Unwrapper extends AbstractIterator<AbstractCompactedRow> implements CloseableIterator<AbstractCompactedRow>
-    {
-        private final CloseableIterator<CompactedRowContainer> reducer;
-
-        public Unwrapper(CloseableIterator<CompactedRowContainer> reducer)
-        {
-            this.reducer = reducer;
-        }
-
-        protected AbstractCompactedRow computeNext()
-        {
-            if (!reducer.hasNext())
-                return endOfData();
-
-            CompactedRowContainer container = reducer.next();
-            AbstractCompactedRow compactedRow;
-            compactedRow = container.future == null
-                         ? container.row
-                         : new PrecompactedRow(container.key, FBUtilities.waitOnFuture(container.future));
-
-            return compactedRow;
-        }
-
-        public void close() throws IOException
-        {
-            reducer.close();
-        }
-    }
-
-    private class Reducer extends MergeIterator.Reducer<RowContainer, CompactedRowContainer>
-    {
-        private final List<RowContainer> rows = new ArrayList<RowContainer>();
-
-        private final ThreadPoolExecutor executor = new DebuggableThreadPoolExecutor(FBUtilities.getAvailableProcessors(),
-                                                                                     Integer.MAX_VALUE,
-                                                                                     TimeUnit.MILLISECONDS,
-                                                                                     new SynchronousQueue<Runnable>(),
-                                                                                     new NamedThreadFactory("CompactionReducer"));
-
-        public void reduce(RowContainer current)
-        {
-            rows.add(current);
-        }
-
-        protected CompactedRowContainer getReduced()
-        {
-            assert rows.size() > 0;
-
-            ParallelCompactionIterable.this.updateCounterFor(rows.size());
-            CompactedRowContainer compacted = getCompactedRow(rows);
-            rows.clear();
-            long n = 0;
-            for (ICompactionScanner scanner : scanners)
-                n += scanner.getCurrentPosition();
-            bytesRead = n;
-            return compacted;
-        }
-
-        public CompactedRowContainer getCompactedRow(List<RowContainer> rows)
-        {
-            boolean inMemory = true;
-            for (RowContainer container : rows)
-            {
-                if (container.row == null)
-                {
-                    inMemory = false;
-                    break;
-                }
-            }
-
-            if (inMemory)
-            {
-                // caller will re-use rows List, so make ourselves a copy
-                List<Row> rawRows = new ArrayList<Row>(rows.size());
-                for (RowContainer rowContainer : rows)
-                    rawRows.add(rowContainer.row);
-                return new CompactedRowContainer(rows.get(0).getKey(), executor.submit(new MergeTask(rawRows)));
-            }
-
-            List<OnDiskAtomIterator> iterators = new ArrayList<OnDiskAtomIterator>(rows.size());
-            for (RowContainer container : rows)
-                iterators.add(container.row == null ? container.wrapper : new DeserializedColumnIterator(container.row));
-            return new CompactedRowContainer(new LazilyCompactedRow(controller, iterators));
-        }
-
-        public void close()
-        {
-            executor.shutdown();
-        }
-
-        /**
-         * Merges a set of in-memory rows
-         */
-        private class MergeTask implements Callable<ColumnFamily>
-        {
-            private final List<Row> rows;
-
-            public MergeTask(List<Row> rows)
-            {
-                this.rows = rows;
-            }
-
-            public ColumnFamily call() throws Exception
-            {
-                final ColumnFamily returnCF = ArrayBackedSortedColumns.factory.create(controller.cfs.metadata);
-
-                List<CloseableIterator<Column>> data = new ArrayList<CloseableIterator<Column>>(rows.size());
-                for (Row row : rows)
-                {
-                    returnCF.delete(row.cf);
-                    data.add(FBUtilities.closeableIterator(row.cf.iterator()));
-                }
-
-                PrecompactedRow.merge(returnCF, data, controller.cfs.indexManager.updaterFor(rows.get(0).key));
-                return PrecompactedRow.removeDeleted(rows.get(0).key, controller, returnCF);
-            }
-        }
-
-        private class DeserializedColumnIterator implements OnDiskAtomIterator
-        {
-            private final Row row;
-            private final Iterator<Column> iter;
-
-            public DeserializedColumnIterator(Row row)
-            {
-                this.row = row;
-                iter = row.cf.iterator();
-            }
-
-            public ColumnFamily getColumnFamily()
-            {
-                return row.cf;
-            }
-
-            public DecoratedKey getKey()
-            {
-                return row.key;
-            }
-
-            public void close() throws IOException {}
-
-            public boolean hasNext()
-            {
-                return iter.hasNext();
-            }
-
-            public OnDiskAtom next()
-            {
-                return iter.next();
-            }
-
-            public void remove()
-            {
-                throw new UnsupportedOperationException();
-            }
-        }
-    }
-
-    private static class Deserializer extends AbstractIterator<RowContainer> implements CloseableIterator<RowContainer>
-    {
-        private final LinkedBlockingQueue<RowContainer> queue = new LinkedBlockingQueue<RowContainer>(1);
-        private static final RowContainer finished = new RowContainer((Row) null);
-        private final ICompactionScanner scanner;
-
-        public Deserializer(ICompactionScanner ssts, final int maxInMemorySize)
-        {
-            this.scanner = ssts;
-            Runnable runnable = new WrappedRunnable()
-            {
-                protected void runMayThrow() throws Exception
-                {
-                    SimpleCondition condition = null;
-                    while (true)
-                    {
-                        if (condition != null)
-                        {
-                            condition.await();
-                            condition = null;
-                        }
-                        if (!scanner.hasNext())
-                        {
-                            queue.put(finished);
-                            break;
-                        }
-
-                        SSTableIdentityIterator iter = (SSTableIdentityIterator) scanner.next();
-                        if (iter.dataSize > maxInMemorySize)
-                        {
-                            logger.debug("parallel lazy deserialize from {}", iter.getPath());
-                            condition = new SimpleCondition();
-                            queue.put(new RowContainer(new NotifyingSSTableIdentityIterator(iter, condition)));
-                        }
-                        else
-                        {
-                            logger.debug("parallel eager deserialize from {}", iter.getPath());
-                            queue.put(new RowContainer(new Row(iter.getKey(), iter.getColumnFamilyWithColumns(ArrayBackedSortedColumns.factory))));
-                        }
-                    }
-                }
-            };
-            new Thread(runnable, "Deserialize " + scanner.getBackingFiles()).start();
-        }
-
-        protected RowContainer computeNext()
-        {
-            RowContainer container;
-            try
-            {
-                container = queue.take();
-            }
-            catch (InterruptedException e)
-            {
-                throw new AssertionError(e);
-            }
-            return container == finished ? endOfData() : container;
-        }
-
-        public void close() throws IOException
-        {
-            scanner.close();
-        }
-    }
-
-    /**
-     * a wrapper around SSTII that notifies the given condition when it is closed
-     */
-    private static class NotifyingSSTableIdentityIterator implements OnDiskAtomIterator
-    {
-        private final SSTableIdentityIterator wrapped;
-        private final SimpleCondition condition;
-
-        public NotifyingSSTableIdentityIterator(SSTableIdentityIterator wrapped, SimpleCondition condition)
-        {
-            this.wrapped = wrapped;
-            this.condition = condition;
-        }
-
-        public ColumnFamily getColumnFamily()
-        {
-            return wrapped.getColumnFamily();
-        }
-
-        public DecoratedKey getKey()
-        {
-            return wrapped.getKey();
-        }
-
-        public void close() throws IOException
-        {
-            try
-            {
-                wrapped.close();
-            }
-            finally
-            {
-                condition.signalAll();
-            }
-        }
-
-        public boolean hasNext()
-        {
-            return wrapped.hasNext();
-        }
-
-        public OnDiskAtom next()
-        {
-            return wrapped.next();
-        }
-
-        public void remove()
-        {
-            throw new UnsupportedOperationException();
-        }
-    }
-
-    private static class RowContainer
-    {
-        // either row is not null, or wrapper is not null.  But not both.
-        public final Row row;
-        public final NotifyingSSTableIdentityIterator wrapper;
-        public static final Comparator<RowContainer> comparator = new Comparator<RowContainer>()
-        {
-            public int compare(RowContainer o1, RowContainer o2)
-            {
-                return o1.getKey().compareTo(o2.getKey());
-            }
-        };
-
-        private RowContainer(Row row)
-        {
-            this.row = row;
-            wrapper = null;
-        }
-
-        public RowContainer(NotifyingSSTableIdentityIterator wrapper)
-        {
-            this.wrapper = wrapper;
-            row = null;
-        }
-
-        public DecoratedKey getKey()
-        {
-            return row == null ? wrapper.getKey() : row.key;
-        }
-    }
-
-    private static class CompactedRowContainer
-    {
-        public final DecoratedKey key;
-        /** either "future" or "row" will be not-null, but not both at once. */
-        public final Future<ColumnFamily> future;
-        public final LazilyCompactedRow row;
-
-        private CompactedRowContainer(DecoratedKey key, Future<ColumnFamily> future)
-        {
-            this.key = key;
-            this.future = future;
-            row = null;
-        }
-
-        private CompactedRowContainer(LazilyCompactedRow row)
-        {
-            this.row = row;
-            future = null;
-            key = null;
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/compaction/PrecompactedRow.java b/src/java/org/apache/cassandra/db/compaction/PrecompactedRow.java
deleted file mode 100644
index db72847..0000000
--- a/src/java/org/apache/cassandra/db/compaction/PrecompactedRow.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.io.DataOutput;
-import java.io.IOException;
-import java.security.MessageDigest;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
-import org.apache.cassandra.io.sstable.SSTableWriter;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.MergeIterator;
-
-/**
- * PrecompactedRow merges its rows in its constructor in memory.
- */
-public class PrecompactedRow extends AbstractCompactedRow
-{
-    private final ColumnFamily compactedCf;
-
-    // it is caller's responsibility to call removeDeleted from the cf before calling this constructor
-    public PrecompactedRow(DecoratedKey key, ColumnFamily cf)
-    {
-        super(key);
-        compactedCf = cf;
-    }
-
-    public static ColumnFamily removeDeleted(DecoratedKey key, CompactionController controller, ColumnFamily cf)
-    {
-        assert key != null;
-        assert controller != null;
-        assert cf != null;
-
-        // avoid calling shouldPurge unless we actually need to: it can be very expensive if LCS
-        // gets behind and has hundreds of overlapping L0 sstables.  Essentially, this method is an
-        // ugly refactor of removeDeleted(controller.shouldPurge(key), controller, cf),
-        // taking this into account.
-        Boolean shouldPurge = null;
-
-        if (cf.hasIrrelevantData(controller.gcBefore))
-            shouldPurge = controller.shouldPurge(key, cf.maxTimestamp());
-
-        // We should only gc tombstone if shouldPurge == true. But otherwise,
-        // it is still ok to collect column that shadowed by their (deleted)
-        // container, which removeDeleted(cf, Integer.MAX_VALUE) will do
-        return ColumnFamilyStore.removeDeleted(cf, shouldPurge != null && shouldPurge ? controller.gcBefore : Integer.MIN_VALUE);
-    }
-
-    public static ColumnFamily removeDeleted(DecoratedKey key, boolean shouldPurge, CompactionController controller, ColumnFamily cf)
-    {
-        // See comment in preceding method
-        return ColumnFamilyStore.removeDeleted(cf,
-                                               shouldPurge ? controller.gcBefore : Integer.MIN_VALUE,
-                                               controller.cfs.indexManager.updaterFor(key));
-    }
-
-    public PrecompactedRow(CompactionController controller, List<SSTableIdentityIterator> rows)
-    {
-        this(rows.get(0).getKey(), removeDeleted(rows.get(0).getKey(), controller, merge(rows, controller)));
-    }
-
-    private static ColumnFamily merge(List<SSTableIdentityIterator> rows, CompactionController controller)
-    {
-        assert !rows.isEmpty();
-
-        final ColumnFamily returnCF = ArrayBackedSortedColumns.factory.create(controller.cfs.metadata);
-
-        // transform into iterators that MergeIterator will like, and apply row-level tombstones
-        List<CloseableIterator<Column>> data = new ArrayList<>(rows.size());
-        for (SSTableIdentityIterator row : rows)
-        {
-            ColumnFamily cf = row.getColumnFamilyWithColumns(ArrayBackedSortedColumns.factory);
-            returnCF.delete(cf);
-            data.add(FBUtilities.closeableIterator(cf.iterator()));
-        }
-
-        merge(returnCF, data, controller.cfs.indexManager.updaterFor(rows.get(0).getKey()));
-
-        return returnCF;
-    }
-
-    // returnCF should already have row-level tombstones applied
-    public static void merge(final ColumnFamily returnCF, List<CloseableIterator<Column>> data, final SecondaryIndexManager.Updater indexer)
-    {
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Comparator<Column> fcomp = filter.getColumnComparator(returnCF.getComparator());
-
-        MergeIterator.Reducer<Column, Column> reducer = new MergeIterator.Reducer<Column, Column>()
-        {
-            ColumnFamily container = returnCF.cloneMeShallow();
-
-            public void reduce(Column column)
-            {
-                container.addColumn(column);
-
-                // skip the index-update checks if there is no indexing needed since they are a bit expensive
-                if (indexer == SecondaryIndexManager.nullUpdater)
-                    return;
-
-                // notify the index that the column has been overwritten if the value being reduced has been
-                // superceded by another directly, or indirectly by a range tombstone
-                if ((!column.isMarkedForDelete(System.currentTimeMillis()) && !container.getColumn(column.name()).equals(column))
-                    || returnCF.deletionInfo().isDeleted(column))
-                {
-                    indexer.remove(column);
-                }
-            }
-
-            protected Column getReduced()
-            {
-                Column c = container.iterator().next();
-                container.clear();
-                return c;
-            }
-        };
-
-        Iterator<Column> reduced = MergeIterator.get(data, fcomp, reducer);
-        filter.collectReducedColumns(returnCF, reduced, CompactionManager.NO_GC, System.currentTimeMillis());
-    }
-
-    public RowIndexEntry write(long currentPosition, DataOutput out) throws IOException
-    {
-        if (compactedCf == null)
-            return null;
-
-        return SSTableWriter.rawAppend(compactedCf, currentPosition, key, out);
-    }
-
-    public void update(MessageDigest digest)
-    {
-        if (compactedCf == null)
-            return;
-
-        DataOutputBuffer buffer = new DataOutputBuffer();
-        try
-        {
-            DeletionTime.serializer.serialize(compactedCf.deletionInfo().getTopLevelDeletion(), buffer);
-            digest.update(buffer.getData(), 0, buffer.getLength());
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        compactedCf.updateDigest(digest);
-    }
-
-    public ColumnStats columnStats()
-    {
-        return compactedCf.getColumnStats();
-    }
-
-    /**
-     * @return the full column family represented by this compacted row.
-     *
-     * We do not provide this method for other AbstractCompactedRow, because this fits the whole row into
-     * memory and don't make sense for those other implementations.
-     */
-    public ColumnFamily getFullColumnFamily()
-    {
-        return compactedCf;
-    }
-
-    public void close() { }
-}
diff --git a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
index d0aafa4..6b9f161 100644
--- a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
+++ b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
@@ -57,7 +57,7 @@
 
         public SplittingCompactionTask(ColumnFamilyStore cfs, SSTableReader sstable, int sstableSizeInMB)
         {
-            super(cfs, Collections.singletonList(sstable), CompactionManager.NO_GC);
+            super(cfs, Collections.singletonList(sstable), CompactionManager.NO_GC, true);
             this.sstableSizeInMB = sstableSizeInMB;
 
             if (sstableSizeInMB <= 0)
@@ -67,12 +67,7 @@
         @Override
         protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
         {
-            return new SplitController(cfs, toCompact);
-        }
-
-        @Override
-        protected void replaceCompactedSSTables(Collection<SSTableReader> compacted, Collection<SSTableReader> replacements)
-        {
+            return new SplitController(cfs);
         }
 
         @Override
@@ -90,15 +85,15 @@
 
     public static class SplitController extends CompactionController
     {
-        public SplitController(ColumnFamilyStore cfs, Collection<SSTableReader> toCompact)
+        public SplitController(ColumnFamilyStore cfs)
         {
             super(cfs, CompactionManager.NO_GC);
         }
 
         @Override
-        public boolean shouldPurge(DecoratedKey key, long maxDeletionTimestamp)
+        public long maxPurgeableTimestamp(DecoratedKey key)
         {
-            return false;
+            return Long.MIN_VALUE;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index 820761c..b3d098d 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -27,15 +27,17 @@
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.OutputHandler;
 
 public class Scrubber implements Closeable
 {
-    public final ColumnFamilyStore cfs;
-    public final SSTableReader sstable;
-    public final File destination;
-    public final boolean skipCorrupted;
+    private final ColumnFamilyStore cfs;
+    private final SSTableReader sstable;
+    private final File destination;
+    private final boolean skipCorrupted;
 
     private final CompactionController controller;
     private final boolean isCommutative;
@@ -45,7 +47,8 @@
     private final RandomAccessReader indexFile;
     private final ScrubInfo scrubInfo;
 
-    private SSTableWriter writer;
+    private final boolean isOffline;
+
     private SSTableReader newSstable;
     private SSTableReader newInOrderSstable;
 
@@ -64,9 +67,9 @@
     };
     private final SortedSet<Row> outOfOrderRows = new TreeSet<>(rowComparator);
 
-    public Scrubber(ColumnFamilyStore cfs, SSTableReader sstable, boolean skipCorrupted) throws IOException
+    public Scrubber(ColumnFamilyStore cfs, SSTableReader sstable, boolean skipCorrupted, boolean isOffline) throws IOException
     {
-        this(cfs, sstable, skipCorrupted, new OutputHandler.LogOutput(), false);
+        this(cfs, sstable, skipCorrupted, new OutputHandler.LogOutput(), isOffline);
     }
 
     public Scrubber(ColumnFamilyStore cfs, SSTableReader sstable, boolean skipCorrupted, OutputHandler outputHandler, boolean isOffline) throws IOException
@@ -75,6 +78,7 @@
         this.sstable = sstable;
         this.outputHandler = outputHandler;
         this.skipCorrupted = skipCorrupted;
+        this.isOffline = isOffline;
 
         // Calculate the expected compacted filesize
         this.destination = cfs.directories.getDirectoryForNewSSTables();
@@ -86,8 +90,8 @@
         this.controller = isOffline
                         ? new ScrubController(cfs)
                         : new CompactionController(cfs, Collections.singleton(sstable), CompactionManager.getDefaultGcBefore(cfs));
-        this.isCommutative = cfs.metadata.getDefaultValidator().isCommutative();
-        this.expectedBloomFilterSize = Math.max(cfs.metadata.getIndexInterval(), (int)(SSTableReader.getApproximateKeyCount(toScrub,cfs.metadata)));
+        this.isCommutative = cfs.metadata.isCounter();
+        this.expectedBloomFilterSize = Math.max(cfs.metadata.getMinIndexInterval(), (int)(SSTableReader.getApproximateKeyCount(toScrub)));
 
         // loop through each row, deserializing to check for damage.
         // we'll also loop through the index at the same time, using the position from the index to recover if the
@@ -103,17 +107,17 @@
     public void scrub()
     {
         outputHandler.output(String.format("Scrubbing %s (%s bytes)", sstable, dataFile.length()));
+        SSTableRewriter writer = new SSTableRewriter(cfs, new HashSet<>(Collections.singleton(sstable)), sstable.maxDataAge, OperationType.SCRUB, isOffline);
         try
         {
             ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile);
             {
                 // throw away variable so we don't have a side effect in the assert
-                long firstRowPositionFromIndex = RowIndexEntry.serializer.deserialize(indexFile, sstable.descriptor.version).position;
+                long firstRowPositionFromIndex = sstable.metadata.comparator.rowIndexEntrySerializer().deserialize(indexFile, sstable.descriptor.version).position;
                 assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
             }
 
-            // TODO errors when creating the writer may leave empty temp files.
-            writer = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable);
+            writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable));
 
             DecoratedKey prevKey = null;
 
@@ -129,11 +133,6 @@
                 try
                 {
                     key = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
-                    if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-                    {
-                        dataSize = dataFile.readLong();
-                        outputHandler.debug(String.format("row %s is %s bytes", ByteBufferUtil.bytesToHex(key.key), dataSize));
-                    }
                 }
                 catch (Throwable th)
                 {
@@ -148,10 +147,11 @@
                     nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile);
                     nextRowPositionFromIndex = indexFile.isEOF()
                                              ? dataFile.length()
-                                             : RowIndexEntry.serializer.deserialize(indexFile, sstable.descriptor.version).position;
+                                             : sstable.metadata.comparator.rowIndexEntrySerializer().deserialize(indexFile, sstable.descriptor.version).position;
                 }
                 catch (Throwable th)
                 {
+                    JVMStabilityInspector.inspectThrowable(th);
                     outputHandler.warn("Error reading index file", th);
                     nextIndexKey = null;
                     nextRowPositionFromIndex = dataFile.length();
@@ -161,26 +161,15 @@
                 long dataStartFromIndex = currentIndexKey == null
                                         ? -1
                                         : rowStart + 2 + currentIndexKey.remaining();
-                if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-                    dataStartFromIndex += 8;
                 long dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
 
-                if (!sstable.descriptor.version.hasRowSizeAndColumnCount)
-                {
-                    dataSize = dataSizeFromIndex;
-                    // avoid an NPE if key is null
-                    String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.key);
-                    outputHandler.debug(String.format("row %s is %s bytes", keyName, dataSize));
-                }
-                else
-                {
-                    if (currentIndexKey != null)
-                        outputHandler.debug(String.format("Index doublecheck: row %s is %s bytes", ByteBufferUtil.bytesToHex(currentIndexKey),  dataSizeFromIndex));
-                }
+                dataSize = dataSizeFromIndex;
+                // avoid an NPE if key is null
+                String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
+                outputHandler.debug(String.format("row %s is %s bytes", keyName, dataSize));
 
                 assert currentIndexKey != null || indexFile.isEOF();
 
-                writer.mark();
                 try
                 {
                     if (key == null)
@@ -196,22 +185,21 @@
                     }
 
                     AbstractCompactedRow compactedRow = new LazilyCompactedRow(controller, Collections.singletonList(atoms));
-                    if (writer.append(compactedRow) == null)
+                    if (writer.tryAppend(compactedRow) == null)
                         emptyRows++;
                     else
                         goodRows++;
                     prevKey = key;
-                    if (!key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex)
+                    if (!key.getKey().equals(currentIndexKey) || dataStart != dataStartFromIndex)
                         outputHandler.warn("Index file contained a different key or row size; using key from data file");
                 }
                 catch (Throwable th)
                 {
                     throwIfFatal(th);
                     outputHandler.warn("Error reading row (stacktrace follows):", th);
-                    writer.resetAndTruncate();
 
                     if (currentIndexKey != null
-                        && (key == null || !key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex || dataSize != dataSizeFromIndex))
+                        && (key == null || !key.getKey().equals(currentIndexKey) || dataStart != dataStartFromIndex || dataSize != dataSizeFromIndex))
                     {
                         outputHandler.output(String.format("Retrying from row index; data is %s bytes starting at %s",
                                                   dataSizeFromIndex, dataStartFromIndex));
@@ -226,7 +214,7 @@
                             }
 
                             AbstractCompactedRow compactedRow = new LazilyCompactedRow(controller, Collections.singletonList(atoms));
-                            if (writer.append(compactedRow) == null)
+                            if (writer.tryAppend(compactedRow) == null)
                                 emptyRows++;
                             else
                                 goodRows++;
@@ -238,7 +226,6 @@
                             throwIfCommutative(key, th2);
 
                             outputHandler.warn("Retry failed too. Skipping to next row (retry's stacktrace follows)", th2);
-                            writer.resetAndTruncate();
                             dataFile.seek(nextRowPositionFromIndex);
                             badRows++;
                         }
@@ -255,13 +242,27 @@
                 }
             }
 
-            if (writer.getFilePointer() > 0)
-                newSstable = writer.closeAndOpenReader(sstable.maxDataAge);
+            if (!outOfOrderRows.isEmpty())
+            {
+                // out of order rows, but no bad rows found - we can keep our repairedAt time
+                long repairedAt = badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt;
+                SSTableWriter inOrderWriter = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, repairedAt, sstable);
+                for (Row row : outOfOrderRows)
+                    inOrderWriter.append(row.key, row.cf);
+                newInOrderSstable = inOrderWriter.closeAndOpenReader(sstable.maxDataAge);
+                if (!isOffline)
+                    cfs.getDataTracker().addSSTables(Collections.singleton(newInOrderSstable));
+                outputHandler.warn(String.format("%d out of order rows found while scrubbing %s; Those have been written (in order) to a new sstable (%s)", outOfOrderRows.size(), sstable, newInOrderSstable));
+            }
+
+            // finish obsoletes the old sstable
+            writer.finish(!isOffline, badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt);
+            if (!writer.finished().isEmpty())
+                newSstable = writer.finished().get(0);
         }
         catch (Throwable t)
         {
-            if (writer != null)
-                writer.abort();
+            writer.abort();
             throw Throwables.propagate(t);
         }
         finally
@@ -269,15 +270,6 @@
             controller.close();
         }
 
-        if (!outOfOrderRows.isEmpty())
-        {
-            SSTableWriter inOrderWriter = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable);
-            for (Row row : outOfOrderRows)
-                inOrderWriter.append(row.key, row.cf);
-            newInOrderSstable = inOrderWriter.closeAndOpenReader(sstable.maxDataAge);
-            outputHandler.warn(String.format("%d out of order rows found while scrubbing %s; Those have been written (in order) to a new sstable (%s)", outOfOrderRows.size(), sstable, newInOrderSstable));
-        }
-
         if (newSstable == null)
         {
             if (badRows > 0)
@@ -299,7 +291,7 @@
         outputHandler.warn(String.format("Out of order row detected (%s found after %s)", key, prevKey));
         // adding atoms in sorted order is worst-case for TMBSC, but we shouldn't need to do this very often
         // and there's no sense in failing on mis-sorted cells when a TreeMap could save us
-        ColumnFamily cf = atoms.getColumnFamily().cloneMeShallow(TreeMapBackedSortedColumns.factory, false);
+        ColumnFamily cf = atoms.getColumnFamily().cloneMeShallow(ArrayBackedSortedColumns.factory, false);
         while (atoms.hasNext())
         {
             OnDiskAtom atom = atoms.next();
@@ -383,9 +375,9 @@
         }
 
         @Override
-        public boolean shouldPurge(DecoratedKey key, long delTimestamp)
+        public long maxPurgeableTimestamp(DecoratedKey key)
         {
-            return false;
+            return Long.MIN_VALUE;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
index 0f94918..461c5e1 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
@@ -23,6 +23,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -30,12 +31,35 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.Pair;
 
 public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy
 {
     private static final Logger logger = LoggerFactory.getLogger(SizeTieredCompactionStrategy.class);
 
+    private static final Comparator<Pair<List<SSTableReader>,Double>> bucketsByHotnessComparator = new Comparator<Pair<List<SSTableReader>, Double>>()
+    {
+        public int compare(Pair<List<SSTableReader>, Double> o1, Pair<List<SSTableReader>, Double> o2)
+        {
+            int comparison = Double.compare(o1.right, o2.right);
+            if (comparison != 0)
+                return comparison;
+
+            // break ties by compacting the smallest sstables first (this will probably only happen for
+            // system tables and new/unread sstables)
+            return Long.compare(avgSize(o1.left), avgSize(o2.left));
+        }
+
+        private long avgSize(List<SSTableReader> sstables)
+        {
+            long n = 0;
+            for (SSTableReader sstable : sstables)
+                n += sstable.bytesOnDisk();
+            return n / sstables.size();
+        }
+    };
+
     protected SizeTieredCompactionStrategyOptions options;
     protected volatile int estimatedRemainingTasks;
 
@@ -57,6 +81,15 @@
 
         Iterable<SSTableReader> candidates = filterSuspectSSTables(cfs.getUncompactingSSTables());
         candidates = filterColdSSTables(Lists.newArrayList(candidates), options.coldReadsToOmit);
+        Pair<Set<SSTableReader>,Set<SSTableReader>> repairedUnrepaired = splitInRepairedAndUnrepaired(candidates);
+        if (repairedUnrepaired.left.size() > repairedUnrepaired.right.size())
+        {
+            candidates = repairedUnrepaired.left;
+        }
+        else
+        {
+            candidates = repairedUnrepaired.right;
+        }
 
         List<List<SSTableReader>> buckets = getBuckets(createSSTableAndLengthPairs(candidates), options.bucketHigh, options.bucketLow, options.minSSTableSize);
         logger.debug("Compaction buckets are {}", buckets);
@@ -67,7 +100,7 @@
 
         // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
         // ratio is greater than threshold.
-        List<SSTableReader> sstablesWithTombstones = new ArrayList<SSTableReader>();
+        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
         for (SSTableReader sstable : candidates)
         {
             if (worthDroppingTombstones(sstable, gcBefore))
@@ -80,6 +113,20 @@
         return Collections.singletonList(sstablesWithTombstones.get(0));
     }
 
+    private static Pair<Set<SSTableReader>, Set<SSTableReader>> splitInRepairedAndUnrepaired(Iterable<SSTableReader> candidates)
+    {
+        Set<SSTableReader> repaired = new HashSet<>();
+        Set<SSTableReader> unRepaired = new HashSet<>();
+        for(SSTableReader candidate : candidates)
+        {
+            if (!candidate.isRepaired())
+                unRepaired.add(candidate);
+            else
+                repaired.add(candidate);
+        }
+        return Pair.create(repaired, unRepaired);
+    }
+
     /**
      * Removes as many cold sstables as possible while retaining at least 1-coldReadsToOmit of the total reads/sec
      * across all sstables
@@ -161,29 +208,7 @@
         if (prunedBucketsAndHotness.isEmpty())
             return Collections.emptyList();
 
-        // prefer compacting the hottest bucket
-        Pair<List<SSTableReader>, Double> hottest = Collections.max(prunedBucketsAndHotness, new Comparator<Pair<List<SSTableReader>, Double>>()
-        {
-            public int compare(Pair<List<SSTableReader>, Double> o1, Pair<List<SSTableReader>, Double> o2)
-            {
-                int comparison = Double.compare(o1.right, o2.right);
-                if (comparison != 0)
-                    return comparison;
-
-                // break ties by compacting the smallest sstables first (this will probably only happen for
-                // system tables and new/unread sstables)
-                return Long.compare(avgSize(o1.left), avgSize(o2.left));
-            }
-
-            private long avgSize(List<SSTableReader> sstables)
-            {
-                long n = 0;
-                for (SSTableReader sstable : sstables)
-                    n += sstable.bytesOnDisk();
-                return n / sstables.size();
-            }
-        });
-
+        Pair<List<SSTableReader>, Double> hottest = Collections.max(prunedBucketsAndHotness, bucketsByHotnessComparator);
         return hottest.left;
     }
 
@@ -245,17 +270,26 @@
                 return null;
 
             if (cfs.getDataTracker().markCompacting(hottestBucket))
-                return new CompactionTask(cfs, hottestBucket, gcBefore);
+                return new CompactionTask(cfs, hottestBucket, gcBefore, false);
         }
     }
 
-    public AbstractCompactionTask getMaximalTask(final int gcBefore)
+    public Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore)
     {
-        Iterable<SSTableReader> sstables = cfs.markAllCompacting();
-        if (sstables == null)
+        Iterable<SSTableReader> allSSTables = cfs.markAllCompacting();
+        if (allSSTables == null || Iterables.isEmpty(allSSTables))
             return null;
-
-        return new CompactionTask(cfs, sstables, gcBefore);
+        Set<SSTableReader> sstables = Sets.newHashSet(allSSTables);
+        Set<SSTableReader> repaired = new HashSet<>();
+        Set<SSTableReader> unrepaired = new HashSet<>();
+        for (SSTableReader sstable : sstables)
+        {
+            if (sstable.isRepaired())
+                repaired.add(sstable);
+            else
+                unrepaired.add(sstable);
+        }
+        return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, repaired, gcBefore, false), new CompactionTask(cfs, unrepaired, gcBefore, false));
     }
 
     public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, final int gcBefore)
@@ -268,7 +302,7 @@
             return null;
         }
 
-        return new CompactionTask(cfs, sstables, gcBefore).setUserDefined(true);
+        return new CompactionTask(cfs, sstables, gcBefore, false).setUserDefined(true);
     }
 
     public int getEstimatedRemainingTasks()
@@ -368,4 +402,4 @@
             cfs.getMinimumCompactionThreshold(),
             cfs.getMaximumCompactionThreshold());
     }
-}
\ No newline at end of file
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
index c6c5f1b..84e7d61 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
@@ -26,7 +26,7 @@
     protected static final long DEFAULT_MIN_SSTABLE_SIZE = 50L * 1024L * 1024L;
     protected static final double DEFAULT_BUCKET_LOW = 0.5;
     protected static final double DEFAULT_BUCKET_HIGH = 1.5;
-    protected static final double DEFAULT_COLD_READS_TO_OMIT = 0.0;
+    protected static final double DEFAULT_COLD_READS_TO_OMIT = 0.05;
     protected static final String MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
     protected static final String BUCKET_LOW_KEY = "bucket_low";
     protected static final String BUCKET_HIGH_KEY = "bucket_high";
diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
index 98a55e9..f102fef 100644
--- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java
+++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
@@ -26,6 +26,8 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.CLibrary;
 import org.apache.cassandra.utils.CloseableIterator;
 import org.apache.cassandra.utils.OutputHandler;
 
@@ -33,7 +35,7 @@
 {
     private final ColumnFamilyStore cfs;
     private final SSTableReader sstable;
-    private final Collection<SSTableReader> toUpgrade;
+    private final Set<SSTableReader> toUpgrade;
     private final File directory;
 
     private final OperationType compactionType = OperationType.UPGRADE_SSTABLES;
@@ -47,7 +49,7 @@
     {
         this.cfs = cfs;
         this.sstable = sstable;
-        this.toUpgrade = Collections.singletonList(sstable);
+        this.toUpgrade = new HashSet<>(Collections.singleton(sstable));
         this.outputHandler = outputHandler;
 
         this.directory = new File(sstable.getFilename()).getParentFile();
@@ -55,14 +57,14 @@
         this.controller = new UpgradeController(cfs);
 
         this.strategy = cfs.getCompactionStrategy();
-        long estimatedTotalKeys = Math.max(cfs.metadata.getIndexInterval(), SSTableReader.getApproximateKeyCount(toUpgrade, cfs.metadata));
-        long estimatedSSTables = Math.max(1, SSTable.getTotalBytes(this.toUpgrade) / strategy.getMaxSSTableBytes());
+        long estimatedTotalKeys = Math.max(cfs.metadata.getMinIndexInterval(), SSTableReader.getApproximateKeyCount(toUpgrade));
+        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(this.toUpgrade) / strategy.getMaxSSTableBytes());
         this.estimatedRows = (long) Math.ceil((double) estimatedTotalKeys / estimatedSSTables);
     }
 
-    private SSTableWriter createCompactionWriter()
+    private SSTableWriter createCompactionWriter(long repairedAt)
     {
-        SSTableMetadata.Collector sstableMetadataCollector = SSTableMetadata.createCollector(cfs.getComparator());
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.getComparator());
 
         // Get the max timestamp of the precompacted sstables
         // and adds generation of live ancestors
@@ -78,63 +80,36 @@
             sstableMetadataCollector.sstableLevel(sstable.getSSTableLevel());
         }
 
-        return new SSTableWriter(cfs.getTempSSTablePath(directory), estimatedRows, cfs.metadata, cfs.partitioner, sstableMetadataCollector);
+        return new SSTableWriter(cfs.getTempSSTablePath(directory), estimatedRows, repairedAt, cfs.metadata, cfs.partitioner, sstableMetadataCollector);
     }
 
     public void upgrade()
     {
         outputHandler.output("Upgrading " + sstable);
 
-
-        AbstractCompactionIterable ci = new CompactionIterable(compactionType, strategy.getScanners(this.toUpgrade), controller);
-
-        CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
-
-        Collection<SSTableReader> sstables = new ArrayList<SSTableReader>();
-        Collection<SSTableWriter> writers = new ArrayList<SSTableWriter>();
-
-        try
+        SSTableRewriter writer = new SSTableRewriter(cfs, toUpgrade, CompactionTask.getMaxDataAge(this.toUpgrade), OperationType.UPGRADE_SSTABLES, true);
+        try (AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(this.toUpgrade))
         {
-            SSTableWriter writer = createCompactionWriter();
-            writers.add(writer);
+            Iterator<AbstractCompactedRow> iter = new CompactionIterable(compactionType, scanners.scanners, controller).iterator();
+            writer.switchWriter(createCompactionWriter(sstable.getSSTableMetadata().repairedAt));
             while (iter.hasNext())
             {
                 AbstractCompactedRow row = iter.next();
-
                 writer.append(row);
             }
 
-            long maxAge = CompactionTask.getMaxDataAge(this.toUpgrade);
-            for (SSTableWriter completedWriter : writers)
-                sstables.add(completedWriter.closeAndOpenReader(maxAge));
-
+            writer.finish();
             outputHandler.output("Upgrade of " + sstable + " complete.");
 
         }
         catch (Throwable t)
         {
-            for (SSTableWriter writer : writers)
-                writer.abort();
-            // also remove already completed SSTables
-            for (SSTableReader sstable : sstables)
-            {
-                sstable.markObsolete();
-                sstable.releaseReference();
-            }
+            writer.abort();
             throw Throwables.propagate(t);
         }
         finally
         {
             controller.close();
-
-            try
-            {
-                iter.close();
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
-            }
         }
     }
 
@@ -146,9 +121,9 @@
         }
 
         @Override
-        public boolean shouldPurge(DecoratedKey key, long delTimestamp)
+        public long maxPurgeableTimestamp(DecoratedKey key)
         {
-            return false;
+            return Long.MIN_VALUE;
         }
     }
 }
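
Side note on the new UpgradeController override above: returning Long.MIN_VALUE from maxPurgeableTimestamp() preserves the old never-purge behaviour of shouldPurge() == false, since a tombstone only qualifies for purging when its timestamp is strictly below that value. A minimal, illustrative helper (not part of the patch; the method name is hypothetical):

    // Illustration only: under the new contract a tombstone may be dropped only if its
    // timestamp is strictly below the value returned by maxPurgeableTimestamp(), so
    // Long.MIN_VALUE means "nothing is purgeable", matching the old shouldPurge() == false.
    static boolean purgeable(long tombstoneTimestamp, long maxPurgeableTimestamp)
    {
        return tombstoneTimestamp < maxPurgeableTimestamp; // never true for Long.MIN_VALUE
    }
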
diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCType.java b/src/java/org/apache/cassandra/db/composites/AbstractCType.java
new file mode 100644
index 0000000..5af7458
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/AbstractCType.java
@@ -0,0 +1,410 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.NativeCell;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
+import org.apache.cassandra.db.marshal.AbstractCompositeType;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
+
+public abstract class AbstractCType implements CType
+{
+    static final Comparator<Cell> rightNativeCell = new Comparator<Cell>()
+    {
+        public int compare(Cell o1, Cell o2)
+        {
+            return -((NativeCell) o2).compareTo(o1.name());
+        }
+    };
+
+    static final Comparator<Cell> neitherNativeCell = new Comparator<Cell>()
+    {
+        public int compare(Cell o1, Cell o2)
+        {
+            return compareUnsigned(o1.name(), o2.name());
+        }
+    };
+
+    // only one or the other of these will ever be used
+    static final Comparator<Object> asymmetricRightNativeCell = new Comparator<Object>()
+    {
+        public int compare(Object o1, Object o2)
+        {
+            return -((NativeCell) o2).compareTo((Composite) o1);
+        }
+    };
+
+    static final Comparator<Object> asymmetricNeitherNativeCell = new Comparator<Object>()
+    {
+        public int compare(Object o1, Object o2)
+        {
+            return compareUnsigned((Composite) o1, ((Cell) o2).name());
+        }
+    };
+
+    private final Comparator<Composite> reverseComparator;
+    private final Comparator<IndexInfo> indexComparator;
+    private final Comparator<IndexInfo> indexReverseComparator;
+
+    private final Serializer serializer;
+
+    private final ISerializer<IndexInfo> indexSerializer;
+    private final IVersionedSerializer<ColumnSlice> sliceSerializer;
+    private final IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer;
+    private final DeletionInfo.Serializer deletionInfoSerializer;
+    private final RangeTombstone.Serializer rangeTombstoneSerializer;
+    private final RowIndexEntry.Serializer rowIndexEntrySerializer;
+
+    protected final boolean isByteOrderComparable;
+
+    protected AbstractCType(boolean isByteOrderComparable)
+    {
+        reverseComparator = new Comparator<Composite>()
+        {
+            public int compare(Composite c1, Composite c2)
+            {
+                return AbstractCType.this.compare(c2, c1);
+            }
+        };
+        indexComparator = new Comparator<IndexInfo>()
+        {
+            public int compare(IndexInfo o1, IndexInfo o2)
+            {
+                return AbstractCType.this.compare(o1.lastName, o2.lastName);
+            }
+        };
+        indexReverseComparator = new Comparator<IndexInfo>()
+        {
+            public int compare(IndexInfo o1, IndexInfo o2)
+            {
+                return AbstractCType.this.compare(o1.firstName, o2.firstName);
+            }
+        };
+
+        serializer = new Serializer(this);
+
+        indexSerializer = new IndexInfo.Serializer(this);
+        sliceSerializer = new ColumnSlice.Serializer(this);
+        sliceQueryFilterSerializer = new SliceQueryFilter.Serializer(this);
+        deletionInfoSerializer = new DeletionInfo.Serializer(this);
+        rangeTombstoneSerializer = new RangeTombstone.Serializer(this);
+        rowIndexEntrySerializer = new RowIndexEntry.Serializer(this);
+        this.isByteOrderComparable = isByteOrderComparable;
+    }
+
+    protected static boolean isByteOrderComparable(Iterable<AbstractType<?>> types)
+    {
+        boolean isByteOrderComparable = true;
+        for (AbstractType<?> type : types)
+            isByteOrderComparable &= type.isByteOrderComparable();
+        return isByteOrderComparable;
+    }
+
+    static int compareUnsigned(Composite c1, Composite c2)
+    {
+        if (c1.isStatic() != c2.isStatic())
+        {
+            // Static sorts before non-static no matter what, except for empty which
+            // always sorts first
+            if (c1.isEmpty())
+                return c2.isEmpty() ? 0 : -1;
+            if (c2.isEmpty())
+                return 1;
+            return c1.isStatic() ? -1 : 1;
+        }
+
+        int s1 = c1.size();
+        int s2 = c2.size();
+        int minSize = Math.min(s1, s2);
+
+        for (int i = 0; i < minSize; i++)
+        {
+            int cmp = ByteBufferUtil.compareUnsigned(c1.get(i), c2.get(i));
+            if (cmp != 0)
+                return cmp;
+        }
+
+        if (s1 == s2)
+            return c1.eoc().compareTo(c2.eoc());
+        return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
+    }
+
+    public int compare(Composite c1, Composite c2)
+    {
+        if (c1.isStatic() != c2.isStatic())
+        {
+            // Static sorts before non-static no matter what, except for empty which
+            // always sorts first
+            if (c1.isEmpty())
+                return c2.isEmpty() ? 0 : -1;
+            if (c2.isEmpty())
+                return 1;
+            return c1.isStatic() ? -1 : 1;
+        }
+
+        int s1 = c1.size();
+        int s2 = c2.size();
+        int minSize = Math.min(s1, s2);
+
+        for (int i = 0; i < minSize; i++)
+        {
+            int cmp = isByteOrderComparable
+                      ? ByteBufferUtil.compareUnsigned(c1.get(i), c2.get(i))
+                      : subtype(i).compare(c1.get(i), c2.get(i));
+            if (cmp != 0)
+                return cmp;
+        }
+
+        if (s1 == s2)
+            return c1.eoc().compareTo(c2.eoc());
+        return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
+    }
+
+    protected Comparator<Cell> getByteOrderColumnComparator(boolean isRightNative)
+    {
+        if (isRightNative)
+            return rightNativeCell;
+        return neitherNativeCell;
+    }
+
+    protected Comparator<Object> getByteOrderAsymmetricColumnComparator(boolean isRightNative)
+    {
+        if (isRightNative)
+            return asymmetricRightNativeCell;
+        return asymmetricNeitherNativeCell;
+    }
+
+    public void validate(Composite name)
+    {
+        ByteBuffer previous = null;
+        for (int i = 0; i < name.size(); i++)
+        {
+            AbstractType<?> comparator = subtype(i);
+            ByteBuffer value = name.get(i);
+            comparator.validateCollectionMember(value, previous);
+            previous = value;
+        }
+    }
+
+    public boolean isCompatibleWith(CType previous)
+    {
+        if (this == previous)
+            return true;
+
+        // Extending with new components is fine, shrinking is not
+        if (size() < previous.size())
+            return false;
+
+        for (int i = 0; i < previous.size(); i++)
+        {
+            AbstractType<?> tprev = previous.subtype(i);
+            AbstractType<?> tnew = subtype(i);
+            if (!tnew.isCompatibleWith(tprev))
+                return false;
+        }
+        return true;
+    }
+
+    public String getString(Composite c)
+    {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < c.size(); i++)
+        {
+            if (i > 0)
+                sb.append(":");
+            sb.append(AbstractCompositeType.escape(subtype(i).getString(c.get(i))));
+        }
+        switch (c.eoc())
+        {
+            case START:
+                sb.append(":_");
+                break;
+            case END:
+                sb.append(":!");
+                break;
+        }
+        return sb.toString();
+    }
+
+    public Composite make(Object... components)
+    {
+        if (components.length > size())
+            throw new IllegalArgumentException("Too many components, max is " + size());
+
+        CBuilder builder = builder();
+        for (int i = 0; i < components.length; i++)
+        {
+            Object obj = components[i];
+            if (obj instanceof ByteBuffer)
+                builder.add((ByteBuffer)obj);
+            else
+                builder.add(obj);
+        }
+        return builder.build();
+    }
+
+    public CType.Serializer serializer()
+    {
+        return serializer;
+    }
+
+    public Comparator<Composite> reverseComparator()
+    {
+        return reverseComparator;
+    }
+
+    public Comparator<IndexInfo> indexComparator()
+    {
+        return indexComparator;
+    }
+
+    public Comparator<IndexInfo> indexReverseComparator()
+    {
+        return indexReverseComparator;
+    }
+
+    public ISerializer<IndexInfo> indexSerializer()
+    {
+        return indexSerializer;
+    }
+
+    public IVersionedSerializer<ColumnSlice> sliceSerializer()
+    {
+        return sliceSerializer;
+    }
+
+    public IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer()
+    {
+        return sliceQueryFilterSerializer;
+    }
+
+    public DeletionInfo.Serializer deletionInfoSerializer()
+    {
+        return deletionInfoSerializer;
+    }
+
+    public RangeTombstone.Serializer rangeTombstoneSerializer()
+    {
+        return rangeTombstoneSerializer;
+    }
+
+    public RowIndexEntry.Serializer rowIndexEntrySerializer()
+    {
+        return rowIndexEntrySerializer;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (o == null)
+            return false;
+
+        if (!getClass().equals(o.getClass()))
+            return false;
+
+        CType c = (CType)o;
+        if (size() != c.size())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+        {
+            if (!subtype(i).equals(c.subtype(i)))
+                return false;
+        }
+        return true;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int h = 31;
+        for (int i = 0; i < size(); i++)
+            h += subtype(i).hashCode();
+        return h + getClass().hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return asAbstractType().toString();
+    }
+
+    protected static ByteBuffer sliceBytes(ByteBuffer bb, int offs, int length)
+    {
+        ByteBuffer copy = bb.duplicate();
+        copy.position(offs);
+        copy.limit(offs + length);
+        return copy;
+    }
+
+    protected static void checkRemaining(ByteBuffer bb, int offs, int length)
+    {
+        if (offs + length > bb.limit())
+            throw new IllegalArgumentException("Not enough bytes");
+    }
+
+    private static class Serializer implements CType.Serializer
+    {
+        private final CType type;
+
+        public Serializer(CType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(Composite c, DataOutputPlus out) throws IOException
+        {
+            ByteBufferUtil.writeWithShortLength(c.toByteBuffer(), out);
+        }
+
+        public Composite deserialize(DataInput in) throws IOException
+        {
+            return type.fromByteBuffer(ByteBufferUtil.readWithShortLength(in));
+        }
+
+        public long serializedSize(Composite c, TypeSizes type)
+        {
+            return type.sizeofWithShortLength(c.toByteBuffer());
+        }
+
+        public void skip(DataInput in) throws IOException
+        {
+            ByteBufferUtil.skipShortLength(in);
+        }
+    }
+}
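
For readers new to the composites code, compare() above is a per-component, left-to-right comparison in which the end-of-component (EOC) byte acts as a tie-breaker when one composite is a strict prefix of the other. A simplified, standalone sketch, assuming every component is byte-order comparable (the isByteOrderComparable branch) and ignoring the static-vs-non-static special case:

    import java.util.List;

    // Simplified, standalone sketch of the prefix comparison implemented above.
    final class CompositeCompareSketch
    {
        // prefixResult1/2 stand in for c1.eoc().prefixComparisonResult and
        // c2.eoc().prefixComparisonResult (e.g. START -> -1, END -> +1).
        static int compare(List<byte[]> c1, int prefixResult1, List<byte[]> c2, int prefixResult2)
        {
            int minSize = Math.min(c1.size(), c2.size());
            for (int i = 0; i < minSize; i++)
            {
                int cmp = compareUnsigned(c1.get(i), c2.get(i));
                if (cmp != 0)
                    return cmp;
            }
            if (c1.size() == c2.size())
                return 0; // the real code then compares the two EOC values themselves
            // one composite is a strict prefix of the other: the EOC decides which side it sorts on
            return c1.size() < c2.size() ? prefixResult1 : -prefixResult2;
        }

        private static int compareUnsigned(byte[] a, byte[] b)
        {
            int min = Math.min(a.length, b.length);
            for (int i = 0; i < min; i++)
            {
                int cmp = (a[i] & 0xFF) - (b[i] & 0xFF);
                if (cmp != 0)
                    return cmp;
            }
            return a.length - b.length;
        }
    }
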
diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java
new file mode 100644
index 0000000..e33cc63
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java
@@ -0,0 +1,454 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.ColumnToCollectionType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class AbstractCellNameType extends AbstractCType implements CellNameType
+{
+    final Comparator<Cell> columnComparator;
+    private final Comparator<Cell> columnReverseComparator;
+    final Comparator<Object> asymmetricComparator;
+    private final Comparator<OnDiskAtom> onDiskAtomComparator;
+
+    private final ISerializer<CellName> cellSerializer;
+    private final ColumnSerializer columnSerializer;
+    private final OnDiskAtom.Serializer onDiskAtomSerializer;
+    private final IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer;
+    private final IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer;
+
+    protected AbstractCellNameType(boolean isByteOrderComparable)
+    {
+        super(isByteOrderComparable);
+        columnComparator = new Comparator<Cell>()
+        {
+            public int compare(Cell c1, Cell c2)
+            {
+                return AbstractCellNameType.this.compare(c1.name(), c2.name());
+            }
+        };
+        asymmetricComparator = new Comparator<Object>()
+        {
+            public int compare(Object c1, Object c2)
+            {
+                return AbstractCellNameType.this.compare((Composite) c1, ((Cell) c2).name());
+            }
+        };
+        columnReverseComparator = new Comparator<Cell>()
+        {
+            public int compare(Cell c1, Cell c2)
+            {
+                return AbstractCellNameType.this.compare(c2.name(), c1.name());
+            }
+        };
+        onDiskAtomComparator = new Comparator<OnDiskAtom>()
+        {
+            public int compare(OnDiskAtom c1, OnDiskAtom c2)
+            {
+                int comp = AbstractCellNameType.this.compare(c1.name(), c2.name());
+                if (comp != 0)
+                    return comp;
+
+                if (c1 instanceof RangeTombstone)
+                {
+                    if (c2 instanceof RangeTombstone)
+                    {
+                        RangeTombstone t1 = (RangeTombstone)c1;
+                        RangeTombstone t2 = (RangeTombstone)c2;
+                        int comp2 = AbstractCellNameType.this.compare(t1.max, t2.max);
+                        return comp2 == 0 ? t1.data.compareTo(t2.data) : comp2;
+                    }
+                    else
+                    {
+                        return -1;
+                    }
+                }
+                else
+                {
+                    return c2 instanceof RangeTombstone ? 1 : 0;
+                }
+            }
+        };
+
+        // A trivial wrapper over the composite serializer
+        cellSerializer = new ISerializer<CellName>()
+        {
+            public void serialize(CellName c, DataOutputPlus out) throws IOException
+            {
+                serializer().serialize(c, out);
+            }
+
+            public CellName deserialize(DataInput in) throws IOException
+            {
+                Composite ct = serializer().deserialize(in);
+                if (ct.isEmpty())
+                    throw ColumnSerializer.CorruptColumnException.create(in, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+
+                assert ct instanceof CellName : ct;
+                return (CellName)ct;
+            }
+
+            public long serializedSize(CellName c, TypeSizes type)
+            {
+                return serializer().serializedSize(c, type);
+            }
+        };
+        columnSerializer = new ColumnSerializer(this);
+        onDiskAtomSerializer = new OnDiskAtom.Serializer(this);
+        namesQueryFilterSerializer = new NamesQueryFilter.Serializer(this);
+        diskAtomFilterSerializer = new IDiskAtomFilter.Serializer(this);
+    }
+
+    public final Comparator<Cell> columnComparator(boolean isRightNative)
+    {
+        if (!isByteOrderComparable)
+            return columnComparator;
+        return getByteOrderColumnComparator(isRightNative);
+    }
+
+    public final Comparator<Object> asymmetricColumnComparator(boolean isRightNative)
+    {
+        if (!isByteOrderComparable)
+            return asymmetricComparator;
+        return getByteOrderAsymmetricColumnComparator(isRightNative);
+    }
+
+    public Comparator<Cell> columnReverseComparator()
+    {
+        return columnReverseComparator;
+    }
+
+    public Comparator<OnDiskAtom> onDiskAtomComparator()
+    {
+        return onDiskAtomComparator;
+    }
+
+    public ISerializer<CellName> cellSerializer()
+    {
+        return cellSerializer;
+    }
+
+    public ColumnSerializer columnSerializer()
+    {
+        return columnSerializer;
+    }
+
+    public OnDiskAtom.Serializer onDiskAtomSerializer()
+    {
+        return onDiskAtomSerializer;
+    }
+
+    public IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer()
+    {
+        return namesQueryFilterSerializer;
+    }
+
+    public IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer()
+    {
+        return diskAtomFilterSerializer;
+    }
+
+    public CellName cellFromByteBuffer(ByteBuffer bytes)
+    {
+        // we're not guaranteed to get a CellName back from fromByteBuffer(), so it's on the caller to guarantee this
+        return (CellName)fromByteBuffer(bytes);
+    }
+
+    public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public CellName rowMarker(Composite prefix)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public Composite staticPrefix()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public boolean hasCollections()
+    {
+        return false;
+    }
+
+    public boolean supportCollections()
+    {
+        return false;
+    }
+
+    public ColumnToCollectionType collectionType()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Composite make(Object... components)
+    {
+        return components.length == size() ? makeCellName(components) : super.make(components);
+    }
+
+    public CellName makeCellName(Object... components)
+    {
+        ByteBuffer[] rawComponents = new ByteBuffer[components.length];
+        for (int i = 0; i < components.length; i++)
+        {
+            Object c = components[i];
+            if (c instanceof ByteBuffer)
+            {
+                rawComponents[i] = (ByteBuffer)c;
+            }
+            else
+            {
+                AbstractType<?> type = subtype(i);
+                // If it's a collection type, we need to find the right collection and use the key comparator (since we're building a cell name)
+                if (type instanceof ColumnToCollectionType)
+                {
+                    assert i > 0;
+                    type = ((ColumnToCollectionType)type).defined.get(rawComponents[i-1]).nameComparator();
+                }
+                rawComponents[i] = ((AbstractType)type).decompose(c);
+            }
+        }
+        return makeCellName(rawComponents);
+    }
+
+    protected abstract CellName makeCellName(ByteBuffer[] components);
+
+    protected static CQL3Row.Builder makeDenseCQL3RowBuilder(final long now)
+    {
+        return new CQL3Row.Builder()
+        {
+            public CQL3Row.RowIterator group(Iterator<Cell> cells)
+            {
+                return new DenseRowIterator(cells, now);
+            }
+        };
+    }
+
+    private static class DenseRowIterator extends AbstractIterator<CQL3Row> implements CQL3Row.RowIterator
+    {
+        private final Iterator<Cell> cells;
+        private final long now;
+
+        public DenseRowIterator(Iterator<Cell> cells, long now)
+        {
+            this.cells = cells;
+            this.now = now;
+        }
+
+        public CQL3Row getStaticRow()
+        {
+            // There can't be static columns in dense tables
+            return null;
+        }
+
+        protected CQL3Row computeNext()
+        {
+            while (cells.hasNext())
+            {
+                final Cell cell = cells.next();
+                if (!cell.isLive(now))
+                    continue;
+
+                return new CQL3Row()
+                {
+                    public ByteBuffer getClusteringColumn(int i)
+                    {
+                        return cell.name().get(i);
+                    }
+
+                    public Cell getColumn(ColumnIdentifier name)
+                    {
+                        return cell;
+                    }
+
+                    public List<Cell> getCollection(ColumnIdentifier name)
+                    {
+                        return null;
+                    }
+                };
+            }
+            return endOfData();
+        }
+    }
+
+    protected static CQL3Row.Builder makeSparseCQL3RowBuilder(final CFMetaData cfMetaData, final CellNameType type, final long now)
+    {
+        return new CQL3Row.Builder()
+        {
+            public CQL3Row.RowIterator group(Iterator<Cell> cells)
+            {
+                return new SparseRowIterator(cfMetaData, type, cells, now);
+            }
+        };
+    }
+
+    private static class SparseRowIterator extends AbstractIterator<CQL3Row> implements CQL3Row.RowIterator
+    {
+        private final CFMetaData cfMetaData;
+        private final CellNameType type;
+        private final Iterator<Cell> cells;
+        private final long now;
+        private final CQL3Row staticRow;
+
+        private Cell nextCell;
+        private CellName previous;
+        private CQL3RowOfSparse currentRow;
+
+        public SparseRowIterator(CFMetaData cfMetaData, CellNameType type, Iterator<Cell> cells, long now)
+        {
+            this.cfMetaData = cfMetaData;
+            this.type = type;
+            this.cells = cells;
+            this.now = now;
+            this.staticRow = hasNextCell() && nextCell.name().isStatic()
+                           ? computeNext()
+                           : null;
+        }
+
+        public CQL3Row getStaticRow()
+        {
+            return staticRow;
+        }
+
+        private boolean hasNextCell()
+        {
+            if (nextCell != null)
+                return true;
+
+            while (cells.hasNext())
+            {
+                Cell cell = cells.next();
+                if (!cell.isLive(now))
+                    continue;
+
+                nextCell = cell;
+                return true;
+            }
+            return false;
+        }
+
+        protected CQL3Row computeNext()
+        {
+            while (hasNextCell())
+            {
+                CQL3Row toReturn = null;
+                CellName current = nextCell.name();
+                if (currentRow == null || !current.isSameCQL3RowAs(type, previous))
+                {
+                    toReturn = currentRow;
+                    currentRow = new CQL3RowOfSparse(cfMetaData, current);
+                }
+                currentRow.add(nextCell);
+                nextCell = null;
+                previous = current;
+
+                if (toReturn != null)
+                    return toReturn;
+            }
+            if (currentRow != null)
+            {
+                CQL3Row toReturn = currentRow;
+                currentRow = null;
+                return toReturn;
+            }
+            return endOfData();
+        }
+    }
+
+    private static class CQL3RowOfSparse implements CQL3Row
+    {
+        private final CFMetaData cfMetaData;
+        private final CellName cell;
+        private Map<ColumnIdentifier, Cell> columns;
+        private Map<ColumnIdentifier, List<Cell>> collections;
+
+        CQL3RowOfSparse(CFMetaData metadata, CellName cell)
+        {
+            this.cfMetaData = metadata;
+            this.cell = cell;
+        }
+
+        public ByteBuffer getClusteringColumn(int i)
+        {
+            return cell.get(i);
+        }
+
+        void add(Cell cell)
+        {
+            CellName cellName = cell.name();
+            ColumnIdentifier columnName =  cellName.cql3ColumnName(cfMetaData);
+            if (cellName.isCollectionCell())
+            {
+                if (collections == null)
+                    collections = new HashMap<>();
+
+                List<Cell> values = collections.get(columnName);
+                if (values == null)
+                {
+                    values = new ArrayList<Cell>();
+                    collections.put(columnName, values);
+                }
+                values.add(cell);
+            }
+            else
+            {
+                if (columns == null)
+                    columns = new HashMap<>();
+                columns.put(columnName, cell);
+            }
+        }
+
+        public Cell getColumn(ColumnIdentifier name)
+        {
+            return columns == null ? null : columns.get(name);
+        }
+
+        public List<Cell> getCollection(ColumnIdentifier name)
+        {
+            return collections == null ? null : collections.get(name);
+        }
+    }
+}
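
The non-obvious part of SparseRowIterator above is that a finished row is only returned once the first cell of the next row is seen (or the input is exhausted). A trivial, standalone sketch of that grouping pattern, with plain strings standing in for cells and the part before ':' standing in for the clustering prefix:

    import java.util.ArrayList;
    import java.util.List;

    // Standalone sketch: consecutive cells whose names share the same clustering
    // prefix belong to the same CQL3 row; a row is emitted only when a cell with
    // a different prefix shows up.
    final class GroupConsecutiveSketch
    {
        static List<List<String>> group(List<String> cells)
        {
            List<List<String>> rows = new ArrayList<List<String>>();
            String previousPrefix = null;
            List<String> currentRow = null;
            for (String cell : cells)
            {
                String prefix = cell.substring(0, cell.indexOf(':'));
                if (currentRow == null || !prefix.equals(previousPrefix))
                {
                    currentRow = new ArrayList<String>();
                    rows.add(currentRow);
                }
                currentRow.add(cell);
                previousPrefix = prefix;
            }
            return rows;
        }
    }

For example, group(Arrays.asList("1:a", "1:b", "2:a")) yields [[1:a, 1:b], [2:a]], mirroring how cells sharing a clustering prefix end up in one CQL3RowOfSparse.
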
diff --git a/src/java/org/apache/cassandra/db/composites/AbstractComposite.java b/src/java/org/apache/cassandra/db/composites/AbstractComposite.java
new file mode 100644
index 0000000..14fa16c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/AbstractComposite.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class AbstractComposite implements Composite
+{
+    public boolean isEmpty()
+    {
+        return size() == 0;
+    }
+
+    public boolean isStatic()
+    {
+        return false;
+    }
+
+    public EOC eoc()
+    {
+        return EOC.NONE;
+    }
+
+    public Composite start()
+    {
+        return withEOC(EOC.START);
+    }
+
+    public Composite end()
+    {
+        return withEOC(EOC.END);
+    }
+
+    public Composite withEOC(EOC newEoc)
+    {
+        // Note: BoundedComposite overrides this, so here we can assume the EOC of this composite is NONE
+        switch (newEoc)
+        {
+            case START:
+                return BoundedComposite.startOf(this);
+            case END:
+                return BoundedComposite.endOf(this);
+            default:
+                return this;
+        }
+    }
+
+    public ColumnSlice slice()
+    {
+        return new ColumnSlice(start(), end());
+    }
+
+    public ByteBuffer toByteBuffer()
+    {
+        // This is the legacy format of composites.
+        // See org.apache.cassandra.db.marshal.CompositeType for details.
+        ByteBuffer result = ByteBuffer.allocate(dataSize() + 3 * size() + (isStatic() ? 2 : 0));
+        if (isStatic())
+            ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER);
+
+        for (int i = 0; i < size(); i++)
+        {
+            ByteBuffer bb = get(i);
+            ByteBufferUtil.writeShortLength(result, bb.remaining());
+            result.put(bb.duplicate());
+            result.put((byte)0);
+        }
+        result.flip();
+        return result;
+    }
+
+    public int dataSize()
+    {
+        int size = 0;
+        for (int i = 0; i < size(); i++)
+            size += get(i).remaining();
+        return size;
+    }
+
+    public boolean isPrefixOf(CType type, Composite c)
+    {
+        if (size() > c.size() || isStatic() != c.isStatic())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+        {
+            if (type.subtype(i).compare(get(i), c.get(i)) != 0)
+                return false;
+        }
+        return true;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if(!(o instanceof Composite))
+            return false;
+
+        Composite c = (Composite)o;
+        if (size() != c.size() || isStatic() != c.isStatic())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+        {
+            if (!get(i).equals(c.get(i)))
+                return false;
+        }
+        return eoc() == c.eoc();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int h = 31;
+        for (int i = 0; i < size(); i++)
+            h += get(i).hashCode();
+        return h + eoc().hashCode() + (isStatic() ? 1 : 0);
+    }
+}
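
toByteBuffer() above writes the legacy layout shared with org.apache.cassandra.db.marshal.CompositeType: an optional 2-byte static marker, then for every component a 2-byte length, the component bytes, and one end-of-component (EOC) byte, which is where the dataSize() + 3 * size() allocation comes from. A standalone sketch of that layout for the common case (no static marker, EOC NONE written as a 0 byte):

    import java.nio.ByteBuffer;

    // Standalone sketch of the legacy composite layout produced by toByteBuffer() above:
    // per component, a 2-byte big-endian length, the component bytes, and a 0 EOC byte.
    final class LegacyCompositeLayoutSketch
    {
        static ByteBuffer serialize(byte[]... components)
        {
            int dataSize = 0;
            for (byte[] component : components)
                dataSize += component.length;

            // 3 extra bytes per component: 2 for the length, 1 for the EOC byte
            ByteBuffer result = ByteBuffer.allocate(dataSize + 3 * components.length);
            for (byte[] component : components)
            {
                result.putShort((short) component.length);
                result.put(component);
                result.put((byte) 0);
            }
            result.flip();
            return result;
        }
    }
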
diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java
new file mode 100644
index 0000000..40537ed
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+
+public abstract class AbstractCompoundCellNameType extends AbstractCellNameType
+{
+    protected final CompoundCType clusteringType;
+    protected final CompoundCType fullType;
+
+    protected final int clusteringSize;
+    protected final int fullSize;
+
+    protected AbstractCompoundCellNameType(CompoundCType clusteringType, CompoundCType fullType)
+    {
+        super(isByteOrderComparable(fullType.types));
+        this.clusteringType = clusteringType;
+        this.fullType = fullType;
+
+        this.clusteringSize = clusteringType.size();
+        this.fullSize = fullType.size();
+    }
+
+    public int clusteringPrefixSize()
+    {
+        return clusteringSize;
+    }
+
+    public boolean isCompound()
+    {
+        return true;
+    }
+
+    public int size()
+    {
+        return fullSize;
+    }
+
+    public AbstractType<?> subtype(int i)
+    {
+        return fullType.subtype(i);
+    }
+
+    public CBuilder prefixBuilder()
+    {
+        return clusteringType.builder();
+    }
+
+    public CBuilder builder()
+    {
+        return new CompoundCType.CompoundCBuilder(this);
+    }
+
+    @Override
+    public Composite fromByteBuffer(ByteBuffer bytes)
+    {
+        if (!bytes.hasRemaining())
+            return Composites.EMPTY;
+
+        ByteBuffer[] elements = new ByteBuffer[fullSize];
+        int idx = bytes.position(), i = 0;
+        byte eoc = 0;
+
+        boolean isStatic = false;
+        if (CompositeType.isStaticName(bytes))
+        {
+            isStatic = true;
+            idx += 2;
+        }
+
+        while (idx < bytes.limit())
+        {
+            checkRemaining(bytes, idx, 2);
+            int length = bytes.getShort(idx) & 0xFFFF;
+            idx += 2;
+
+            checkRemaining(bytes, idx, length + 1);
+            elements[i++] = sliceBytes(bytes, idx, length);
+            idx += length;
+            eoc = bytes.get(idx++);
+        }
+
+        return makeWith(elements, i, Composite.EOC.from(eoc), isStatic);
+    }
+
+    public AbstractType<?> asAbstractType()
+    {
+        return CompositeType.getInstance(fullType.types);
+    }
+
+    public Deserializer newDeserializer(DataInput in)
+    {
+        return new CompositeDeserializer(this, in);
+    }
+
+    protected CellName makeCellName(ByteBuffer[] components)
+    {
+        return (CellName)makeWith(components, components.length, Composite.EOC.NONE, false);
+    }
+
+    protected abstract Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic);
+    protected abstract Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic);
+
+    private static class CompositeDeserializer implements CellNameType.Deserializer
+    {
+        private static byte[] EMPTY = new byte[0];
+
+        private final AbstractCompoundCellNameType type;
+        private final DataInput in;
+
+        private byte[] nextFull;
+        private int nextIdx;
+
+        private final ByteBuffer[] nextComponents;
+        private int nextSize;
+        private Composite.EOC nextEOC;
+        private boolean nextIsStatic;
+
+        public CompositeDeserializer(AbstractCompoundCellNameType type, DataInput in)
+        {
+            this.type = type;
+            this.in = in;
+            this.nextComponents = new ByteBuffer[type.size()];
+        }
+
+        public boolean hasNext() throws IOException
+        {
+            if (nextFull == null)
+                maybeReadNext();
+            return nextFull != EMPTY;
+        }
+
+        public boolean hasUnprocessed() throws IOException
+        {
+            return nextFull != null;
+        }
+
+        public int compareNextTo(Composite composite) throws IOException
+        {
+            maybeReadNext();
+
+            if (composite.isEmpty())
+                return nextFull == EMPTY ? 0 : 1;
+
+            if (nextFull == EMPTY)
+                return -1;
+
+            if (nextIsStatic != composite.isStatic())
+                return nextIsStatic ? -1 : 1;
+
+            ByteBuffer previous = null;
+            for (int i = 0; i < composite.size(); i++)
+            {
+                if (!hasComponent(i))
+                    return nextEOC == Composite.EOC.END ? 1 : -1;
+
+                AbstractType<?> comparator = type.subtype(i);
+                ByteBuffer value1 = nextComponents[i];
+                ByteBuffer value2 = composite.get(i);
+
+                int cmp = comparator.compareCollectionMembers(value1, value2, previous);
+                if (cmp != 0)
+                    return cmp;
+
+                previous = value1;
+            }
+
+            // If we have more components than the composite does
+            if (!allComponentsDeserialized() || composite.size() < nextSize)
+                return composite.eoc() == Composite.EOC.END ? -1 : 1;
+
+            // same size, check eoc
+            if (nextEOC != composite.eoc())
+            {
+                switch (nextEOC)
+                {
+                    case START: return -1;
+                    case END:   return 1;
+                    case NONE:  return composite.eoc() == Composite.EOC.START ? 1 : -1;
+                }
+            }
+
+            return 0;
+        }
+
+        private boolean hasComponent(int i)
+        {
+            while (i >= nextSize && deserializeOne())
+                continue;
+
+            return i < nextSize;
+        }
+
+        private int readShort()
+        {
+            return ((nextFull[nextIdx++] & 0xFF) << 8) | (nextFull[nextIdx++] & 0xFF);
+        }
+
+        private int peekShort()
+        {
+            return ((nextFull[nextIdx] & 0xFF) << 8) | (nextFull[nextIdx+1] & 0xFF);
+        }
+
+        private boolean deserializeOne()
+        {
+            if (allComponentsDeserialized())
+                return false;
+
+            int length = readShort();
+            ByteBuffer component = ByteBuffer.wrap(nextFull, nextIdx, length);
+            nextIdx += length;
+            nextComponents[nextSize++] = component;
+            nextEOC = Composite.EOC.from(nextFull[nextIdx++]);
+            return true;
+        }
+
+        private void deserializeAll()
+        {
+            while (deserializeOne())
+                continue;
+        }
+
+        private boolean allComponentsDeserialized()
+        {
+            return nextIdx >= nextFull.length;
+        }
+
+        private void maybeReadNext() throws IOException
+        {
+            if (nextFull != null)
+                return;
+
+            nextIdx = 0;
+            nextSize = 0;
+
+            int length = in.readShort() & 0xFFFF;
+            // Note that empty is ok because it marks the end of the row
+            if (length == 0)
+            {
+                nextFull = EMPTY;
+                return;
+            }
+
+            nextFull = new byte[length];
+            in.readFully(nextFull);
+
+            // Is it a static name?
+            nextIsStatic = false;
+            if (peekShort() == CompositeType.STATIC_MARKER)
+            {
+                nextIsStatic = true;
+                readShort(); // Skip the static marker
+            }
+        }
+
+        public Composite readNext() throws IOException
+        {
+            maybeReadNext();
+            if (nextFull == EMPTY)
+                return Composites.EMPTY;
+
+            deserializeAll();
+            Composite c = type.copyAndMakeWith(nextComponents, nextSize, nextEOC, nextIsStatic);
+            nextFull = null;
+            return c;
+        }
+
+        public void skipNext() throws IOException
+        {
+            maybeReadNext();
+            nextFull = null;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java
new file mode 100644
index 0000000..b3f4778
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.NativeCell;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class AbstractSimpleCellNameType extends AbstractCellNameType
+{
+    protected final AbstractType<?> type;
+
+    static final Comparator<Cell> rightNativeCell = new Comparator<Cell>()
+    {
+        public int compare(Cell o1, Cell o2)
+        {
+            return -((NativeCell) o2).compareToSimple(o1.name());
+        }
+    };
+
+    static final Comparator<Cell> neitherNativeCell = new Comparator<Cell>()
+    {
+        public int compare(Cell o1, Cell o2)
+        {
+            return compareUnsigned(o1.name(), o2.name());
+        }
+    };
+
+    // only one or the other of these will ever be used
+    static final Comparator<Object> asymmetricRightNativeCell = new Comparator<Object>()
+    {
+        public int compare(Object o1, Object o2)
+        {
+            return -((NativeCell) o2).compareToSimple((Composite) o1);
+        }
+    };
+
+    static final Comparator<Object> asymmetricNeitherNativeCell = new Comparator<Object>()
+    {
+        public int compare(Object o1, Object o2)
+        {
+            return compareUnsigned((Composite) o1, ((Cell) o2).name());
+        }
+    };
+
+    protected AbstractSimpleCellNameType(AbstractType<?> type)
+    {
+        super(type.isByteOrderComparable());
+        this.type = type;
+    }
+
+    public boolean isCompound()
+    {
+        return false;
+    }
+
+    public int size()
+    {
+        return 1;
+    }
+
+    @Inline
+    static int compareUnsigned(Composite c1, Composite c2)
+    {
+        ByteBuffer b1 = c1.toByteBuffer();
+        ByteBuffer b2 = c2.toByteBuffer();
+        return ByteBufferUtil.compareUnsigned(b1, b2);
+    }
+
+    public int compare(Composite c1, Composite c2)
+    {
+        if (isByteOrderComparable)
+            return compareUnsigned(c1, c2);
+
+        assert !(c1.isEmpty() | c2.isEmpty());
+        return type.compare(c1.get(0), c2.get(0));
+    }
+
+    protected Comparator<Cell> getByteOrderColumnComparator(boolean isRightNative)
+    {
+        if (isRightNative)
+            return rightNativeCell;
+        return neitherNativeCell;
+    }
+
+    protected Comparator<Object> getByteOrderAsymmetricColumnComparator(boolean isRightNative)
+    {
+        if (isRightNative)
+            return asymmetricRightNativeCell;
+        return asymmetricNeitherNativeCell;
+    }
+
+    public AbstractType<?> subtype(int i)
+    {
+        if (i != 0)
+            throw new IllegalArgumentException();
+        return type;
+    }
+
+    protected CellName makeCellName(ByteBuffer[] components)
+    {
+        assert components.length == 1;
+        return cellFromByteBuffer(components[0]);
+    }
+
+    public CBuilder builder()
+    {
+        return new SimpleCType.SimpleCBuilder(this);
+    }
+
+    public AbstractType<?> asAbstractType()
+    {
+        return type;
+    }
+
+    public Deserializer newDeserializer(DataInput in)
+    {
+        return new SimpleDeserializer(this, in);
+    }
+
+    private static class SimpleDeserializer implements CellNameType.Deserializer
+    {
+        private final AbstractSimpleCellNameType type;
+        private ByteBuffer next;
+        private final DataInput in;
+
+        public SimpleDeserializer(AbstractSimpleCellNameType type, DataInput in)
+        {
+            this.type = type;
+            this.in = in;
+        }
+
+        public boolean hasNext() throws IOException
+        {
+            if (next == null)
+                maybeReadNext();
+
+            return next.hasRemaining();
+        }
+
+        public boolean hasUnprocessed() throws IOException
+        {
+            return next != null;
+        }
+
+        public int compareNextTo(Composite composite) throws IOException
+        {
+            maybeReadNext();
+
+            if (composite.isEmpty())
+                return next.hasRemaining() ? 1 : 0;
+
+            return type.subtype(0).compare(next, composite.get(0));
+        }
+
+        private void maybeReadNext() throws IOException
+        {
+            if (next != null)
+                return;
+
+            int length = in.readShort() & 0xFFFF;
+            // Note that empty is ok because it marks the end of the row
+            if (length == 0)
+            {
+                next = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+                return;
+            }
+
+            byte[] b = new byte[length];
+            in.readFully(b);
+            next = ByteBuffer.wrap(b);
+        }
+
+        public Composite readNext() throws IOException
+        {
+            maybeReadNext();
+            Composite c = type.fromByteBuffer(next);
+            next = null;
+            return c;
+        }
+
+        public void skipNext() throws IOException
+        {
+            maybeReadNext();
+            next = null;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/composites/BoundedComposite.java b/src/java/org/apache/cassandra/db/composites/BoundedComposite.java
new file mode 100644
index 0000000..7f596fe
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/BoundedComposite.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+/**
+ * Wraps another Composite and adds an EOC byte to track whether this is a slice start or end.
+ */
+public class BoundedComposite extends AbstractComposite
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new BoundedComposite(null, false));
+
+    private final Composite wrapped;
+    private final boolean isStart;
+
+    private BoundedComposite(Composite wrapped, boolean isStart)
+    {
+        this.wrapped = wrapped;
+        this.isStart = isStart;
+    }
+
+    static Composite startOf(Composite c)
+    {
+        return new BoundedComposite(c, true);
+    }
+
+    static Composite endOf(Composite c)
+    {
+        return new BoundedComposite(c, false);
+    }
+
+    public int size()
+    {
+        return wrapped.size();
+    }
+
+    public boolean isStatic()
+    {
+        return wrapped.isStatic();
+    }
+
+    public ByteBuffer get(int i)
+    {
+        return wrapped.get(i);
+    }
+
+    @Override
+    public EOC eoc()
+    {
+        return isStart ? EOC.START : EOC.END;
+    }
+
+    @Override
+    public Composite withEOC(EOC eoc)
+    {
+        switch (eoc)
+        {
+            case START:
+                return isStart ? this : startOf(wrapped);
+            case END:
+                return isStart ? endOf(wrapped) : this;
+            default:
+                return wrapped;
+        }
+    }
+
+    @Override
+    public ByteBuffer toByteBuffer()
+    {
+        ByteBuffer bb = wrapped.toByteBuffer();
+        bb.put(bb.remaining() - 1, (byte)(isStart ? -1 : 1));
+        return bb;
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + wrapped.unsharedHeapSize();
+    }
+
+    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new BoundedComposite(wrapped.copy(cfm, allocator), isStart);
+    }
+}
diff --git a/src/java/org/apache/cassandra/tools/NodeToolHelp.java b/src/java/org/apache/cassandra/db/composites/CBuilder.java
similarity index 68%
copy from src/java/org/apache/cassandra/tools/NodeToolHelp.java
copy to src/java/org/apache/cassandra/db/composites/CBuilder.java
index c89e48c..39035cb 100644
--- a/src/java/org/apache/cassandra/tools/NodeToolHelp.java
+++ b/src/java/org/apache/cassandra/db/composites/CBuilder.java
@@ -15,22 +15,22 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.tools;
+package org.apache.cassandra.db.composites;
 
+import java.nio.ByteBuffer;
 import java.util.List;
 
-public class NodeToolHelp
+/**
+ * A builder of Composite.
+ */
+public interface CBuilder
 {
-    public List<NodeToolCommand> commands;
+    public int remainingCount();
 
-    public static class NodeToolCommand
-    {
-        public String name;
-        public String help;
+    public CBuilder add(ByteBuffer value);
+    public CBuilder add(Object value);
 
-        public String toString()
-        {
-            return name;
-        }
-    }
+    public Composite build();
+    public Composite buildWith(ByteBuffer value);
+    public Composite buildWith(List<ByteBuffer> values);
 }
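
The CBuilder interface above is the incremental counterpart to CType.make(): components are added in position order and build() returns the resulting (possibly partial) Composite. A hypothetical usage sketch, not part of the patch; it only assumes the interface shown here plus ByteBufferUtil.bytes(String), with a 'type' whose first two subtypes are text and int:

    import org.apache.cassandra.db.composites.CType;
    import org.apache.cassandra.db.composites.Composite;
    import org.apache.cassandra.utils.ByteBufferUtil;

    // Hypothetical usage sketch (not part of the patch): each add() fills the next
    // position; ByteBuffers are taken as-is, other objects are decomposed by the
    // matching subtype, as in AbstractCType.make().
    final class CBuilderUsageSketch
    {
        static Composite makePrefix(CType type, String textComponent, int intComponent)
        {
            return type.builder()
                       .add(ByteBufferUtil.bytes(textComponent)) // already-serialized component
                       .add(intComponent)                        // decomposed by its subtype
                       .build();
        }
    }
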
diff --git a/src/java/org/apache/cassandra/db/composites/CType.java b/src/java/org/apache/cassandra/db/composites/CType.java
new file mode 100644
index 0000000..280f7af
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CType.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.IVersionedSerializer;
+
+import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
+
+/**
+ * A type for a Composite.
+ *
+ * There are essentially two types of Composite, and thus of CType:
+ *   1. the "simple" ones, see SimpleCType.
+ *   2. the "truly-composite" ones, see CompoundCType.
+ *
+ * API-wise, a CType is simply a collection of AbstractType with a few utility
+ * methods.
+ */
+public interface CType extends Comparator<Composite>
+{
+    /**
+     * Returns whether this is a "truly-composite" underneath.
+     */
+    public boolean isCompound();
+
+    /**
+     * The number of subtypes for this CType.
+     */
+    public int size();
+
+    int compare(Composite o1, Composite o2);
+
+    /**
+     * Gets a subtype of this CType.
+     */
+    public AbstractType<?> subtype(int i);
+
+    /**
+     * A builder of Composite.
+     */
+    public CBuilder builder();
+
+    /**
+     * Convenience method to build composites from their components.
+     *
+     * The arguments can be either ByteBuffers or actual objects of the type
+     * corresponding to their position.
+     */
+    public Composite make(Object... components);
+
+    /**
+     * Validates a composite.
+     */
+    public void validate(Composite name);
+
+    /**
+     * Converts a composite to a user-readable string.
+     */
+    public String getString(Composite c);
+
+    /**
+     * See AbstractType#isCompatibleWith.
+     */
+    public boolean isCompatibleWith(CType previous);
+
+    /**
+     * Returns a new CType that is equivalent to this CType but with
+     * one of the subtypes replaced by the provided new type.
+     */
+    public CType setSubtype(int position, AbstractType<?> newType);
+
+    /**
+     * Deserialize a Composite from a ByteBuffer.
+     *
+     * This is meant for thrift/cql2 to convert the fully serialized buffer we
+     * get from the clients to composites.
+     */
+    public Composite fromByteBuffer(ByteBuffer bb);
+
+    /**
+     * Returns an AbstractType corresponding to this CType, for thrift/cql2's sake.
+     *
+     * If the CType is a "simple" one, this just returns the wrapped type; otherwise
+     * it returns the corresponding org.apache.cassandra.db.marshal.CompositeType.
+     *
+     * This is only meant to be used for backward compatibility (particularly for
+     * thrift/cql2); it's not meant to be used internally.
+     */
+    public AbstractType<?> asAbstractType();
+
+
+    /**********************************************************/
+
+    /*
+     * What follows is a number of per-CType instances of the Comparators and Serializers used throughout
+     * the code. The reason we need these is that we want per-CType/per-CellNameType Composite/CellName
+     * serializers, which means the following instances have to depend on the type too.
+     */
+
+    public Comparator<Composite> reverseComparator();
+    public Comparator<IndexInfo> indexComparator();
+    public Comparator<IndexInfo> indexReverseComparator();
+
+    public Serializer serializer();
+
+    public ISerializer<IndexInfo> indexSerializer();
+    public IVersionedSerializer<ColumnSlice> sliceSerializer();
+    public IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer();
+    public DeletionInfo.Serializer deletionInfoSerializer();
+    public RangeTombstone.Serializer rangeTombstoneSerializer();
+    public RowIndexEntry.Serializer rowIndexEntrySerializer();
+
+    public interface Serializer extends ISerializer<Composite>
+    {
+        public void skip(DataInput in) throws IOException;
+    }
+}
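
As an illustration (not part of the patch), the convenience methods above could be exercised as
follows, assuming "type" is a CType over (UTF8Type, Int32Type); names and values are illustrative:

    Composite c = type.make("pk", 42);     // accepts ByteBuffers or the subtypes' native values
    type.validate(c);                      // checks each component against its subtype
    String readable = type.getString(c);   // human-readable rendering of the composite
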
diff --git a/src/java/org/apache/cassandra/db/composites/CellName.java b/src/java/org/apache/cassandra/db/composites/CellName.java
new file mode 100644
index 0000000..4d778d3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CellName.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A CellName is a Composite, but for which, for the sake of CQL3, we
+ * distinguish different parts: a CellName has first a number of clustering
+ * components, followed by the CQL3 column name, and then possibly followed by
+ * a collection element part.
+ *
+ * The clustering prefix can itself be composed of multiple components. It can
+ * also be empty if the table has no clustering keys. In general, the CQL3
+ * column name follows. However, some types of COMPACT STORAGE layout do not
+ * store the CQL3 column name in the cell name, so this part can be null (we
+ * call "dense" the cells whose names don't store the CQL3 column name).
+ *
+ * Lastly, if the cell is part of a CQL3 collection, we'll have a last
+ * component (a UUID for lists, an element for sets and a key for maps).
+ */
+public interface CellName extends Composite
+{
+    /**
+     * The number of clustering components.
+     *
+     * It can be 0 if the table has no clustering columns, and it can be
+     * equal to size() if the table is dense (in which case cql3ColumnName()
+     * will be null).
+     */
+    public int clusteringSize();
+
+    /**
+     * The name of the CQL3 column this cell represents.
+     *
+     * Will be null for cells of "dense" tables.
+     * @param metadata the metadata of the table this cell is part of
+     */
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata);
+
+    /**
+     * The value of the collection element, or null if the cell is not part
+     * of a collection (i.e. if !isCollectionCell()).
+     */
+    public ByteBuffer collectionElement();
+    public boolean isCollectionCell();
+
+    /**
+     * Whether this cell is part of the same CQL3 row as the other cell.
+     */
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other);
+
+    // If cellnames were sharing some prefix components, this will break it, so
+    // we might want to try to do better.
+    @Override
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator);
+
+    public long unsharedHeapSizeExcludingData();
+}
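
As an illustration (not part of the patch), the logical parts of a CellName can be read back as
follows ("name" and "metadata" are assumed to be a CellName and its table's CFMetaData):

    int clusteringComponents = name.clusteringSize();
    ColumnIdentifier column = name.cql3ColumnName(metadata);    // null for "dense" layouts
    if (name.isCollectionCell())
    {
        ByteBuffer element = name.collectionElement();          // list UUID, set element or map key
    }
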
diff --git a/src/java/org/apache/cassandra/db/composites/CellNameType.java b/src/java/org/apache/cassandra/db/composites/CellNameType.java
new file mode 100644
index 0000000..4f45d41
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CellNameType.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.ColumnSerializer;
+import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.ColumnToCollectionType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.IVersionedSerializer;
+
+/**
+ * The type of CellNames.
+ *
+ * In the same way that a CellName is a Composite, a CellNameType is a CType, but
+ * with a number of methods specific to cell names.
+ *
+ * On top of the simple/truly-composite dichotomy of composites, cell names come
+ * in 2 variants: "dense" and "sparse". The sparse ones are CellNames where one of
+ * the components (the last, or second-to-last for collections) is used to store the
+ * CQL3 column name. Dense ones are those for which that is not the case.
+ *
+ * In other words, we have 4 types of CellName/CellNameType which correspond to the
+ * 4 types of table layout that we need to distinguish:
+ *   1. Simple (non-truly-composite) dense: these are the dynamic thrift CFs whose
+ *      comparator is not composite.
+ *   2. Composite dense: these are the dynamic thrift CFs with a CompositeType comparator.
+ *   3. Simple (non-truly-composite) sparse: these are the thrift static CFs (those that
+ *      don't have a composite comparator).
+ *   4. Composite sparse: this is the CQL3 layout (note that this is the only one that
+ *      supports collections).
+ */
+public interface CellNameType extends CType
+{
+    /**
+     * Whether or not the cell names for this type are dense.
+     */
+    public boolean isDense();
+
+    /**
+     * The number of clustering columns for the table this is the type of.
+     */
+    public int clusteringPrefixSize();
+
+    /**
+     * A builder for the clustering prefix.
+     */
+    public CBuilder prefixBuilder();
+
+    /**
+     * The prefix to use for static columns.
+     *
+     * Note that the create() methods below for creating CellNames already handle static columns
+     * for convenience, so there is no need to pass this prefix to them. There are a few other cases
+     * where we need the prefix directly, however.
+     */
+    public Composite staticPrefix();
+
+    /**
+     * Whether or not there are collections defined in this type.
+     */
+    public boolean hasCollections();
+
+    /**
+     * Whether or not this type layout support collections.
+     */
+    public boolean supportCollections();
+
+    /**
+     * The type of the collections (or null if the type has no collections).
+     */
+    public ColumnToCollectionType collectionType();
+
+    /**
+     * Returns the new type obtained by adding/updating the collection type for the provided
+     * column name in this type.
+     */
+    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection);
+
+    /**
+     * Returns a new CellNameType that is equivalent to this one but with one
+     * of the subtypes replaced by the provided new type.
+     */
+    @Override
+    public CellNameType setSubtype(int position, AbstractType<?> newType);
+
+    /**
+     * Creates a row marker for the CQL3 row having the provided clustering prefix.
+     *
+     * Note that this is only valid for CQL3 tables (isCompound() and !isDense()) and should
+     * only be called for them.
+     */
+    public CellName rowMarker(Composite prefix);
+
+    /**
+     * Creates a new CellName given a clustering prefix and a CQL3 column.
+     *
+     * Note that for dense types, the column can be null as a shortcut for designating the only
+     * COMPACT_VALUE column of the table.
+     */
+    public CellName create(Composite prefix, ColumnDefinition column);
+
+    /**
+     * Creates a new collection CellName given a clustering prefix, a CQL3 column and the collection element.
+     */
+    public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement);
+
+    /**
+     * Convenience method to create cell names given their components.
+     *
+     * This is equivalent to CType#make() but returns a full cell name (and thus
+     * requires all the components of the name).
+     */
+    public CellName makeCellName(Object... components);
+
+    /**
+     * Deserialize a Composite from a ByteBuffer.
+     *
+     * This is equivalent to CType#fromByteBuffer but assumes the buffer is a full cell
+     * name. This is meant for thrift/cql2 to convert the fully serialized buffer we
+     * get from the clients.
+     */
+    public CellName cellFromByteBuffer(ByteBuffer bb);
+
+    /**
+     * Creates a new CQL3Row builder for this type. See CQL3Row for details.
+     */
+    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now);
+
+    // The two following methods are used to pass the declared regular column names (in CFMetaData)
+    // to the CellNameType. This is only done for optimization's sake; see SparseCellNameType.
+    public void addCQL3Column(ColumnIdentifier id);
+    public void removeCQL3Column(ColumnIdentifier id);
+
+    /**
+     * Creates a new Deserializer. This is used by AtomDeserializer to do incremental and on-demand
+     * deserialization of the on-disk atoms. See AtomDeserializer for details.
+     */
+    public Deserializer newDeserializer(DataInput in);
+
+    /*
+     * As in CType, what follows is a number of per-CellNameType instances of the Comparators and Serializers
+     * used throughout the code (those that require a full CellName rather than just a Composite).
+     */
+
+    // Ultimately, those might be split into an IVersionedSerializer and an ISSTableSerializer
+    public ISerializer<CellName> cellSerializer();
+
+    public Comparator<Cell> columnComparator(boolean isRightNative);
+    public Comparator<Object> asymmetricColumnComparator(boolean isRightNative);
+    public Comparator<Cell> columnReverseComparator();
+    public Comparator<OnDiskAtom> onDiskAtomComparator();
+
+    public ColumnSerializer columnSerializer();
+    public OnDiskAtom.Serializer onDiskAtomSerializer();
+    public IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer();
+    public IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer();
+
+    public interface Deserializer
+    {
+        /**
+         * Whether this deserializer is done or not, i.e. whether we've reached the end-of-row marker.
+         */
+        public boolean hasNext() throws IOException;
+
+        /**
+         * Whether or not some name has been read but not consumed by readNext.
+         */
+        public boolean hasUnprocessed() throws IOException;
+
+        /**
+         * Compare the next name to read to the provided Composite.
+         * This does not consume the next name.
+         */
+        public int compareNextTo(Composite composite) throws IOException;
+
+        /**
+         * Actually consume the next name and return it.
+         */
+        public Composite readNext() throws IOException;
+
+        /**
+         * Skip the next name (consuming it).
+         */
+        public void skipNext() throws IOException;
+    }
+}
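
As an illustration (not part of the patch), the cells of one CQL3 row could be created as follows,
assuming "type" is the table's CellNameType (one clustering column) and "column" a regular
ColumnDefinition:

    Composite prefix = type.prefixBuilder().add(ByteBufferUtil.bytes("ck1")).build();
    CellName marker = type.rowMarker(prefix);     // the CQL3 row marker (compound, non-dense tables only)
    CellName cell = type.create(prefix, column);  // the cell holding "column" for that row
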
diff --git a/src/java/org/apache/cassandra/db/composites/CellNames.java b/src/java/org/apache/cassandra/db/composites/CellNames.java
new file mode 100644
index 0000000..b941166
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CellNames.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ColumnToCollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+
+public abstract class CellNames
+{
+    private CellNames() {}
+
+    public static CellNameType fromAbstractType(AbstractType<?> type, boolean isDense)
+    {
+        if (isDense)
+        {
+            if (type instanceof CompositeType)
+            {
+                return new CompoundDenseCellNameType(((CompositeType)type).types);
+            }
+            else
+            {
+                return new SimpleDenseCellNameType(type);
+            }
+        }
+        else
+        {
+            if (type instanceof CompositeType)
+            {
+                List<AbstractType<?>> types = ((CompositeType)type).types;
+                if (types.get(types.size() - 1) instanceof ColumnToCollectionType)
+                {
+                    // We don't allow collections for super columns, so the "name" type *must* be UTF8
+                    assert types.get(types.size() - 2) instanceof UTF8Type;
+                    return new CompoundSparseCellNameType.WithCollection(types.subList(0, types.size() - 2), (ColumnToCollectionType)types.get(types.size() - 1));
+                }
+                else
+                {
+                    AbstractType<?> nameType = types.get(types.size() - 1);
+                    return new CompoundSparseCellNameType(types.subList(0, types.size() - 1), nameType);
+                }
+            }
+            else
+            {
+                return new SimpleSparseCellNameType(type);
+            }
+        }
+    }
+
+    // Mainly for tests and a few cases where we know what we need and don't want to pass the type around.
+    // Avoid in general, prefer the CellNameType methods.
+    public static CellName simpleDense(ByteBuffer bb)
+    {
+        assert bb.hasRemaining();
+        return new SimpleDenseCellName(bb);
+    }
+
+    public static CellName simpleSparse(ColumnIdentifier identifier)
+    {
+        return new SimpleSparseCellName(identifier);
+    }
+
+    // Mainly for tests and a few cases where we know what we need and don't want to pass the type around.
+    // Avoid in general, prefer the CellNameType methods.
+    public static CellName compositeDense(ByteBuffer... bbs)
+    {
+        return new CompoundDenseCellName(bbs);
+    }
+
+    public static CellName compositeSparse(ByteBuffer[] bbs, ColumnIdentifier identifier, boolean isStatic)
+    {
+        return new CompoundSparseCellName(bbs, identifier, isStatic);
+    }
+
+    public static CellName compositeSparseWithCollection(ByteBuffer[] bbs, ByteBuffer collectionElement, ColumnIdentifier identifier, boolean isStatic)
+    {
+        return new CompoundSparseCellName.WithCollection(bbs, identifier, collectionElement, isStatic);
+    }
+
+    public static String getColumnsString(CellNameType type, Iterable<Cell> columns)
+    {
+        StringBuilder builder = new StringBuilder();
+        for (Cell cell : columns)
+            builder.append(cell.getString(type)).append(",");
+        return builder.toString();
+    }
+}
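
As an illustration (not part of the patch), this is how the mapping above is typically invoked when
converting a table comparator into a CellNameType:

    AbstractType<?> comparator =
        CompositeType.getInstance(Arrays.<AbstractType<?>>asList(UTF8Type.instance, Int32Type.instance));
    CellNameType dense = CellNames.fromAbstractType(comparator, true);    // CompoundDenseCellNameType
    CellNameType sparse = CellNames.fromAbstractType(comparator, false);  // CompoundSparseCellNameType
    assert dense.isDense() && !sparse.isDense();
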
diff --git a/src/java/org/apache/cassandra/db/composites/Composite.java b/src/java/org/apache/cassandra/db/composites/Composite.java
new file mode 100644
index 0000000..b15daef
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/Composite.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A composite value.
+ *
+ * This can be thought of as a list of ByteBuffers, except that it also includes an
+ * 'end-of-component' flag that allows precise selection of composite ranges.
+ *
+ * We also distinguish between "true" composites and "simple" ones. A non-truly-composite
+ * will have size() == 1 but differs from a true composite with size() == 1 in the way
+ * it is stored. Most code shouldn't have to care about the difference.
+ */
+public interface Composite extends IMeasurableMemory
+{
+    public enum EOC
+    {
+        START(-1), NONE(-1), END(1);
+
+        // If composite p has this EOC and is a strict prefix of composite c, then this is
+        // the result of the comparison of p and c. Basically, p sorts before c unless
+        // its EOC is END.
+        public final int prefixComparisonResult;
+
+        private EOC(int prefixComparisonResult)
+        {
+            this.prefixComparisonResult = prefixComparisonResult;
+        }
+
+        public static EOC from(int eoc)
+        {
+            return eoc == 0 ? NONE : (eoc < 0 ? START : END);
+        }
+    }
+
+    public int size();
+    public boolean isEmpty();
+    public ByteBuffer get(int i);
+
+    public EOC eoc();
+    public Composite withEOC(EOC eoc);
+    public Composite start();
+    public Composite end();
+    public ColumnSlice slice();
+
+    public boolean isStatic();
+
+    public boolean isPrefixOf(CType type, Composite other);
+
+    public ByteBuffer toByteBuffer();
+
+    public int dataSize();
+    public Composite copy(CFMetaData cfm, AbstractAllocator allocator);
+}
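
As an illustration (not part of the patch), the EOC markers are what make prefix range selection
work: for a prefix p, p.start() carries EOC.START and p.end() carries EOC.END, so every composite
that has p as a strict prefix sorts between the two bounds, and slice() packages that range
directly (here "prefix" is assumed to be some Composite in scope):

    ColumnSlice slice = prefix.slice();    // covers the [prefix.start(), prefix.end()] range
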
diff --git a/src/java/org/apache/cassandra/db/composites/Composites.java b/src/java/org/apache/cassandra/db/composites/Composites.java
new file mode 100644
index 0000000..f6626e0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/Composites.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class Composites
+{
+    private Composites() {}
+
+    public static final Composite EMPTY = new EmptyComposite();
+
+    static final CBuilder EMPTY_BUILDER = new CBuilder()
+    {
+        public int remainingCount() { return 0; }
+
+        public CBuilder add(ByteBuffer value) { throw new IllegalStateException(); }
+        public CBuilder add(Object value) { throw new IllegalStateException(); }
+
+        public Composite build() { return EMPTY; }
+        public Composite buildWith(ByteBuffer value) { throw new IllegalStateException(); }
+        public Composite buildWith(List<ByteBuffer> values) { throw new IllegalStateException(); }
+    };
+
+    private static class EmptyComposite implements Composite
+    {
+        public boolean isEmpty()
+        {
+            return true;
+        }
+
+        public int size()
+        {
+            return 0;
+        }
+
+        public ByteBuffer get(int i)
+        {
+            if (i > 0)
+                throw new IndexOutOfBoundsException();
+
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        }
+
+        public EOC eoc()
+        {
+            return EOC.NONE;
+        }
+
+        public Composite start()
+        {
+            // Note that the SimpleCType/AbstractSimpleCellNameType compare methods
+            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
+            // (or more precisely on the fact that the EOC is NONE for all of those).
+            return this;
+        }
+
+        public Composite end()
+        {
+            // Note that the SimpleCType/AbstractSimpleCellNameType compare methods
+            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
+            // (or more precisely on the fact that the EOC is NONE for all of those).
+            return this;
+        }
+
+        public Composite withEOC(EOC newEoc)
+        {
+            // Note that the SimpleCType/AbstractSimpleCellNameType compare methods
+            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
+            // (or more precisely on the fact that the EOC is NONE for all of those).
+            return this;
+        }
+
+        public ColumnSlice slice()
+        {
+            return ColumnSlice.ALL_COLUMNS;
+        }
+
+        public ByteBuffer toByteBuffer()
+        {
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        }
+
+        public boolean isStatic()
+        {
+            return false;
+        }
+
+        public int dataSize()
+        {
+            return 0;
+        }
+
+        public long unsharedHeapSize()
+        {
+            return 0;
+        }
+
+        public boolean isPrefixOf(CType type, Composite c)
+        {
+            return true;
+        }
+
+        public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
+        {
+            return this;
+        }
+    }
+}
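
As an illustration (not part of the patch), EMPTY behaves as the universal prefix: it is a prefix
of every composite and its slice selects all columns:

    assert Composites.EMPTY.isEmpty();
    assert Composites.EMPTY.slice() == ColumnSlice.ALL_COLUMNS;
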
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundCType.java b/src/java/org/apache/cassandra/db/composites/CompoundCType.java
new file mode 100644
index 0000000..0458748
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundCType.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+
+/**
+ * A truly-composite CType.
+ */
+public class CompoundCType extends AbstractCType
+{
+    final List<AbstractType<?>> types;
+
+    // It's up to the caller to pass a list that is effectively immutable
+    public CompoundCType(List<AbstractType<?>> types)
+    {
+        super(isByteOrderComparable(types));
+        this.types = types;
+    }
+
+    public boolean isCompound()
+    {
+        return true;
+    }
+
+    public int size()
+    {
+        return types.size();
+    }
+
+    public AbstractType<?> subtype(int i)
+    {
+        return types.get(i);
+    }
+
+    public Composite fromByteBuffer(ByteBuffer bytes)
+    {
+        if (!bytes.hasRemaining())
+            return Composites.EMPTY;
+
+        ByteBuffer[] elements = new ByteBuffer[size()];
+        int idx = bytes.position(), i = 0;
+        byte eoc = 0;
+
+        boolean isStatic = false;
+        if (CompositeType.isStaticName(bytes))
+        {
+            isStatic = true;
+            idx += 2;
+        }
+
+        while (idx < bytes.limit())
+        {
+            checkRemaining(bytes, idx, 2);
+            int length = bytes.getShort(idx) & 0xFFFF;
+            idx += 2;
+
+            checkRemaining(bytes, idx, length + 1);
+            elements[i++] = sliceBytes(bytes, idx, length);
+            idx += length;
+            eoc = bytes.get(idx++);
+        }
+        return new CompoundComposite(elements, i, isStatic).withEOC(Composite.EOC.from(eoc));
+    }
+
+    public CBuilder builder()
+    {
+        return new CompoundCBuilder(this);
+    }
+
+    public CompoundCType setSubtype(int position, AbstractType<?> newType)
+    {
+        List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(types);
+        newTypes.set(position, newType);
+        return new CompoundCType(newTypes);
+    }
+
+    public AbstractType<?> asAbstractType()
+    {
+        return CompositeType.getInstance(types);
+    }
+
+    public static class CompoundCBuilder implements CBuilder
+    {
+        private final CType type;
+        private final ByteBuffer[] values;
+        private int size;
+        private boolean built;
+
+        public CompoundCBuilder(CType type)
+        {
+            this.type = type;
+            this.values = new ByteBuffer[type.size()];
+        }
+
+        public int remainingCount()
+        {
+            return values.length - size;
+        }
+
+        public CBuilder add(ByteBuffer value)
+        {
+            if (isDone())
+                throw new IllegalStateException();
+            values[size++] = value;
+            return this;
+        }
+
+        public CBuilder add(Object value)
+        {
+            return add(((AbstractType)type.subtype(size)).decompose(value));
+        }
+
+        private boolean isDone()
+        {
+            return remainingCount() == 0 || built;
+        }
+
+        public Composite build()
+        {
+            if (size == 0)
+                return Composites.EMPTY;
+
+            // We don't allow adding more elements to a builder that has been built, so
+            // that we don't have to copy values.
+            built = true;
+
+            // If the builder is full and we're building a dense cell name, then we can
+            // directly allocate the CellName object as it's complete.
+            if (size == values.length && type instanceof CellNameType && ((CellNameType)type).isDense())
+                return new CompoundDenseCellName(values);
+            return new CompoundComposite(values, size, false);
+        }
+
+        public Composite buildWith(ByteBuffer value)
+        {
+            ByteBuffer[] newValues = Arrays.copyOf(values, values.length);
+            newValues[size] = value;
+            // Same as above
+            if (size+1 == newValues.length && type instanceof CellNameType && ((CellNameType)type).isDense())
+                return new CompoundDenseCellName(newValues);
+
+            return new CompoundComposite(newValues, size+1, false);
+        }
+
+        public Composite buildWith(List<ByteBuffer> newValues)
+        {
+            ByteBuffer[] buffers = Arrays.copyOf(values, values.length);
+            int newSize = size;
+            for (ByteBuffer value : newValues)
+                buffers[newSize++] = value;
+
+            if (newSize == buffers.length && type instanceof CellNameType && ((CellNameType)type).isDense())
+                return new CompoundDenseCellName(buffers);
+
+            return new CompoundComposite(buffers, newSize, false);
+        }
+    }
+}
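
As an illustration (not part of the patch), the fromByteBuffer() parsing above corresponds to the
standard composite wire format, where each component is encoded as a 2-byte length, the component
bytes, and a 1-byte end-of-component flag. A round trip might look like this (illustrative
identifiers only):

    CompoundCType ct = new CompoundCType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, LongType.instance));
    Composite c = ct.make("user", 42L);
    Composite parsed = ct.fromByteBuffer(c.toByteBuffer());
    assert ct.compare(c, parsed) == 0;
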
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundComposite.java b/src/java/org/apache/cassandra/db/composites/CompoundComposite.java
new file mode 100644
index 0000000..7a21b01
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundComposite.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A "truly-composite" Composite.
+ */
+public class CompoundComposite extends AbstractComposite
+{
+    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundComposite(null, 0, false));
+
+    // We could use a List, but we'll create such objects *a lot*, and using an array+size is not
+    // all that harder, so we save the List object allocation.
+    final ByteBuffer[] elements;
+    final int size;
+    final boolean isStatic;
+
+    CompoundComposite(ByteBuffer[] elements, int size, boolean isStatic)
+    {
+        this.elements = elements;
+        this.size = size;
+        this.isStatic = isStatic;
+    }
+
+    public int size()
+    {
+        return size;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        // Note: most consumers should validate that i is within bounds. However, for backward compatibility
+        // reasons, composite dense tables can have names that don't have all the components of the clustering
+        // columns, which may end up here with i >= size(). For those calls, it's actually simpler to return null
+        // than to force the caller to special-case.
+        return i >= size() ? null : elements[i];
+    }
+
+    @Override
+    public boolean isStatic()
+    {
+        return isStatic;
+    }
+
+    protected ByteBuffer[] elementsCopy(AbstractAllocator allocator)
+    {
+        ByteBuffer[] elementsCopy = new ByteBuffer[size];
+        for (int i = 0; i < size; i++)
+            elementsCopy[i] = allocator.clone(elements[i]);
+        return elementsCopy;
+    }
+
+    public long unsharedHeapSize()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
+    }
+
+    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new CompoundComposite(elementsCopy(allocator), size, isStatic);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java
new file mode 100644
index 0000000..1f471a8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public class CompoundDenseCellName extends CompoundComposite implements CellName
+{
+
+    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundDenseCellName(new ByteBuffer[0]));
+
+    // Not meant to be used directly; you should use the CellNameType methods instead
+    CompoundDenseCellName(ByteBuffer[] elements)
+    {
+        super(elements, elements.length, false);
+    }
+
+    CompoundDenseCellName(ByteBuffer[] elements, int size)
+    {
+        super(elements, size, false);
+    }
+
+    public int clusteringSize()
+    {
+        return size;
+    }
+
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
+    {
+        return null;
+    }
+
+    public ByteBuffer collectionElement()
+    {
+        return null;
+    }
+
+    public boolean isCollectionCell()
+    {
+        return false;
+    }
+
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
+    {
+        // A dense cell implies one cell per CQL row, so no other cell will be in the same row.
+        return type.compare(this, other) == 0;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
+    }
+
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new CompoundDenseCellName(elementsCopy(allocator));
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java
new file mode 100644
index 0000000..2e409fb
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+public class CompoundDenseCellNameType extends AbstractCompoundCellNameType
+{
+    public CompoundDenseCellNameType(List<AbstractType<?>> types)
+    {
+        this(new CompoundCType(types));
+    }
+
+    private CompoundDenseCellNameType(CompoundCType type)
+    {
+        super(type, type);
+    }
+
+    public CellNameType setSubtype(int position, AbstractType<?> newType)
+    {
+        if (position != 0)
+            throw new IllegalArgumentException();
+        return new SimpleDenseCellNameType(newType);
+    }
+
+    public boolean isDense()
+    {
+        return true;
+    }
+
+    public CellName create(Composite prefix, ColumnDefinition column)
+    {
+        // We ignore the column because it's just the COMPACT_VALUE name, which is not stored in the cell name (and it can be null anyway)
+        if (prefix instanceof CellName)
+            return (CellName)prefix;
+
+        // as noted below in makeWith(), compound dense cell names don't have to include all components
+        assert prefix instanceof CompoundComposite;
+        CompoundComposite lc = (CompoundComposite)prefix;
+        return new CompoundDenseCellName(lc.elements, lc.size);
+    }
+
+    protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+    {
+        assert !isStatic;
+        // A composite dense table cell name doesn't have to have all its components set to qualify as a
+        // proper CellName (mostly for backward compatibility reasons), so always return a CellName
+        CompoundDenseCellName c = new CompoundDenseCellName(components, size);
+        return eoc != Composite.EOC.NONE ? c.withEOC(eoc) : c;
+    }
+
+    protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+    {
+        return makeWith(Arrays.copyOfRange(components, 0, size), size, eoc, isStatic);
+    }
+
+    public void addCQL3Column(ColumnIdentifier id) {}
+    public void removeCQL3Column(ColumnIdentifier id) {}
+
+    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
+    {
+        return makeDenseCQL3RowBuilder(now);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java
new file mode 100644
index 0000000..03af6d0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public class CompoundSparseCellName extends CompoundComposite implements CellName
+{
+    private static final ByteBuffer[] EMPTY_PREFIX = new ByteBuffer[0];
+
+    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundSparseCellName(null, false));
+
+    protected final ColumnIdentifier columnName;
+
+    // Not meant to be used directly; you should use the CellNameType methods instead
+    CompoundSparseCellName(ColumnIdentifier columnName, boolean isStatic)
+    {
+        this(EMPTY_PREFIX, columnName, isStatic);
+    }
+
+    CompoundSparseCellName(ByteBuffer[] elements, ColumnIdentifier columnName, boolean isStatic)
+    {
+        this(elements, elements.length, columnName, isStatic);
+    }
+
+    CompoundSparseCellName(ByteBuffer[] elements, int size, ColumnIdentifier columnName, boolean isStatic)
+    {
+        super(elements, size, isStatic);
+        this.columnName = columnName;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
+    }
+
+    public int size()
+    {
+        return size + 1;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        return i == size ? columnName.bytes : elements[i];
+    }
+
+    public int clusteringSize()
+    {
+        return size;
+    }
+
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
+    {
+        return columnName;
+    }
+
+    public ByteBuffer collectionElement()
+    {
+        return null;
+    }
+
+    public boolean isCollectionCell()
+    {
+        return false;
+    }
+
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
+    {
+        if (clusteringSize() != other.clusteringSize() || other.isStatic() != isStatic())
+            return false;
+
+        for (int i = 0; i < clusteringSize(); i++)
+        {
+            if (type.subtype(i).compare(elements[i], other.get(i)) != 0)
+                return false;
+        }
+        return true;
+    }
+
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        if (elements.length == 0)
+            return this;
+
+        // We don't copy columnName because it's interned in SparseCellNameType
+        return new CompoundSparseCellName(elementsCopy(allocator), columnName, isStatic());
+    }
+
+    public static class WithCollection extends CompoundSparseCellName
+    {
+        private static final long HEAP_SIZE = ObjectSizes.measure(new WithCollection(null, ByteBufferUtil.EMPTY_BYTE_BUFFER, false));
+
+        private final ByteBuffer collectionElement;
+
+        WithCollection(ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
+        {
+            this(EMPTY_PREFIX, columnName, collectionElement, isStatic);
+        }
+
+        WithCollection(ByteBuffer[] elements, ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
+        {
+            this(elements, elements.length, columnName, collectionElement, isStatic);
+        }
+
+        WithCollection(ByteBuffer[] elements, int size, ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
+        {
+            super(elements, size, columnName, isStatic);
+            this.collectionElement = collectionElement;
+        }
+
+        public int size()
+        {
+            return size + 2;
+        }
+
+        public ByteBuffer get(int i)
+        {
+            return i == size + 1 ? collectionElement : super.get(i);
+        }
+
+        @Override
+        public ByteBuffer collectionElement()
+        {
+            return collectionElement;
+        }
+
+        @Override
+        public boolean isCollectionCell()
+        {
+            return true;
+        }
+
+        @Override
+        public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+        {
+            // We don't copy columnName because it's interned in SparseCellNameType
+            return new CompoundSparseCellName.WithCollection(elements.length == 0 ? elements : elementsCopy(allocator), size, columnName, allocator.clone(collectionElement), isStatic());
+        }
+
+        @Override
+        public long unsharedHeapSize()
+        {
+            return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements)
+                   + ObjectSizes.sizeOnHeapExcludingData(collectionElement);
+        }
+
+        @Override
+        public long unsharedHeapSizeExcludingData()
+        {
+            return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements)
+                   + ObjectSizes.sizeOnHeapExcludingData(collectionElement);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java
new file mode 100644
index 0000000..27b3271
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java
@@ -0,0 +1,333 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.ColumnToCollectionType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+public class CompoundSparseCellNameType extends AbstractCompoundCellNameType
+{
+    public static final ColumnIdentifier rowMarkerId = new ColumnIdentifier(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance);
+    private static final CellName rowMarkerNoPrefix = new CompoundSparseCellName(rowMarkerId, false);
+
+    // For CQL3 columns, this is always UTF8Type. However, for compatibility with super columns, we need to allow it to be non-UTF8.
+    private final AbstractType<?> columnNameType;
+    protected final Map<ByteBuffer, ColumnIdentifier> internedIds;
+
+    private final Composite staticPrefix;
+
+    public CompoundSparseCellNameType(List<AbstractType<?>> types)
+    {
+        this(types, UTF8Type.instance);
+    }
+
+    public CompoundSparseCellNameType(List<AbstractType<?>> types, AbstractType<?> columnNameType)
+    {
+        this(new CompoundCType(types), columnNameType);
+    }
+
+    private CompoundSparseCellNameType(CompoundCType clusteringType, AbstractType<?> columnNameType)
+    {
+        this(clusteringType, columnNameType, makeCType(clusteringType, columnNameType, null), new HashMap<ByteBuffer, ColumnIdentifier>());
+    }
+
+    private CompoundSparseCellNameType(CompoundCType clusteringType, AbstractType<?> columnNameType, CompoundCType fullType, Map<ByteBuffer, ColumnIdentifier> internedIds)
+    {
+        super(clusteringType, fullType);
+        this.columnNameType = columnNameType;
+        this.internedIds = internedIds;
+        this.staticPrefix = makeStaticPrefix(clusteringType.size());
+    }
+
+    private static Composite makeStaticPrefix(int size)
+    {
+        ByteBuffer[] elements = new ByteBuffer[size];
+        for (int i = 0; i < size; i++)
+            elements[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+        return new CompoundComposite(elements, size, true)
+        {
+            @Override
+            public boolean isStatic()
+            {
+                return true;
+            }
+
+            @Override
+            public long unsharedHeapSize()
+            {
+                // We'll share this for a given type.
+                return 0;
+            }
+
+            @Override
+            public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
+            {
+                return this;
+            }
+        };
+    }
+
+    protected static CompoundCType makeCType(CompoundCType clusteringType, AbstractType<?> columnNameType, ColumnToCollectionType collectionType)
+    {
+        List<AbstractType<?>> allSubtypes = new ArrayList<AbstractType<?>>(clusteringType.size() + (collectionType == null ? 1 : 2));
+        for (int i = 0; i < clusteringType.size(); i++)
+            allSubtypes.add(clusteringType.subtype(i));
+        allSubtypes.add(columnNameType);
+        if (collectionType != null)
+            allSubtypes.add(collectionType);
+        return new CompoundCType(allSubtypes);
+    }
+
+    public CellNameType setSubtype(int position, AbstractType<?> newType)
+    {
+        if (position < clusteringSize)
+            return new CompoundSparseCellNameType(clusteringType.setSubtype(position, newType), columnNameType, fullType.setSubtype(position, newType), internedIds);
+
+        if (position == clusteringSize)
+            throw new IllegalArgumentException();
+
+        throw new IndexOutOfBoundsException();
+    }
+
+    @Override
+    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
+    {
+        return new WithCollection(clusteringType, ColumnToCollectionType.getInstance(Collections.singletonMap(columnName.bytes, newCollection)), internedIds);
+    }
+
+    public boolean isDense()
+    {
+        return false;
+    }
+
+    public boolean supportCollections()
+    {
+        return true;
+    }
+
+    public Composite staticPrefix()
+    {
+        return staticPrefix;
+    }
+
+    public CellName create(Composite prefix, ColumnDefinition column)
+    {
+        return create(prefix, column.name, column.isStatic());
+    }
+
+    private CellName create(Composite prefix, ColumnIdentifier columnName, boolean isStatic)
+    {
+        if (isStatic)
+            prefix = staticPrefix();
+
+        assert prefix.size() == clusteringSize;
+
+        if (prefix.isEmpty())
+            return new CompoundSparseCellName(columnName, isStatic);
+
+        assert prefix instanceof CompoundComposite;
+        CompoundComposite lc = (CompoundComposite)prefix;
+        return new CompoundSparseCellName(lc.elements, clusteringSize, columnName, isStatic);
+    }
+
+    public CellName rowMarker(Composite prefix)
+    {
+        assert !prefix.isStatic(); // static columns don't really create rows, they shouldn't have a row marker
+        if (prefix.isEmpty())
+            return rowMarkerNoPrefix;
+
+        return create(prefix, rowMarkerId, false);
+    }
+
+    protected ColumnIdentifier idFor(ByteBuffer bb)
+    {
+        ColumnIdentifier id = internedIds.get(bb);
+        return id == null ? new ColumnIdentifier(bb, columnNameType) : id;
+    }
+
+    protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+    {
+        if (size < clusteringSize + 1 || eoc != Composite.EOC.NONE)
+            return new CompoundComposite(components, size, isStatic).withEOC(eoc);
+
+        return new CompoundSparseCellName(components, clusteringSize, idFor(components[clusteringSize]), isStatic);
+    }
+
+    protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+    {
+        if (size < clusteringSize + 1 || eoc != Composite.EOC.NONE)
+            return new CompoundComposite(Arrays.copyOfRange(components, 0, size), size, isStatic).withEOC(eoc);
+
+        ByteBuffer[] clusteringColumns = Arrays.copyOfRange(components, 0, clusteringSize);
+        return new CompoundSparseCellName(clusteringColumns, idFor(components[clusteringSize]), isStatic);
+    }
+
+    public void addCQL3Column(ColumnIdentifier id)
+    {
+        internedIds.put(id.bytes, id);
+    }
+
+    public void removeCQL3Column(ColumnIdentifier id)
+    {
+        internedIds.remove(id.bytes);
+    }
+
+    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
+    {
+        return makeSparseCQL3RowBuilder(metadata, this, now);
+    }
+
+    public static class WithCollection extends CompoundSparseCellNameType
+    {
+        private final ColumnToCollectionType collectionType;
+
+        public WithCollection(List<AbstractType<?>> types, ColumnToCollectionType collectionType)
+        {
+            this(new CompoundCType(types), collectionType);
+        }
+
+        WithCollection(CompoundCType clusteringType, ColumnToCollectionType collectionType)
+        {
+            this(clusteringType, collectionType, new HashMap<ByteBuffer, ColumnIdentifier>());
+        }
+
+        private WithCollection(CompoundCType clusteringType, ColumnToCollectionType collectionType, Map<ByteBuffer, ColumnIdentifier> internedIds)
+        {
+            this(clusteringType, makeCType(clusteringType, UTF8Type.instance, collectionType), collectionType, internedIds);
+        }
+
+        private WithCollection(CompoundCType clusteringType, CompoundCType fullCType, ColumnToCollectionType collectionType, Map<ByteBuffer, ColumnIdentifier> internedIds)
+        {
+            super(clusteringType, UTF8Type.instance, fullCType, internedIds);
+            this.collectionType = collectionType;
+        }
+
+        @Override
+        public CellNameType setSubtype(int position, AbstractType<?> newType)
+        {
+            if (position < clusteringSize)
+                return new WithCollection(clusteringType.setSubtype(position, newType), collectionType, internedIds);
+
+            throw position >= fullType.size() ? new IndexOutOfBoundsException() : new IllegalArgumentException();
+        }
+
+        @Override
+        public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
+        {
+            Map<ByteBuffer, CollectionType> newMap = new HashMap<>(collectionType.defined);
+            newMap.put(columnName.bytes, newCollection);
+            return new WithCollection(clusteringType, ColumnToCollectionType.getInstance(newMap), internedIds);
+        }
+
+        @Override
+        public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement)
+        {
+            if (column.isStatic())
+                prefix = staticPrefix();
+
+            assert prefix.size() == clusteringSize;
+
+            if (prefix.isEmpty())
+                return new CompoundSparseCellName.WithCollection(column.name, collectionElement, column.isStatic());
+
+            assert prefix instanceof CompoundComposite;
+            CompoundComposite lc = (CompoundComposite)prefix;
+            return new CompoundSparseCellName.WithCollection(lc.elements, clusteringSize, column.name, collectionElement, column.isStatic());
+        }
+
+        @Override
+        public int compare(Composite c1, Composite c2)
+        {
+            if (c1.isStatic() != c2.isStatic())
+            {
+                // Static sorts before non-static no matter what, except for empty, which
+                // always sorts first
+                if (c1.isEmpty())
+                    return c2.isEmpty() ? 0 : -1;
+                if (c2.isEmpty())
+                    return 1;
+                return c1.isStatic() ? -1 : 1;
+            }
+
+            int s1 = c1.size();
+            int s2 = c2.size();
+            int minSize = Math.min(s1, s2);
+
+            ByteBuffer previous = null;
+            for (int i = 0; i < minSize; i++)
+            {
+                AbstractType<?> comparator = subtype(i);
+                ByteBuffer value1 = c1.get(i);
+                ByteBuffer value2 = c2.get(i);
+
+                int cmp = comparator.compareCollectionMembers(value1, value2, previous);
+                if (cmp != 0)
+                    return cmp;
+
+                previous = value1;
+            }
+
+            if (s1 == s2)
+                return c1.eoc().compareTo(c2.eoc());
+            return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
+        }
+
+        @Override
+        public boolean hasCollections()
+        {
+            return true;
+        }
+
+        @Override
+        public ColumnToCollectionType collectionType()
+        {
+            return collectionType;
+        }
+
+        @Override
+        protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+        {
+            if (size < fullSize)
+                return super.makeWith(components, size, eoc, isStatic);
+
+            return new CompoundSparseCellName.WithCollection(components, clusteringSize, idFor(components[clusteringSize]), components[fullSize - 1], isStatic);
+        }
+
+        protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
+        {
+            if (size < fullSize)
+                return super.copyAndMakeWith(components, size, eoc, isStatic);
+
+            ByteBuffer[] clusteringColumns = Arrays.copyOfRange(components, 0, clusteringSize);
+            return new CompoundSparseCellName.WithCollection(clusteringColumns, idFor(components[clusteringSize]), components[clusteringSize + 1], isStatic);
+        }
+    }
+}
+
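A note on the ordering implemented by WithCollection.compare() above: composites are ordered so that the empty composite sorts first, the static prefix sorts next, and regular clustering prefixes follow. The toy sketch below reproduces that rule; ToyComposite and its String components are illustrative stand-ins (not Cassandra types), and the final size tie-break simplifies the real EOC handling.

import java.util.ArrayList;
import java.util.List;

// Toy stand-ins (not Cassandra types) for the ordering rule in WithCollection.compare():
// the empty composite sorts first, then the static prefix, then regular clustering prefixes.
final class ToyComposite
{
    final List<String> components;   // stand-in for the ByteBuffer elements
    final boolean isStatic;

    ToyComposite(List<String> components, boolean isStatic)
    {
        this.components = components;
        this.isStatic = isStatic;
    }

    boolean isEmpty() { return components.isEmpty(); }
}

public class StaticOrderingSketch
{
    static int compare(ToyComposite c1, ToyComposite c2)
    {
        if (c1.isStatic != c2.isStatic)
        {
            if (c1.isEmpty()) return c2.isEmpty() ? 0 : -1;
            if (c2.isEmpty()) return 1;
            return c1.isStatic ? -1 : 1;   // static sorts before non-static
        }
        int minSize = Math.min(c1.components.size(), c2.components.size());
        for (int i = 0; i < minSize; i++)
        {
            int cmp = c1.components.get(i).compareTo(c2.components.get(i));
            if (cmp != 0) return cmp;
        }
        // Simplification: the real method resolves this case through the composites' EOC.
        return Integer.compare(c1.components.size(), c2.components.size());
    }

    public static void main(String[] args)
    {
        ToyComposite empty = new ToyComposite(List.of(), false);
        ToyComposite stat  = new ToyComposite(List.of("s"), true);
        ToyComposite row   = new ToyComposite(List.of("a"), false);

        List<ToyComposite> names = new ArrayList<>(List.of(row, stat, empty));
        names.sort(StaticOrderingSketch::compare);
        // Resulting order: empty, static prefix, then the clustering prefix "a"
        System.out.println(names.indexOf(empty) + " " + names.indexOf(stat) + " " + names.indexOf(row));
    }
}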
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleCType.java b/src/java/org/apache/cassandra/db/composites/SimpleCType.java
new file mode 100644
index 0000000..c824179
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleCType.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * A "simple" (not-truly-composite) CType.
+ */
+public class SimpleCType extends AbstractCType
+{
+    protected final AbstractType<?> type;
+
+    public SimpleCType(AbstractType<?> type)
+    {
+        super(type.isByteOrderComparable());
+        this.type = type;
+    }
+
+    public boolean isCompound()
+    {
+        return false;
+    }
+
+    public int size()
+    {
+        return 1;
+    }
+
+    public int compare(Composite c1, Composite c2)
+    {
+        if (isByteOrderComparable)
+            return AbstractSimpleCellNameType.compareUnsigned(c1, c2);
+
+        assert !(c1.isEmpty() | c2.isEmpty());
+        // This method assumes that simple composites never have an EOC != NONE. This assumption
+        // relies in particular on the fact that Composites.EMPTY never has a non-NONE EOC. If
+        // this ever changes, we'll need to update this method.
+        return type.compare(c1.get(0), c2.get(0));
+    }
+
+    public AbstractType<?> subtype(int i)
+    {
+        if (i != 0)
+            throw new IndexOutOfBoundsException();
+        return type;
+    }
+
+    public Composite fromByteBuffer(ByteBuffer bytes)
+    {
+        return !bytes.hasRemaining() ? Composites.EMPTY : new SimpleComposite(bytes);
+    }
+
+    public CBuilder builder()
+    {
+        return new SimpleCBuilder(this);
+    }
+
+    public CType setSubtype(int position, AbstractType<?> newType)
+    {
+        if (position != 0)
+            throw new IndexOutOfBoundsException();
+        return new SimpleCType(newType);
+    }
+
+    // Use sparingly, it defeats the purpose
+    public AbstractType<?> asAbstractType()
+    {
+        return type;
+    }
+
+    public static class SimpleCBuilder implements CBuilder
+    {
+        private final CType type;
+        private ByteBuffer value;
+
+        public SimpleCBuilder(CType type)
+        {
+            this.type = type;
+        }
+
+        public int remainingCount()
+        {
+            return value == null ? 1 : 0;
+        }
+
+        public CBuilder add(ByteBuffer value)
+        {
+            if (this.value != null)
+                throw new IllegalStateException();
+            this.value = value;
+            return this;
+        }
+
+        public CBuilder add(Object value)
+        {
+            return add(((AbstractType)type.subtype(0)).decompose(value));
+        }
+
+        public Composite build()
+        {
+            if (value == null || !value.hasRemaining())
+                return Composites.EMPTY;
+
+            // If we're building a dense cell name, then we can directly allocate the
+            // CellName object as it's complete.
+            if (type instanceof CellNameType && ((CellNameType)type).isDense())
+                return new SimpleDenseCellName(value);
+
+            return new SimpleComposite(value);
+        }
+
+        public Composite buildWith(ByteBuffer value)
+        {
+            if (this.value != null)
+                throw new IllegalStateException();
+
+            if (value == null || !value.hasRemaining())
+                return Composites.EMPTY;
+
+            // If we're building a dense cell name, then we can directly allocate the
+            // CellName object as it's complete.
+            if (type instanceof CellNameType && ((CellNameType)type).isDense())
+                return new SimpleDenseCellName(value);
+
+            return new SimpleComposite(value);
+        }
+
+        public Composite buildWith(List<ByteBuffer> values)
+        {
+            if (values.size() > 1)
+                throw new IllegalStateException();
+            if (values.isEmpty())
+                return Composites.EMPTY;
+            return buildWith(values.get(0));
+        }
+    }
+}
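As a usage sketch for the builder above (the main() scaffolding is illustrative, not part of this patch; UTF8Type and ByteBufferUtil are existing Cassandra classes): SimpleCBuilder accepts exactly one component, and build() returns Composites.EMPTY for a missing or empty value, a SimpleDenseCellName when the type is a dense CellNameType, or a plain SimpleComposite otherwise.

import java.nio.ByteBuffer;

import org.apache.cassandra.db.composites.Composite;
import org.apache.cassandra.db.composites.SimpleCType;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.utils.ByteBufferUtil;

public class SimpleCTypeUsageSketch
{
    public static void main(String[] args)
    {
        SimpleCType ctype = new SimpleCType(UTF8Type.instance);

        ByteBuffer value = ByteBufferUtil.bytes("clustering-value");
        Composite c = ctype.builder().add(value).build();   // exactly one component is allowed

        assert c.size() == 1;
        // Adding a second component to the same builder throws IllegalStateException:
        // remainingCount() is 1 before the first add() and 0 afterwards.
    }
}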
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleComposite.java b/src/java/org/apache/cassandra/db/composites/SimpleComposite.java
new file mode 100644
index 0000000..3c80d9f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleComposite.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+/**
+ * A "simple" (not-truly-composite) Composite.
+ */
+public class SimpleComposite extends AbstractComposite
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleComposite(ByteBuffer.allocate(1)));
+
+    protected final ByteBuffer element;
+
+    SimpleComposite(ByteBuffer element)
+    {
+        // We have to be careful with empty ByteBuffers as we shouldn't store them.
+        // To avoid errors (and so isEmpty() works as intended), we don't allow a SimpleComposite with
+        // an empty element (but it's ok for CompoundComposite, where it's a row marker).
+        assert element.hasRemaining();
+        this.element = element;
+    }
+
+    public int size()
+    {
+        return 1;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        if (i != 0)
+            throw new IndexOutOfBoundsException();
+
+        return element;
+    }
+
+    @Override
+    public Composite withEOC(EOC newEoc)
+    {
+        // EOC makes no sense for non-truly-composite Composites.
+        return this;
+    }
+
+    @Override
+    public ByteBuffer toByteBuffer()
+    {
+        return element;
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(element);
+    }
+
+    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new SimpleComposite(allocator.clone(element));
+    }
+}
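A small sketch of the invariant the constructor comment above describes (illustrative only, not patch code; run with -ea if you want the asserts checked): an empty ByteBuffer is never wrapped in a SimpleComposite, because SimpleCType.fromByteBuffer() hands back Composites.EMPTY instead, which keeps isEmpty() reliable.

import java.nio.ByteBuffer;

import org.apache.cassandra.db.composites.Composite;
import org.apache.cassandra.db.composites.SimpleCType;
import org.apache.cassandra.db.marshal.UTF8Type;

public class EmptyCompositeSketch
{
    public static void main(String[] args)
    {
        SimpleCType ctype = new SimpleCType(UTF8Type.instance);

        Composite empty = ctype.fromByteBuffer(ByteBuffer.allocate(0));
        assert empty.isEmpty();        // Composites.EMPTY, never a SimpleComposite

        Composite nonEmpty = ctype.fromByteBuffer(ByteBuffer.wrap(new byte[]{ 1 }));
        assert !nonEmpty.isEmpty();    // a one-element SimpleComposite
    }
}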
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java
new file mode 100644
index 0000000..2ca7d23
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public class SimpleDenseCellName extends SimpleComposite implements CellName
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleDenseCellName(ByteBuffer.allocate(1)));
+
+    // Not meant to be used directly; use the CellNameType methods instead
+    SimpleDenseCellName(ByteBuffer element)
+    {
+        super(element);
+    }
+
+    public int clusteringSize()
+    {
+        return 1;
+    }
+
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
+    {
+        return null;
+    }
+
+    public ByteBuffer collectionElement()
+    {
+        return null;
+    }
+
+    public boolean isCollectionCell()
+    {
+        return false;
+    }
+
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
+    {
+        // A dense cell implies one cell per CQL row, so no other cell will be in the same row.
+        return type.compare(this, other) == 0;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(element);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapExcludingData(element);
+    }
+
+    // If cell names were sharing some prefix components, this will break that sharing,
+    // so we might want to try to do better.
+    @Override
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new SimpleDenseCellName(allocator.clone(element));
+    }
+
+}
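The EMPTY_SIZE constant above caches the fixed object-shell size once (via ObjectSizes.measure) so that per-instance accounting only adds the payload; the difference between unsharedHeapSize() and unsharedHeapSizeExcludingData() appears to be whether the element's bytes are counted. A toy illustration of that pattern, with the semantics stated here as an assumption rather than taken from the patch:

// Toy illustration (assumed semantics, not Cassandra code) of the EMPTY_SIZE pattern:
// measure the fixed shell once, then add only the per-instance payload where appropriate.
final class ToyMeasuredCell
{
    // Stand-in literal; in the patch this value comes from ObjectSizes.measure(...)
    private static final long SHELL_SIZE = 48;

    private final byte[] payload;

    ToyMeasuredCell(byte[] payload) { this.payload = payload; }

    long unsharedHeapSize()              { return SHELL_SIZE + payload.length; }
    long unsharedHeapSizeExcludingData() { return SHELL_SIZE; }
}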
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java
new file mode 100644
index 0000000..3db4bc4
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+public class SimpleDenseCellNameType extends AbstractSimpleCellNameType
+{
+    public SimpleDenseCellNameType(AbstractType<?> type)
+    {
+        super(type);
+    }
+
+    public int clusteringPrefixSize()
+    {
+        return 1;
+    }
+
+    public CBuilder prefixBuilder()
+    {
+        // For simple dense, the whole cell name is the clustering prefix
+        return builder();
+    }
+
+    public CellNameType setSubtype(int position, AbstractType<?> newType)
+    {
+        if (position != 0)
+            throw new IllegalArgumentException();
+        return new SimpleDenseCellNameType(newType);
+    }
+
+    public boolean isDense()
+    {
+        return true;
+    }
+
+    public CellName create(Composite prefix, ColumnDefinition column)
+    {
+        assert prefix.size() == 1;
+        // We ignore the column because it's just the COMPACT_VALUE name, which is not stored in the cell name
+        return new SimpleDenseCellName(prefix.get(0));
+    }
+
+    @Override
+    public Composite fromByteBuffer(ByteBuffer bb)
+    {
+        return !bb.hasRemaining()
+             ? Composites.EMPTY
+             : new SimpleDenseCellName(bb);
+    }
+
+    public void addCQL3Column(ColumnIdentifier id) {}
+    public void removeCQL3Column(ColumnIdentifier id) {}
+
+    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
+    {
+        return makeDenseCQL3RowBuilder(now);
+    }
+}
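To make the dense/sparse distinction concrete (toy types below, not Cassandra's; the CREATE TABLE examples are illustrative): in a simple dense layout the stored cell name is the clustering value alone and the single compact value column's name is dropped, whereas in a simple sparse layout there are no clustering columns and the cell name is just the CQL column name.

// Toy illustration of the naming difference that isDense() encodes. Not Cassandra code.
final class ToyDenseCellName
{
    // e.g. CREATE TABLE t (k int, c text, v text, PRIMARY KEY (k, c)) WITH COMPACT STORAGE:
    // each cell is named by the clustering value alone; "v" is never stored per cell.
    final String clusteringValue;

    ToyDenseCellName(String clusteringValue) { this.clusteringValue = clusteringValue; }
}

final class ToySparseCellName
{
    // e.g. CREATE TABLE t (k int PRIMARY KEY, a text, b text):
    // no clustering columns, so each cell is named by its CQL column name ("a" or "b").
    final String cql3ColumnName;

    ToySparseCellName(String cql3ColumnName) { this.cql3ColumnName = cql3ColumnName; }
}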
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java
new file mode 100644
index 0000000..c6351f1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public class SimpleSparseCellName extends AbstractComposite implements CellName
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleSparseCellName(null));
+
+    private final ColumnIdentifier columnName;
+
+    // Not meant to be used directly; use the CellNameType methods instead
+    SimpleSparseCellName(ColumnIdentifier columnName)
+    {
+        this.columnName = columnName;
+    }
+
+    public int size()
+    {
+        return 1;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        if (i != 0)
+            throw new IndexOutOfBoundsException();
+
+        return columnName.bytes;
+    }
+
+    @Override
+    public Composite withEOC(EOC newEoc)
+    {
+        // EOC makes no sense for non-truly-composite Composites.
+        return this;
+    }
+
+    @Override
+    public ByteBuffer toByteBuffer()
+    {
+        return columnName.bytes;
+    }
+
+    public int clusteringSize()
+    {
+        return 0;
+    }
+
+    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
+    {
+        return columnName;
+    }
+
+    public ByteBuffer collectionElement()
+    {
+        return null;
+    }
+
+    public boolean isCollectionCell()
+    {
+        return false;
+    }
+
+    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
+    {
+        return true;
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE + columnName.unsharedHeapSizeExcludingData();
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + columnName.unsharedHeapSize();
+    }
+
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        return new SimpleSparseCellName(columnName.clone(allocator));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java
new file mode 100644
index 0000000..5ce0deb
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQL3Row;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+public class SimpleSparseCellNameType extends AbstractSimpleCellNameType
+{
+    // Simple sparse means a static thrift CF or a non-clustered CQL3 table. Cell names will mainly
+    // be those that have been declared, so we can intern the whole CellName instances.
+    private final Map<ByteBuffer, CellName> internedNames;
+
+    public SimpleSparseCellNameType(AbstractType<?> type)
+    {
+        this(type, new HashMap<ByteBuffer, CellName>());
+    }
+
+    private SimpleSparseCellNameType(AbstractType<?> type, Map<ByteBuffer, CellName> internedNames)
+    {
+        super(type);
+        this.internedNames = internedNames;
+    }
+
+    public int clusteringPrefixSize()
+    {
+        return 0;
+    }
+
+    public CellNameType setSubtype(int position, AbstractType<?> newType)
+    {
+        if (position != 0)
+            throw new IllegalArgumentException();
+        return new SimpleSparseCellNameType(newType, internedNames);
+    }
+
+    public CBuilder prefixBuilder()
+    {
+        return Composites.EMPTY_BUILDER;
+    }
+
+    public boolean isDense()
+    {
+        return false;
+    }
+
+    public CellName create(Composite prefix, ColumnDefinition column)
+    {
+        assert prefix.isEmpty();
+        CellName cn = internedNames.get(column.name.bytes);
+        return cn == null ? new SimpleSparseCellName(column.name) : cn;
+    }
+
+    @Override
+    public Composite fromByteBuffer(ByteBuffer bb)
+    {
+        if (!bb.hasRemaining())
+            return Composites.EMPTY;
+
+        CellName cn = internedNames.get(bb);
+        return cn == null ? new SimpleSparseCellName(new ColumnIdentifier(bb, type)) : cn;
+    }
+
+    public void addCQL3Column(ColumnIdentifier id)
+    {
+        internedNames.put(id.bytes, new SimpleSparseInternedCellName(id));
+    }
+
+    public void removeCQL3Column(ColumnIdentifier id)
+    {
+        internedNames.remove(id.bytes);
+    }
+
+    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
+    {
+        return makeSparseCQL3RowBuilder(metadata, this, now);
+    }
+}
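A usage sketch for the interning above (the main() scaffolding is illustrative; ColumnIdentifier and UTF8Type are existing Cassandra classes, and the ColumnIdentifier(String, boolean) constructor is assumed here): once a column is registered through addCQL3Column(), lookups by name resolve to the single interned CellName instance instead of allocating a new SimpleSparseCellName per cell, which is also why SimpleSparseInternedCellName below can report zero unshared heap size.

import org.apache.cassandra.cql3.ColumnIdentifier;
import org.apache.cassandra.db.composites.Composite;
import org.apache.cassandra.db.composites.SimpleSparseCellNameType;
import org.apache.cassandra.db.marshal.UTF8Type;

public class InterningSketch
{
    public static void main(String[] args)
    {
        SimpleSparseCellNameType type = new SimpleSparseCellNameType(UTF8Type.instance);

        ColumnIdentifier v = new ColumnIdentifier("v", true);
        type.addCQL3Column(v);                         // interns a SimpleSparseInternedCellName

        Composite first = type.fromByteBuffer(v.bytes.duplicate());
        Composite second = type.fromByteBuffer(v.bytes.duplicate());

        assert first == second;                        // same interned instance, no per-cell allocation
    }
}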
diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java
new file mode 100644
index 0000000..c613720
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+public class SimpleSparseInternedCellName extends SimpleSparseCellName
+{
+
+    // Not meant to be used directly; use the CellNameType methods instead
+    SimpleSparseInternedCellName(ColumnIdentifier columnName)
+    {
+        super(columnName);
+    }
+
+    @Override
+    public long unsharedHeapSizeExcludingData()
+    {
+        return 0;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return 0;
+    }
+
+    @Override
+    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
+    {
+        // We're interning these instances in SimpleSparseCellNameType, so we don't need to copy.
+        return this;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/context/CounterContext.java b/src/java/org/apache/cassandra/db/context/CounterContext.java
index 48d2bd0..455ffc7 100644
--- a/src/java/org/apache/cassandra/db/context/CounterContext.java
+++ b/src/java/org/apache/cassandra/db/context/CounterContext.java
@@ -26,6 +26,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.db.ClockAndCount;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.serializers.MarshalException;
@@ -72,7 +73,7 @@
  * rules work this way, see CASSANDRA-1938 - specifically the 1938_discussion
  * attachment (doesn't cover global shards, see CASSANDRA-4775 for that).
  */
-public class CounterContext implements IContext
+public class CounterContext
 {
     private static final int HEADER_SIZE_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
     private static final int HEADER_ELT_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
@@ -82,6 +83,11 @@
 
     private static final Logger logger = LoggerFactory.getLogger(CounterContext.class);
 
+    public static enum Relationship
+    {
+        EQUAL, GREATER_THAN, LESS_THAN, DISJOINT
+    }
+
     // lazy-load singleton
     private static class LazyHolder
     {
@@ -94,21 +100,33 @@
     }
 
     /**
-     * Creates a counter context with a single local shard.
+     * Creates a counter context with a single global, 2.1+ shard (the result of an increment).
      */
-    public ByteBuffer createLocal(long count, Allocator allocator)
+    public ByteBuffer createGlobal(CounterId id, long clock, long count)
     {
-        ContextState state = ContextState.allocate(0, 1, 0, allocator);
+        ContextState state = ContextState.allocate(1, 0, 0);
+        state.writeGlobal(id, clock, count);
+        return state.context;
+    }
+
+    /**
+     * Creates a counter context with a single local shard.
+     * For use by tests of compatibility with pre-2.1 counters only.
+     */
+    public ByteBuffer createLocal(long count)
+    {
+        ContextState state = ContextState.allocate(0, 1, 0);
         state.writeLocal(CounterId.getLocalId(), 1L, count);
         return state.context;
     }
 
     /**
      * Creates a counter context with a single remote shard.
+     * For use by tests of compatibility with pre-2.1 counters only.
      */
-    public ByteBuffer createRemote(CounterId id, long clock, long count, Allocator allocator)
+    public ByteBuffer createRemote(CounterId id, long clock, long count)
     {
-        ContextState state = ContextState.allocate(0, 0, 1, allocator);
+        ContextState state = ContextState.allocate(0, 0, 1);
         state.writeRemote(id, clock, count);
         return state.context;
     }
@@ -135,11 +153,11 @@
      *
      * @param left counter context.
      * @param right counter context.
-     * @return the ContextRelationship between the contexts.
+     * @return the Relationship between the contexts.
      */
-    public ContextRelationship diff(ByteBuffer left, ByteBuffer right)
+    public Relationship diff(ByteBuffer left, ByteBuffer right)
     {
-        ContextRelationship relationship = ContextRelationship.EQUAL;
+        Relationship relationship = Relationship.EQUAL;
         ContextState leftState = ContextState.wrap(left);
         ContextState rightState = ContextState.wrap(right);
 
@@ -165,45 +183,25 @@
                     {
                         // Inconsistent shard (see the corresponding code in merge()). We return DISJOINT in this
                         // case so that it will be treated as a difference, allowing read-repair to work.
-                        return ContextRelationship.DISJOINT;
-                    }
-                    else
-                    {
-                        continue;
+                        return Relationship.DISJOINT;
                     }
                 }
                 else if ((leftClock >= 0 && rightClock > 0 && leftClock > rightClock)
                       || (leftClock < 0 && (rightClock > 0 || leftClock < rightClock)))
                 {
-                    if (relationship == ContextRelationship.EQUAL)
-                    {
-                        relationship = ContextRelationship.GREATER_THAN;
-                    }
-                    else if (relationship == ContextRelationship.GREATER_THAN)
-                    {
-                        continue;
-                    }
-                    else
-                    {
-                        // relationship == ContextRelationship.LESS_THAN
-                        return ContextRelationship.DISJOINT;
-                    }
+                    if (relationship == Relationship.EQUAL)
+                        relationship = Relationship.GREATER_THAN;
+                    else if (relationship == Relationship.LESS_THAN)
+                        return Relationship.DISJOINT;
+                    // relationship == Relationship.GREATER_THAN
                 }
                 else
                 {
-                    if (relationship == ContextRelationship.EQUAL)
-                    {
-                        relationship = ContextRelationship.LESS_THAN;
-                    }
-                    else if (relationship == ContextRelationship.GREATER_THAN)
-                    {
-                        return ContextRelationship.DISJOINT;
-                    }
-                    else
-                    {
-                        // relationship == ContextRelationship.LESS_THAN
-                        continue;
-                    }
+                    if (relationship == Relationship.EQUAL)
+                        relationship = Relationship.LESS_THAN;
+                    else if (relationship == Relationship.GREATER_THAN)
+                        return Relationship.DISJOINT;
+                    // relationship == Relationship.LESS_THAN
                 }
             }
             else if (compareId > 0)
@@ -211,63 +209,40 @@
                 // only advance the right context
                 rightState.moveToNext();
 
-                if (relationship == ContextRelationship.EQUAL)
-                {
-                    relationship = ContextRelationship.LESS_THAN;
-                }
-                else if (relationship == ContextRelationship.GREATER_THAN)
-                {
-                    return ContextRelationship.DISJOINT;
-                }
-                else
-                {
-                    // relationship == ContextRelationship.LESS_THAN
-                    continue;
-                }
+                if (relationship == Relationship.EQUAL)
+                    relationship = Relationship.LESS_THAN;
+                else if (relationship == Relationship.GREATER_THAN)
+                    return Relationship.DISJOINT;
+                // relationship == Relationship.LESS_THAN
             }
             else // compareId < 0
             {
                 // only advance the left context
                 leftState.moveToNext();
 
-                if (relationship == ContextRelationship.EQUAL)
-                {
-                    relationship = ContextRelationship.GREATER_THAN;
-                }
-                else if (relationship == ContextRelationship.GREATER_THAN)
-                {
-                    continue;
-                }
-                else
-                // relationship == ContextRelationship.LESS_THAN
-                {
-                    return ContextRelationship.DISJOINT;
-                }
+                if (relationship == Relationship.EQUAL)
+                    relationship = Relationship.GREATER_THAN;
+                else if (relationship == Relationship.LESS_THAN)
+                    return Relationship.DISJOINT;
+                // relationship == Relationship.GREATER_THAN
             }
         }
 
         // check final lengths
         if (leftState.hasRemaining())
         {
-            if (relationship == ContextRelationship.EQUAL)
-            {
-                return ContextRelationship.GREATER_THAN;
-            }
-            else if (relationship == ContextRelationship.LESS_THAN)
-            {
-                return ContextRelationship.DISJOINT;
-            }
+            if (relationship == Relationship.EQUAL)
+                return Relationship.GREATER_THAN;
+            else if (relationship == Relationship.LESS_THAN)
+                return Relationship.DISJOINT;
         }
-        else if (rightState.hasRemaining())
+
+        if (rightState.hasRemaining())
         {
-            if (relationship == ContextRelationship.EQUAL)
-            {
-                return ContextRelationship.LESS_THAN;
-            }
-            else if (relationship == ContextRelationship.GREATER_THAN)
-            {
-                return ContextRelationship.DISJOINT;
-            }
+            if (relationship == Relationship.EQUAL)
+                return Relationship.LESS_THAN;
+            else if (relationship == Relationship.GREATER_THAN)
+                return Relationship.DISJOINT;
         }
 
         return relationship;
@@ -278,10 +253,12 @@
      *
      * @param left counter context.
      * @param right counter context.
-     * @param allocator An allocator for the merged value.
      */
-    public ByteBuffer merge(ByteBuffer left, ByteBuffer right, Allocator allocator)
+    public ByteBuffer merge(ByteBuffer left, ByteBuffer right)
     {
+        boolean leftIsSuperSet = true;
+        boolean rightIsSuperSet = true;
+
         int globalCount = 0;
         int localCount = 0;
         int remoteCount = 0;
@@ -294,6 +271,14 @@
             int cmp = leftState.compareIdTo(rightState);
             if (cmp == 0)
             {
+                Relationship rel = compare(leftState, rightState);
+                if (rel == Relationship.GREATER_THAN)
+                    rightIsSuperSet = false;
+                else if (rel == Relationship.LESS_THAN)
+                    leftIsSuperSet = false;
+                else if (rel == Relationship.DISJOINT)
+                    leftIsSuperSet = rightIsSuperSet = false;
+
                 if (leftState.isGlobal() || rightState.isGlobal())
                     globalCount += 1;
                 else if (leftState.isLocal() || rightState.isLocal())
@@ -306,6 +291,8 @@
             }
             else if (cmp > 0)
             {
+                leftIsSuperSet = false;
+
                 if (rightState.isGlobal())
                     globalCount += 1;
                 else if (rightState.isLocal())
@@ -317,6 +304,8 @@
             }
             else // cmp < 0
             {
+                rightIsSuperSet = false;
+
                 if (leftState.isGlobal())
                     globalCount += 1;
                 else if (leftState.isLocal())
@@ -328,6 +317,17 @@
             }
         }
 
+        if (leftState.hasRemaining())
+            rightIsSuperSet = false;
+        else if (rightState.hasRemaining())
+            leftIsSuperSet = false;
+
+        // if one of the contexts is a superset, return it early.
+        if (leftIsSuperSet)
+            return left;
+        else if (rightIsSuperSet)
+            return right;
+
         while (leftState.hasRemaining())
         {
             if (leftState.isGlobal())
@@ -355,7 +355,7 @@
         leftState.reset();
         rightState.reset();
 
-        return merge(ContextState.allocate(globalCount, localCount, remoteCount, allocator), leftState, rightState);
+        return merge(ContextState.allocate(globalCount, localCount, remoteCount), leftState, rightState);
     }
 
     private ByteBuffer merge(ContextState mergedState, ContextState leftState, ContextState rightState)
@@ -365,7 +365,16 @@
             int cmp = leftState.compareIdTo(rightState);
             if (cmp == 0)
             {
-                mergeTie(mergedState, leftState, rightState);
+                Relationship rel = compare(leftState, rightState);
+                if (rel == Relationship.DISJOINT) // two local shards
+                    mergedState.writeLocal(leftState.getCounterId(),
+                                           leftState.getClock() + rightState.getClock(),
+                                           leftState.getCount() + rightState.getCount());
+                else if (rel == Relationship.GREATER_THAN)
+                    leftState.copyTo(mergedState);
+                else // EQUAL or LESS_THAN
+                    rightState.copyTo(mergedState);
+
                 rightState.moveToNext();
                 leftState.moveToNext();
             }
@@ -396,20 +405,26 @@
         return mergedState.context;
     }
 
-    private void mergeTie(ContextState mergedState, ContextState leftState, ContextState rightState)
+    /*
+     * Compares two shards and returns:
+     * - GREATER_THAN if leftState overrides rightState
+     * - LESS_THAN if rightState overrides leftState
+     * - EQUAL for two equal, non-local shards
+     * - DISJOINT for any two local shards
+     */
+    private Relationship compare(ContextState leftState, ContextState rightState)
     {
+        long leftClock = leftState.getClock();
+        long leftCount = leftState.getCount();
+        long rightClock = rightState.getClock();
+        long rightCount = rightState.getCount();
+
         if (leftState.isGlobal() || rightState.isGlobal())
         {
             if (leftState.isGlobal() && rightState.isGlobal())
             {
-                long leftClock = leftState.getClock();
-                long rightClock = rightState.getClock();
-
                 if (leftClock == rightClock)
                 {
-                    long leftCount = leftState.getCount();
-                    long rightCount = rightState.getCount();
-
                     // Can happen if an sstable gets lost and disk failure policy is set to 'best effort'
                     if (leftCount != rightCount && CompactionManager.isCompactionManager.get())
                     {
@@ -420,69 +435,60 @@
                     }
 
                     if (leftCount > rightCount)
-                        leftState.copyTo(mergedState);
+                        return Relationship.GREATER_THAN;
+                    else if (leftCount == rightCount)
+                        return Relationship.EQUAL;
                     else
-                        rightState.copyTo(mergedState);
+                        return Relationship.LESS_THAN;
                 }
                 else
                 {
-                    (leftClock > rightClock ? leftState : rightState).copyTo(mergedState);
+                    return leftClock > rightClock ? Relationship.GREATER_THAN : Relationship.LESS_THAN;
                 }
             }
             else // only one is global - keep that one
             {
-                (leftState.isGlobal() ? leftState : rightState).copyTo(mergedState);
+                return leftState.isGlobal() ? Relationship.GREATER_THAN : Relationship.LESS_THAN;
             }
         }
-        else if (leftState.isLocal() || rightState.isLocal())
+
+        if (leftState.isLocal() || rightState.isLocal())
         {
             // Local id and at least one is a local shard.
             if (leftState.isLocal() && rightState.isLocal())
-            {
-                // both local - sum
-                long clock = leftState.getClock() + rightState.getClock();
-                long count = leftState.getCount() + rightState.getCount();
-                mergedState.writeLocal(leftState.getCounterId(), clock, count);
-            }
+                return Relationship.DISJOINT;
             else // only one is local - keep that one
-            {
-                (leftState.isLocal() ? leftState : rightState).copyTo(mergedState);
-            }
+                return leftState.isLocal() ? Relationship.GREATER_THAN : Relationship.LESS_THAN;
         }
-        else // both are remote shards
+
+        // both are remote shards
+        if (leftClock == rightClock)
         {
-            long leftClock = leftState.getClock();
-            long rightClock = rightState.getClock();
-
-            if (leftClock == rightClock)
+            // We should never see non-local shards w/ same id+clock but different counts. However, if we do
+            // we should "heal" the problem by being deterministic in our selection of shard - and
+            // log the occurrence so that the operator will know something is wrong.
+            if (leftCount != rightCount && CompactionManager.isCompactionManager.get())
             {
-                // We should never see non-local shards w/ same id+clock but different counts. However, if we do
-                // we should "heal" the problem by being deterministic in our selection of shard - and
-                // log the occurrence so that the operator will know something is wrong.
-                long leftCount = leftState.getCount();
-                long rightCount = rightState.getCount();
-
-                if (leftCount != rightCount && CompactionManager.isCompactionManager.get())
-                {
-                    logger.warn("invalid remote counter shard detected; ({}, {}, {}) and ({}, {}, {}) differ only in "
-                                + "count; will pick highest to self-heal on compaction",
-                                leftState.getCounterId(), leftClock, leftCount,
-                                rightState.getCounterId(), rightClock, rightCount);
-                }
-
-                if (leftCount > rightCount)
-                    leftState.copyTo(mergedState);
-                else
-                    rightState.copyTo(mergedState);
+                logger.warn("invalid remote counter shard detected; ({}, {}, {}) and ({}, {}, {}) differ only in "
+                            + "count; will pick highest to self-heal on compaction",
+                            leftState.getCounterId(), leftClock, leftCount,
+                            rightState.getCounterId(), rightClock, rightCount);
             }
+
+            if (leftCount > rightCount)
+                return Relationship.GREATER_THAN;
+            else if (leftCount == rightCount)
+                return Relationship.EQUAL;
             else
-            {
-                if ((leftClock >= 0 && rightClock > 0 && leftClock >= rightClock)
-                        || (leftClock < 0 && (rightClock > 0 || leftClock < rightClock)))
-                    leftState.copyTo(mergedState);
-                else
-                    rightState.copyTo(mergedState);
-            }
+                return Relationship.LESS_THAN;
+        }
+        else
+        {
+            if ((leftClock >= 0 && rightClock > 0 && leftClock >= rightClock)
+                    || (leftClock < 0 && (rightClock > 0 || leftClock < rightClock)))
+                return Relationship.GREATER_THAN;
+            else
+                return Relationship.LESS_THAN;
         }
     }
 
@@ -527,14 +533,9 @@
     public long total(ByteBuffer context)
     {
         long total = 0L;
-
         // we could use a ContextState but it is easy enough that we avoid the object creation
         for (int offset = context.position() + headerLength(context); offset < context.limit(); offset += STEP_LENGTH)
-        {
-            long count = context.getLong(offset + CounterId.LENGTH + CLOCK_LENGTH);
-            total += count;
-        }
-
+            total += context.getLong(offset + CounterId.LENGTH + CLOCK_LENGTH);
         return total;
     }
 
@@ -545,6 +546,24 @@
     }
 
     /**
+     * Detects whether or not the context has any legacy (local or remote) shards in it.
+     */
+    public boolean hasLegacyShards(ByteBuffer context)
+    {
+        int totalCount = (context.remaining() - headerLength(context)) / STEP_LENGTH;
+        int localAndGlobalCount = Math.abs(context.getShort(context.position()));
+
+        if (localAndGlobalCount < totalCount)
+            return true; // remote shard(s) present
+
+        for (int i = 0; i < localAndGlobalCount; i++)
+            if (context.getShort(context.position() + HEADER_SIZE_LENGTH + i * HEADER_ELT_LENGTH) >= 0)
+                return true; // found a local shard
+
+        return false;
+    }
+
+    /**
      * Mark context to delete local references afterward.
      * Marking is done by multiply #elt by -1 to preserve header length
      * and #elt count in order to clear all local refs later.
@@ -642,24 +661,54 @@
     }
 
     /**
-     * Checks whether the provided context has a count for the provided
-     * CounterId.
-     *
-     * TODO: since the context is sorted, we could implement a binary search.
-     * This is however not called in any critical path and contexts will be
-     * fairly small so it doesn't matter much.
+     * Returns the clock and the count associated with the local counter id, or (0, 0) if no such shard is present.
      */
-    public boolean hasCounterId(ByteBuffer context, CounterId id)
+    public ClockAndCount getLocalClockAndCount(ByteBuffer context)
     {
-        // we could use a ContextState but it is easy enough that we avoid the object creation
-        for (int offset = context.position() + headerLength(context); offset < context.limit(); offset += STEP_LENGTH)
+        return getClockAndCountOf(context, CounterId.getLocalId());
+    }
+
+    /**
+     * Returns the clock and the count associated with the given counter id, or (0, 0) if no such shard is present.
+     */
+    @VisibleForTesting
+    public ClockAndCount getClockAndCountOf(ByteBuffer context, CounterId id)
+    {
+        int position = findPositionOf(context, id);
+        if (position == -1)
+            return ClockAndCount.BLANK;
+
+        long clock = context.getLong(position + CounterId.LENGTH);
+        long count = context.getLong(position + CounterId.LENGTH + CLOCK_LENGTH);
+        return ClockAndCount.create(clock, count);
+    }
+
+    /**
+     * Finds the position of a shard with the given id within the context (via binary search).
+     */
+    @VisibleForTesting
+    public int findPositionOf(ByteBuffer context, CounterId id)
+    {
+        int headerLength = headerLength(context);
+        int offset = context.position() + headerLength;
+
+        int left = 0;
+        int right = (context.remaining() - headerLength) / STEP_LENGTH - 1;
+
+        while (right >= left)
         {
-            if (id.equals(CounterId.wrap(context, offset)))
-            {
-                return true;
-            }
+            int middle = (left + right) / 2;
+            int cmp = compareId(context, offset + middle * STEP_LENGTH, id.bytes(), id.bytes().position());
+
+            if (cmp == -1)
+                left = middle + 1;
+            else if (cmp == 0)
+                return offset + middle * STEP_LENGTH;
+            else
+                right = middle - 1;
         }
-        return false;
+
+        return -1; // position not found
     }
 
     /**
@@ -699,12 +748,12 @@
          * Allocate a new context big enough for globalCount + localCount + remoteCount elements
          * and return the initial corresponding ContextState.
          */
-        public static ContextState allocate(int globalCount, int localCount, int remoteCount, Allocator allocator)
+        public static ContextState allocate(int globalCount, int localCount, int remoteCount)
         {
             int headerLength = HEADER_SIZE_LENGTH + (globalCount + localCount) * HEADER_ELT_LENGTH;
             int bodyLength = (globalCount + localCount + remoteCount) * STEP_LENGTH;
 
-            ByteBuffer buffer = allocator.allocate(headerLength + bodyLength);
+            ByteBuffer buffer = ByteBuffer.allocate(headerLength + bodyLength);
             buffer.putShort(buffer.position(), (short) (globalCount + localCount));
 
             return ContextState.wrap(buffer);
@@ -754,20 +803,7 @@
 
         public void copyTo(ContextState other)
         {
-            ByteBufferUtil.arrayCopy(context,
-                                     context.position() + bodyOffset,
-                                     other.context,
-                                     other.context.position() + other.bodyOffset,
-                                     STEP_LENGTH);
-
-            if (currentIsGlobal)
-                other.context.putShort(other.context.position() + other.headerOffset, (short) (other.getElementIndex() + Short.MIN_VALUE));
-            else if (currentIsLocal)
-                other.context.putShort(other.context.position() + other.headerOffset, (short) other.getElementIndex());
-
-            other.currentIsGlobal = currentIsGlobal;
-            other.currentIsLocal = currentIsLocal;
-            other.moveToNext();
+            other.writeElement(getCounterId(), getClock(), getCount(), currentIsGlobal, currentIsLocal);
         }
 
         public int compareIdTo(ContextState other)
@@ -802,17 +838,18 @@
             return context.getLong(context.position() + bodyOffset + CounterId.LENGTH + CLOCK_LENGTH);
         }
 
-        // In 2.0 only used by the unit tests.
         public void writeGlobal(CounterId id, long clock, long count)
         {
             writeElement(id, clock, count, true, false);
         }
 
+        // In 2.1 only used by the unit tests.
         public void writeLocal(CounterId id, long clock, long count)
         {
             writeElement(id, clock, count, false, true);
         }
 
+        // In 2.1 only used by the unit tests.
         public void writeRemote(CounterId id, long clock, long count)
         {
             writeElement(id, clock, count, false, false);
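findPositionOf() above replaces the old linear hasCounterId() scan with a binary search over the fixed-width shard slots that follow the context header. The toy version below works over a plain sorted array of ids (standing in for the 16-byte CounterIds) and returns an index rather than a byte offset; it is a sketch of the same search, not the patch code.

public class ShardSearchSketch
{
    // Toy stand-in for findPositionOf(): each shard is reduced to its id; the real
    // context stores (CounterId, clock, count) in STEP_LENGTH-sized slots after the header.
    static int findPositionOf(long[] sortedShardIds, long id)
    {
        int left = 0;
        int right = sortedShardIds.length - 1;

        while (right >= left)
        {
            int middle = (left + right) >>> 1;
            int cmp = Long.compare(sortedShardIds[middle], id);

            if (cmp < 0)
                left = middle + 1;
            else if (cmp == 0)
                return middle;        // the real method returns a byte offset into the context
            else
                right = middle - 1;
        }

        return -1;                    // not found, mirroring the -1 return above
    }

    public static void main(String[] args)
    {
        long[] shards = { 3, 7, 11, 42 };
        System.out.println(findPositionOf(shards, 11));  // 2
        System.out.println(findPositionOf(shards, 5));   // -1
    }
}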
diff --git a/src/java/org/apache/cassandra/db/context/IContext.java b/src/java/org/apache/cassandra/db/context/IContext.java
deleted file mode 100644
index ab10f55..0000000
--- a/src/java/org/apache/cassandra/db/context/IContext.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.context;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.utils.Allocator;
-
-/**
- * An opaque commutative context.
- *
- * Maintains a ByteBuffer context that represents a partitioned commutative value.
- */
-public interface IContext
-{
-    public static enum ContextRelationship
-    {
-        EQUAL,
-        GREATER_THAN,
-        LESS_THAN,
-        DISJOINT
-    };
-
-    /**
-     * Determine the relationship between two contexts.
-     *
-     * EQUAL:        Equal set of nodes and every count is equal.
-     * GREATER_THAN: Superset of nodes and every count is equal or greater than its corollary.
-     * LESS_THAN:    Subset of nodes and every count is equal or less than its corollary.
-     * DISJOINT:     Node sets are not equal and/or counts are not all greater or less than.
-     *
-     * @param left
-     *            context.
-     * @param right
-     *            context.
-     * @return the ContextRelationship between the contexts.
-     */
-    public ContextRelationship diff(ByteBuffer left, ByteBuffer right);
-
-    /**
-     * Return a context w/ an aggregated count for each node id.
-     *
-     * @param left
-     *            context.
-     * @param right
-     *            context.
-     * @param allocator
-     *            an allocator to allocate the new context from.
-     */
-    public ByteBuffer merge(ByteBuffer left, ByteBuffer right, Allocator allocator);
-
-    /**
-     * Human-readable String from context.
-     *
-     * @param context
-     *            context.
-     * @return a human-readable String of the context.
-     */
-    public String toString(ByteBuffer context);
-}
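The javadoc removed with IContext above is still the clearest statement of what diff() returns, now as CounterContext.Relationship: EQUAL for the same shards with the same clocks, GREATER_THAN when the left context dominates, LESS_THAN when the right one does, and DISJOINT when neither dominates. A worked toy example, with contexts reduced to maps from counter id to clock (not Cassandra code):

import java.util.Map;

public class RelationshipSketch
{
    public static void main(String[] args)
    {
        Map<String, Long> a = Map.of("id1", 2L, "id2", 1L);

        Map<String, Long> same     = Map.of("id1", 2L, "id2", 1L);             // diff(a, same)     == EQUAL
        Map<String, Long> older    = Map.of("id1", 1L, "id2", 1L);             // diff(a, older)    == GREATER_THAN
        Map<String, Long> superset = Map.of("id1", 2L, "id2", 1L, "id3", 5L);  // diff(a, superset) == LESS_THAN
        Map<String, Long> mixed    = Map.of("id1", 3L, "id3", 1L);             // diff(a, mixed)    == DISJOINT

        // GREATER_THAN: a covers every shard of 'older' with clocks >= theirs.
        // LESS_THAN:    'superset' covers every shard of a with clocks >= a's.
        // DISJOINT:     a has id2 that 'mixed' lacks, while 'mixed' has id3 and a newer id1.
        System.out.println(a.equals(same));
    }
}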
diff --git a/src/java/org/apache/cassandra/db/filter/ColumnCounter.java b/src/java/org/apache/cassandra/db/filter/ColumnCounter.java
index 814d8ed..86cfc40 100644
--- a/src/java/org/apache/cassandra/db/filter/ColumnCounter.java
+++ b/src/java/org/apache/cassandra/db/filter/ColumnCounter.java
@@ -20,13 +20,11 @@
  */
 package org.apache.cassandra.db.filter;
 
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ColumnCounter
 {
@@ -39,17 +37,17 @@
         this.timestamp = timestamp;
     }
 
-    public void count(Column column, DeletionInfo.InOrderTester tester)
+    public void count(Cell cell, DeletionInfo.InOrderTester tester)
     {
-        if (!isLive(column, tester, timestamp))
+        if (!isLive(cell, tester, timestamp))
             ignored++;
         else
             live++;
     }
 
-    protected static boolean isLive(Column column, DeletionInfo.InOrderTester tester, long timestamp)
+    protected static boolean isLive(Cell cell, DeletionInfo.InOrderTester tester, long timestamp)
     {
-        return column.isLive(timestamp) && (!tester.isDeleted(column));
+        return cell.isLive(timestamp) && !tester.isDeleted(cell);
     }
 
     public int live()
@@ -68,17 +66,16 @@
             return this;
 
         DeletionInfo.InOrderTester tester = container.inOrderDeletionTester();
-        for (Column c : container)
+        for (Cell c : container)
             count(c, tester);
         return this;
     }
 
     public static class GroupByPrefix extends ColumnCounter
     {
-        private final CompositeType type;
+        private final CellNameType type;
         private final int toGroup;
-        private ByteBuffer[] previous;
-        private boolean previousGroupIsStatic;
+        private CellName previous;
 
         /**
          * A column counter that count only 1 for all the columns sharing a
@@ -90,7 +87,7 @@
          *                column. If 0, all columns are grouped, otherwise we group
          *                those for which the {@code toGroup} first component are equals.
          */
-        public GroupByPrefix(long timestamp, CompositeType type, int toGroup)
+        public GroupByPrefix(long timestamp, CellNameType type, int toGroup)
         {
             super(timestamp);
             this.type = type;
@@ -99,9 +96,9 @@
             assert toGroup == 0 || type != null;
         }
 
-        public void count(Column column, DeletionInfo.InOrderTester tester)
+        public void count(Cell cell, DeletionInfo.InOrderTester tester)
         {
-            if (!isLive(column, tester, timestamp))
+            if (!isLive(cell, tester, timestamp))
             {
                 ignored++;
                 return;
@@ -113,22 +110,17 @@
                 return;
             }
 
-            ByteBuffer[] current = type.split(column.name());
-            assert current.length >= toGroup;
+            CellName current = cell.name();
+            assert current.size() >= toGroup;
 
-            if (previous == null)
+            if (previous != null)
             {
-                // Only the first group can be static
-                previousGroupIsStatic = type.isStaticName(column.name());
-            }
-            else
-            {
-                boolean isSameGroup = previousGroupIsStatic == type.isStaticName(column.name());
+                boolean isSameGroup = previous.isStatic() == current.isStatic();
                 if (isSameGroup)
                 {
                     for (int i = 0; i < toGroup; i++)
                     {
-                        if (ByteBufferUtil.compareUnsigned(previous[i], current[i]) != 0)
+                        if (type.subtype(i).compare(previous.get(i), current.get(i)) != 0)
                         {
                             isSameGroup = false;
                             break;
@@ -142,10 +134,9 @@
                 // We want to count the static group as 1 (CQL) row only if it's the only
                 // group in the partition. So, since we have already counted it at this point,
                 // just don't count the 2nd group if there is one and the first one was static
-                if (previousGroupIsStatic)
+                if (previous.isStatic())
                 {
                     previous = current;
-                    previousGroupIsStatic = false;
                     return;
                 }
             }
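
The GroupByPrefix counter above still counts one CQL row per group of cells sharing their first toGroup clustering components; it now compares CellName components through the CellNameType instead of splitting ByteBuffers. A minimal sketch of that grouping idea over plain string components (hypothetical types, not the Cassandra classes):

    import java.util.List;

    final class GroupByPrefixSketch
    {
        // Counts one "row" per run of consecutive, sorted names sharing their first toGroup components.
        // Names are assumed to be sorted and to have at least toGroup components each.
        static int countGroups(List<String[]> sortedNames, int toGroup)
        {
            int groups = 0;
            String[] previous = null;
            for (String[] current : sortedNames)
            {
                boolean sameGroup = previous != null;
                for (int i = 0; sameGroup && i < toGroup; i++)
                    sameGroup = previous[i].equals(current[i]);
                if (!sameGroup)
                    groups++;
                previous = current;
            }
            return groups;
        }

        public static void main(String[] args)
        {
            List<String[]> names = List.of(new String[]{ "a", "1", "v1" },
                                           new String[]{ "a", "1", "v2" },
                                           new String[]{ "a", "2", "v1" });
            System.out.println(countGroups(names, 2)); // prints 2: groups (a, 1) and (a, 2)
        }
    }
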
diff --git a/src/java/org/apache/cassandra/db/filter/ColumnSlice.java b/src/java/org/apache/cassandra/db/filter/ColumnSlice.java
index 9eff12a..1cc348c 100644
--- a/src/java/org/apache/cassandra/db/filter/ColumnSlice.java
+++ b/src/java/org/apache/cassandra/db/filter/ColumnSlice.java
@@ -19,48 +19,219 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Comparator;
-import java.util.Iterator;
-import java.util.NavigableMap;
-
-import com.google.common.collect.AbstractIterator;
+import java.util.List;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ColumnSlice
 {
-    public static final Serializer serializer = new Serializer();
-
-    public static final ColumnSlice ALL_COLUMNS = new ColumnSlice(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+    public static final ColumnSlice ALL_COLUMNS = new ColumnSlice(Composites.EMPTY, Composites.EMPTY);
     public static final ColumnSlice[] ALL_COLUMNS_ARRAY = new ColumnSlice[]{ ALL_COLUMNS };
 
-    public final ByteBuffer start;
-    public final ByteBuffer finish;
+    public final Composite start;
+    public final Composite finish;
 
-    public ColumnSlice(ByteBuffer start, ByteBuffer finish)
+    public ColumnSlice(Composite start, Composite finish)
     {
         assert start != null && finish != null;
         this.start = start;
         this.finish = finish;
     }
 
-    public boolean isAlwaysEmpty(AbstractType<?> comparator, boolean reversed)
+    public boolean isAlwaysEmpty(CellNameType comparator, boolean reversed)
     {
-        Comparator<ByteBuffer> orderedComparator = reversed ? comparator.reverseComparator : comparator;
-        return (start.remaining() > 0 && finish.remaining() > 0 && orderedComparator.compare(start, finish) > 0);
+        Comparator<Composite> orderedComparator = reversed ? comparator.reverseComparator() : comparator;
+        return !start.isEmpty() && !finish.isEmpty() && orderedComparator.compare(start, finish) > 0;
     }
 
-    public boolean includes(Comparator<ByteBuffer> cmp, ByteBuffer name)
+    public boolean includes(Comparator<Composite> cmp, Composite name)
     {
-        return cmp.compare(start, name) <= 0 && (finish.equals(ByteBufferUtil.EMPTY_BYTE_BUFFER) || cmp.compare(finish, name) >= 0);
+        return (start.isEmpty() || cmp.compare(start, name) <= 0) && (finish.isEmpty() || cmp.compare(finish, name) >= 0);
     }
 
-    public boolean isBefore(Comparator<ByteBuffer> cmp, ByteBuffer name)
+    public boolean isBefore(Comparator<Composite> cmp, Composite name)
     {
-        return !finish.equals(ByteBufferUtil.EMPTY_BYTE_BUFFER) && cmp.compare(finish, name) < 0;
+        return !finish.isEmpty() && cmp.compare(finish, name) < 0;
+    }
+
+    public boolean intersects(List<ByteBuffer> minCellNames, List<ByteBuffer> maxCellNames, CellNameType comparator, boolean reversed)
+    {
+        Composite sStart = reversed ? finish : start;
+        Composite sEnd = reversed ? start : finish;
+
+        if (compare(sStart, maxCellNames, comparator, true) > 0 || compare(sEnd, minCellNames, comparator, false) < 0)
+            return false;
+
+        // We could safely return true here, but there's a minor optimization: if the first component is restricted
+        // to a single value, we can check that the second component falls within the min/max for that component
+        // (and repeat for all components).
+        for (int i = 0; i < minCellNames.size() && i < maxCellNames.size(); i++)
+        {
+            AbstractType<?> t = comparator.subtype(i);
+            ByteBuffer s = i < sStart.size() ? sStart.get(i) : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+            ByteBuffer f = i < sEnd.size() ? sEnd.get(i) : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+            // we already know the first component falls within its min/max range (otherwise we wouldn't get here)
+            if (i > 0 && (i < sEnd.size() && t.compare(f, minCellNames.get(i)) < 0 ||
+                          i < sStart.size() && t.compare(s, maxCellNames.get(i)) > 0))
+                return false;
+
+            // if this component isn't equal in the start and finish, we don't need to check any more
+            if (i >= sStart.size() || i >= sEnd.size() || t.compare(s, f) != 0)
+                break;
+        }
+
+        return true;
+    }
+
+    /** Helper method for intersects() */
+    private int compare(Composite sliceBounds, List<ByteBuffer> sstableBounds, CellNameType comparator, boolean isSliceStart)
+    {
+        for (int i = 0; i < sstableBounds.size(); i++)
+        {
+            if (i >= sliceBounds.size())
+            {
+                // When isSliceStart is true, we're comparing the start of the slice against the max cell name for the sstable,
+                // so the slice is something like [(1, 0), (1, 0)], and the sstable max is something like (1, 0, 1).
+                // We want to return -1 (slice start is smaller than max column name) so that we say the slice intersects.
+                // The opposite is true when dealing with the end slice.  For example, with the same slice and a min
+                // cell name of (1, 0, 1), we want to return 1 (slice end is bigger than min column name).
+                return isSliceStart ? -1 : 1;
+            }
+
+            int comparison = comparator.subtype(i).compare(sliceBounds.get(i), sstableBounds.get(i));
+            if (comparison != 0)
+                return comparison;
+        }
+
+        // the slice bound and sstable bound have been equal in all components so far
+        if (sliceBounds.size() > sstableBounds.size())
+        {
+            // We have the opposite situation from the one described above.  With a slice of [(1, 0), (1, 0)],
+            // and a min/max cell name of (1), we want to say the slice start is smaller than the max and the slice
+            // end is larger than the min.
+            return isSliceStart ? -1 : 1;
+        }
+
+        return 0;
+    }
+
+    /**
+     * Validates that the provided slice array contains only non-overlapping slices that are valid for a query
+     * (whether {@code reversed} or not) on a table using {@code type} as the comparator.
+     */
+    public static boolean validateSlices(ColumnSlice[] slices, CellNameType type, boolean reversed)
+    {
+        Comparator<Composite> comparator = reversed ? type.reverseComparator() : type;
+
+        for (int i = 0; i < slices.length; i++)
+        {
+            Composite start = slices[i].start;
+            Composite finish = slices[i].finish;
+
+            if (start.isEmpty() || finish.isEmpty())
+            {
+                if (start.isEmpty() && i > 0)
+                    return false;
+
+                if (finish.isEmpty())
+                    return i == slices.length - 1;
+            }
+            else
+            {
+                // the previous slice's finish cannot be empty here (the prior iteration would have returned)
+                if (i > 0 && comparator.compare(slices[i - 1].finish, start) >= 0)
+                    return false;
+
+                if (comparator.compare(start, finish) > 0)
+                    return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Takes an array of slices (potentially overlapping and in any order, though each individual slice must have
+     * its start before or equal to its end in {@code comparator} order) and returns an equivalent array of
+     * non-overlapping slices in {@code comparator} order.
+     *
+     * @param slices an array of slices. This may be modified by this method.
+     * @param comparator the order in which to sort the slices.
+     * @return the smallest possible array of non-overlapping slices in {@code comparator} order. If the original
+     * slices are already non-overlapping and in comparator order, this may or may not return the provided slices
+     * directly.
+     */
+    public static ColumnSlice[] deoverlapSlices(ColumnSlice[] slices, final Comparator<Composite> comparator)
+    {
+        if (slices.length <= 1)
+            return slices;
+
+        Arrays.sort(slices, new Comparator<ColumnSlice>()
+        {
+            @Override
+            public int compare(ColumnSlice s1, ColumnSlice s2)
+            {
+                if (s1.start.isEmpty() || s2.start.isEmpty())
+                {
+                    if (s1.start.isEmpty() != s2.start.isEmpty())
+                        return s1.start.isEmpty() ? -1 : 1;
+                }
+                else
+                {
+                    int c = comparator.compare(s1.start, s2.start);
+                    if (c != 0)
+                        return c;
+                }
+
+                // For the finish, empty always means greater
+                return s1.finish.isEmpty() || s2.finish.isEmpty()
+                     ? (s1.finish.isEmpty() ? 1 : -1)
+                     : comparator.compare(s1.finish, s2.finish);
+            }
+        });
+
+        List<ColumnSlice> slicesCopy = new ArrayList<>(slices.length);
+
+        ColumnSlice last = slices[0];
+
+        for (int i = 1; i < slices.length; i++)
+        {
+            ColumnSlice s2 = slices[i];
+
+            boolean includesStart = last.includes(comparator, s2.start);
+            boolean includesFinish = s2.finish.isEmpty() ? last.finish.isEmpty() : last.includes(comparator, s2.finish);
+
+            if (includesStart && includesFinish)
+                continue;
+
+            if (!includesStart && !includesFinish)
+            {
+                slicesCopy.add(last);
+                last = s2;
+                continue;
+            }
+
+            if (includesStart)
+            {
+                last = new ColumnSlice(last.start, s2.finish);
+                continue;
+            }
+
+            assert !includesFinish;
+        }
+
+        slicesCopy.add(last);
+
+        return slicesCopy.toArray(new ColumnSlice[slicesCopy.size()]);
     }
 
     @Override
@@ -82,84 +253,37 @@
     @Override
     public String toString()
     {
-        return "[" + ByteBufferUtil.bytesToHex(start) + ", " + ByteBufferUtil.bytesToHex(finish) + "]";
+        return "[" + ByteBufferUtil.bytesToHex(start.toByteBuffer()) + ", " + ByteBufferUtil.bytesToHex(finish.toByteBuffer()) + "]";
     }
 
     public static class Serializer implements IVersionedSerializer<ColumnSlice>
     {
-        public void serialize(ColumnSlice cs, DataOutput out, int version) throws IOException
+        private final CType type;
+
+        public Serializer(CType type)
         {
-            ByteBufferUtil.writeWithShortLength(cs.start, out);
-            ByteBufferUtil.writeWithShortLength(cs.finish, out);
+            this.type = type;
+        }
+
+        public void serialize(ColumnSlice cs, DataOutputPlus out, int version) throws IOException
+        {
+            ISerializer<Composite> serializer = type.serializer();
+            serializer.serialize(cs.start, out);
+            serializer.serialize(cs.finish, out);
         }
 
         public ColumnSlice deserialize(DataInput in, int version) throws IOException
         {
-            ByteBuffer start = ByteBufferUtil.readWithShortLength(in);
-            ByteBuffer finish = ByteBufferUtil.readWithShortLength(in);
+            ISerializer<Composite> serializer = type.serializer();
+            Composite start = serializer.deserialize(in);
+            Composite finish = serializer.deserialize(in);
             return new ColumnSlice(start, finish);
         }
 
         public long serializedSize(ColumnSlice cs, int version)
         {
-            TypeSizes sizes = TypeSizes.NATIVE;
-
-            int startSize = cs.start.remaining();
-            int finishSize = cs.finish.remaining();
-
-            int size = 0;
-            size += sizes.sizeof((short) startSize) + startSize;
-            size += sizes.sizeof((short) finishSize) + finishSize;
-            return size;
-        }
-    }
-
-    public static class NavigableMapIterator extends AbstractIterator<Column>
-    {
-        private final NavigableMap<ByteBuffer, Column> map;
-        private final ColumnSlice[] slices;
-
-        private int idx = 0;
-        private Iterator<Column> currentSlice;
-
-        public NavigableMapIterator(NavigableMap<ByteBuffer, Column> map, ColumnSlice[] slices)
-        {
-            this.map = map;
-            this.slices = slices;
-        }
-
-        protected Column computeNext()
-        {
-            if (currentSlice == null)
-            {
-                if (idx >= slices.length)
-                    return endOfData();
-
-                ColumnSlice slice = slices[idx++];
-                // Note: we specialize the case of start == "" and finish = "" because it is slightly more efficient, but also they have a specific
-                // meaning (namely, they always extend to the beginning/end of the range).
-                if (slice.start.remaining() == 0)
-                {
-                    if (slice.finish.remaining() == 0)
-                        currentSlice = map.values().iterator();
-                    else
-                        currentSlice = map.headMap(slice.finish, true).values().iterator();
-                }
-                else if (slice.finish.remaining() == 0)
-                {
-                    currentSlice = map.tailMap(slice.start, true).values().iterator();
-                }
-                else
-                {
-                    currentSlice = map.subMap(slice.start, true, slice.finish, true).values().iterator();
-                }
-            }
-
-            if (currentSlice.hasNext())
-                return currentSlice.next();
-
-            currentSlice = null;
-            return computeNext();
+            ISerializer<Composite> serializer = type.serializer();
+            return serializer.serializedSize(cs.start, TypeSizes.NATIVE) + serializer.serializedSize(cs.finish, TypeSizes.NATIVE);
         }
     }
 }
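
deoverlapSlices above sorts the slices on their start bound (an empty start sorting first) and then folds any slice whose start is still covered by the previous slice into it. Ignoring the unbounded/empty-bound handling, the same merge step on plain closed integer intervals could look like this sketch:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;

    final class DeoverlapSketch
    {
        // Merges overlapping closed intervals [start, end]; each interval must have start <= end.
        static List<int[]> deoverlap(List<int[]> intervals)
        {
            if (intervals.size() <= 1)
                return intervals;

            List<int[]> sorted = new ArrayList<>(intervals);
            sorted.sort(Comparator.comparingInt(i -> i[0]));

            List<int[]> result = new ArrayList<>();
            int[] last = sorted.get(0);
            for (int i = 1; i < sorted.size(); i++)
            {
                int[] current = sorted.get(i);
                if (current[0] <= last[1])
                {
                    // current starts inside last: extend last if current reaches further
                    last = new int[]{ last[0], Math.max(last[1], current[1]) };
                }
                else
                {
                    // disjoint: keep last, move on
                    result.add(last);
                    last = current;
                }
            }
            result.add(last);
            return result;
        }

        public static void main(String[] args)
        {
            for (int[] merged : deoverlap(List.of(new int[]{ 5, 9 }, new int[]{ 1, 3 }, new int[]{ 2, 6 })))
                System.out.println(Arrays.toString(merged)); // prints [1, 9]
        }
    }
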
diff --git a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java b/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
index 5c3662b..b152472 100644
--- a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
@@ -18,23 +18,20 @@
 package org.apache.cassandra.db.filter;
 
 import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
+import java.util.*;
 
+import com.google.common.base.Objects;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 
 /**
  * Extends a column filter (IFilter) to include a number of IndexExpression.
@@ -131,9 +128,9 @@
      * @return true if the provided data satisfies all the expressions from
      * the clause of this filter.
      */
-    public abstract boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, ColumnNameBuilder builder);
+    public abstract boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement);
 
-    public static boolean satisfies(int comparison, IndexOperator op)
+    public static boolean satisfies(int comparison, IndexExpression.Operator op)
     {
         switch (op)
         {
@@ -152,6 +149,18 @@
         }
     }
 
+    @Override
+    public String toString()
+    {
+        return Objects.toStringHelper(this)
+                      .add("dataRange", dataRange)
+                      .add("maxResults", maxResults)
+                      .add("currentLimit", currentLimit)
+                      .add("timestamp", timestamp)
+                      .add("countCQL3Rows", countCQL3Rows)
+                      .toString();
+    }
+
     public static class WithClauses extends ExtendedFilter
     {
         private final List<IndexExpression> clause;
@@ -182,7 +191,7 @@
              * We also don't want to do for paging ranges as the actual filter depends on the row key (it would
              * probably be possible to make it work but we won't really use it so we don't bother).
              */
-            if (cfs.getComparator() instanceof CompositeType || dataRange instanceof DataRange.Paging)
+            if (cfs.getComparator().isCompound() || dataRange instanceof DataRange.Paging)
                 return null;
 
             IDiskAtomFilter filter = dataRange.columnFilter(null); // ok since not a paging range
@@ -202,11 +211,9 @@
                 assert filter instanceof NamesQueryFilter;
                 if (!clause.isEmpty())
                 {
-                    SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(cfs.getComparator());
+                    SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
                     for (IndexExpression expr : clause)
-                    {
-                        columns.add(expr.column_name);
-                    }
+                        columns.add(cfs.getComparator().cellFromByteBuffer(expr.column));
                     columns.addAll(((NamesQueryFilter) filter).columns);
                     return ((NamesQueryFilter) filter).withUpdatedColumns(columns);
                 }
@@ -237,7 +244,7 @@
 
             for (IndexExpression expr : clause)
             {
-                if (data.getColumn(expr.column_name) == null)
+                if (data.getColumn(data.getComparator().cellFromByteBuffer(expr.column)) == null)
                 {
                     logger.debug("adding extraFilter to cover additional expressions");
                     return true;
@@ -255,19 +262,20 @@
              * 2) We don't yet allow non-indexed range slice with filters in CQL3 (i.e. this will never be
              * called by CFS.filter() for composites).
              */
-            assert !(cfs.getComparator() instanceof CompositeType) : "Sequential scan with filters is not supported (if you just created an index, you "
-                                                                     + "need to wait for the creation to be propagated to all nodes before querying it)";
+            assert !(cfs.getComparator().isCompound()) : "Sequential scan with filters is not supported (if you just created an index, you "
+                                                         + "need to wait for the creation to be propagated to all nodes before querying it)";
 
-            if (!needsExtraQuery(rowKey.key, data))
+            if (!needsExtraQuery(rowKey.getKey(), data))
                 return null;
 
             // Note: for counters we must be careful to not add a column that was already there (to avoid overcount). That is
             // why we do the dance of avoiding to query any column we already have (it's also more efficient anyway)
-            SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(cfs.getComparator());
+            SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
             for (IndexExpression expr : clause)
             {
-                if (data.getColumn(expr.column_name) == null)
-                    columns.add(expr.column_name);
+                CellName name = data.getComparator().cellFromByteBuffer(expr.column);
+                if (data.getColumn(name) == null)
+                    columns.add(name);
             }
             assert !columns.isEmpty();
             return new NamesQueryFilter(columns);
@@ -279,68 +287,134 @@
                 return data;
 
             ColumnFamily pruned = data.cloneMeShallow();
-            IDiskAtomFilter filter = dataRange.columnFilter(rowKey.key);
-            OnDiskAtomIterator iter = filter.getColumnFamilyIterator(rowKey, data);
+            IDiskAtomFilter filter = dataRange.columnFilter(rowKey.getKey());
+            Iterator<Cell> iter = filter.getColumnIterator(data);
             filter.collectReducedColumns(pruned, QueryFilter.gatherTombstones(pruned, iter), cfs.gcBefore(timestamp), timestamp);
             return pruned;
         }
 
-        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, ColumnNameBuilder builder)
+        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement)
         {
-            // We enforces even the primary clause because reads are not synchronized with writes and it is thus possible to have a race
-            // where the index returned a row which doesn't have the primary column when we actually read it
             for (IndexExpression expression : clause)
             {
-                ColumnDefinition def = data.metadata().getColumnDefinition(expression.column_name);
+                ColumnDefinition def = data.metadata().getColumnDefinition(expression.column);
                 ByteBuffer dataValue = null;
                 AbstractType<?> validator = null;
                 if (def == null)
                 {
                     // This can't happen with CQL3 as this should be rejected upfront. For thrift however,
-                    // column name are not predefined. But that means the column name correspond to an internal one.
-                    Column column = data.getColumn(expression.column_name);
-                    if (column != null)
+                    // cell names are not predefined. But that means the cell name corresponds to an internal one.
+                    Cell cell = data.getColumn(data.getComparator().cellFromByteBuffer(expression.column));
+                    if (cell != null)
                     {
-                        dataValue = column.value();
+                        dataValue = cell.value();
                         validator = data.metadata().getDefaultValidator();
                     }
                 }
                 else
                 {
-                    dataValue = extractDataValue(def, rowKey.key, data, builder);
-                    validator = def.getValidator();
+                    if (def.type.isCollection())
+                    {
+                        if (!collectionSatisfies(def, data, prefix, expression, collectionElement))
+                            return false;
+                        continue;
+                    }
+
+                    dataValue = extractDataValue(def, rowKey.getKey(), data, prefix);
+                    validator = def.type;
                 }
 
                 if (dataValue == null)
                     return false;
 
                 int v = validator.compare(dataValue, expression.value);
-                if (!satisfies(v, expression.op))
+                if (!satisfies(v, expression.operator))
                     return false;
             }
             return true;
         }
 
-        private ByteBuffer extractDataValue(ColumnDefinition def, ByteBuffer rowKey, ColumnFamily data, ColumnNameBuilder builder)
+        private static boolean collectionSatisfies(ColumnDefinition def, ColumnFamily data, Composite prefix, IndexExpression expr, ByteBuffer collectionElement)
         {
-            switch (def.type)
+            assert def.type.isCollection();
+            CollectionType type = (CollectionType)def.type;
+
+            if (expr.operator == IndexExpression.Operator.CONTAINS)
+            {
+                // get a slice of the collection cells
+                Iterator<Cell> iter = data.iterator(new ColumnSlice[]{ data.getComparator().create(prefix, def).slice() });
+                while (iter.hasNext())
+                {
+                    Cell cell = iter.next();
+                    if (type.kind == CollectionType.Kind.SET)
+                    {
+                        if (type.nameComparator().compare(cell.name().collectionElement(), expr.value) == 0)
+                            return true;
+                    }
+                    else
+                    {
+                        if (type.valueComparator().compare(cell.value(), expr.value) == 0)
+                            return true;
+                    }
+                }
+
+                return false;
+            }
+
+            switch (type.kind)
+            {
+                case LIST:
+                    assert collectionElement != null;
+                    return type.valueComparator().compare(data.getColumn(data.getComparator().create(prefix, def, collectionElement)).value(), expr.value) == 0;
+                case SET:
+                    return data.getColumn(data.getComparator().create(prefix, def, expr.value)) != null;
+                case MAP:
+                    if (expr.operator == IndexExpression.Operator.CONTAINS_KEY)
+                    {
+                        return data.getColumn(data.getComparator().create(prefix, def, expr.value)) != null;
+                    }
+                    else
+                    {
+                        assert collectionElement != null;
+                        return type.valueComparator().compare(data.getColumn(data.getComparator().create(prefix, def, collectionElement)).value(), expr.value) == 0;
+                    }
+            }
+            throw new AssertionError();
+        }
+
+        private ByteBuffer extractDataValue(ColumnDefinition def, ByteBuffer rowKey, ColumnFamily data, Composite prefix)
+        {
+            switch (def.kind)
             {
                 case PARTITION_KEY:
-                    return def.componentIndex == null
+                    return def.isOnAllComponents()
                          ? rowKey
-                         : ((CompositeType)data.metadata().getKeyValidator()).split(rowKey)[def.componentIndex];
-                case CLUSTERING_KEY:
-                    return builder.get(def.componentIndex);
+                         : ((CompositeType)data.metadata().getKeyValidator()).split(rowKey)[def.position()];
+                case CLUSTERING_COLUMN:
+                    return prefix.get(def.position());
                 case REGULAR:
-                    ByteBuffer colName = builder == null ? def.name : builder.copy().add(def.name).build();
-                    Column column = data.getColumn(colName);
-                    return column == null ? null : column.value();
+                    CellName cname = prefix == null
+                                   ? data.getComparator().cellFromByteBuffer(def.name.bytes)
+                                   : data.getComparator().create(prefix, def);
+
+                    Cell cell = data.getColumn(cname);
+                    return cell == null ? null : cell.value();
                 case COMPACT_VALUE:
                     assert data.getColumnCount() == 1;
                     return data.getSortedColumns().iterator().next().value();
             }
             throw new AssertionError();
         }
+
+        @Override
+        public String toString()
+        {
+            return Objects.toStringHelper(this)
+                          .add("dataRange", dataRange)
+                          .add("timestamp", timestamp)
+                          .add("clause", clause)
+                          .toString();
+        }
     }
 
     private static class EmptyClauseFilter extends ExtendedFilter
@@ -365,7 +439,7 @@
             return data;
         }
 
-        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, ColumnNameBuilder builder)
+        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement)
         {
             return true;
         }
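
isSatisfiedBy above resolves a value for each expression (from the row key, the clustering prefix, or a cell), compares it to the expression value with the column's type, and feeds the result to satisfies(). A standalone sketch of that last step, with an operator enum assumed to mirror the usual EQ/GT/GTE/LT/LTE cases:

    final class OperatorCheckSketch
    {
        enum Operator { EQ, GT, GTE, LT, LTE }

        // comparison is the result of comparing the stored value to the expression value
        static boolean satisfies(int comparison, Operator op)
        {
            switch (op)
            {
                case EQ:  return comparison == 0;
                case GT:  return comparison > 0;
                case GTE: return comparison >= 0;
                case LT:  return comparison < 0;
                case LTE: return comparison <= 0;
                default:  throw new AssertionError();
            }
        }

        public static void main(String[] args)
        {
            System.out.println(satisfies(Integer.compare(3, 2), Operator.GT)); // prints true
        }
    }
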
diff --git a/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java b/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java
index 69a8950..3750c75 100644
--- a/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java
@@ -18,17 +18,18 @@
 package org.apache.cassandra.db.filter;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Comparator;
 import java.util.Iterator;
-import java.nio.ByteBuffer;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.CType;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
 
 /**
@@ -44,7 +45,9 @@
      * returns an iterator that returns columns from the given columnFamily
      * matching the Filter criteria in sorted order.
      */
-    public OnDiskAtomIterator getColumnFamilyIterator(DecoratedKey key, ColumnFamily cf);
+    public Iterator<Cell> getColumnIterator(ColumnFamily cf);
+
+    public OnDiskAtomIterator getColumnIterator(DecoratedKey key, ColumnFamily cf);
 
     /**
      * Get an iterator that returns columns from the given SSTable using the opened file
@@ -66,55 +69,65 @@
      * by the filter code, which should have some limit on the number of columns
      * to avoid running out of memory on large rows.
      */
-    public void collectReducedColumns(ColumnFamily container, Iterator<Column> reducedColumns, int gcBefore, long now);
+    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, int gcBefore, long now);
 
-    public Comparator<Column> getColumnComparator(AbstractType<?> comparator);
+    public Comparator<Cell> getColumnComparator(CellNameType comparator);
 
     public boolean isReversed();
     public void updateColumnsLimit(int newLimit);
 
     public int getLiveCount(ColumnFamily cf, long now);
-    public ColumnCounter columnCounter(AbstractType<?> comparator, long now);
+    public ColumnCounter columnCounter(CellNameType comparator, long now);
 
     public IDiskAtomFilter cloneShallow();
-    public boolean maySelectPrefix(Comparator<ByteBuffer> cmp, ByteBuffer prefix);
+    public boolean maySelectPrefix(CType type, Composite prefix);
 
-    boolean shouldInclude(SSTableReader sstable);
+    public boolean shouldInclude(SSTableReader sstable);
+
+    public boolean countCQL3Rows(CellNameType comparator);
+
+    public boolean isHeadFilter();
+
+    /**
+     * Whether the provided cf, which is assumed to contain the head of the
+     * partition, contains enough data to cover this filter.
+     */
+    public boolean isFullyCoveredBy(ColumnFamily cf, long now);
 
     public static class Serializer implements IVersionedSerializer<IDiskAtomFilter>
     {
-        public static Serializer instance = new Serializer();
+        private final CellNameType type;
 
-        public void serialize(IDiskAtomFilter filter, DataOutput out, int version) throws IOException
+        public Serializer(CellNameType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(IDiskAtomFilter filter, DataOutputPlus out, int version) throws IOException
         {
             if (filter instanceof SliceQueryFilter)
             {
                 out.writeByte(0);
-                SliceQueryFilter.serializer.serialize((SliceQueryFilter)filter, out, version);
+                type.sliceQueryFilterSerializer().serialize((SliceQueryFilter)filter, out, version);
             }
             else
             {
                 out.writeByte(1);
-                NamesQueryFilter.serializer.serialize((NamesQueryFilter)filter, out, version);
+                type.namesQueryFilterSerializer().serialize((NamesQueryFilter)filter, out, version);
             }
         }
 
         public IDiskAtomFilter deserialize(DataInput in, int version) throws IOException
         {
-            throw new UnsupportedOperationException();
-        }
-
-        public IDiskAtomFilter deserialize(DataInput in, int version, AbstractType<?> comparator) throws IOException
-        {
-            int type = in.readByte();
-            if (type == 0)
+            int b = in.readByte();
+            if (b == 0)
             {
-                return SliceQueryFilter.serializer.deserialize(in, version);
+                return type.sliceQueryFilterSerializer().deserialize(in, version);
             }
             else
             {
-                assert type == 1;
-                return NamesQueryFilter.serializer.deserialize(in, version, comparator);
+                assert b == 1;
+                return type.namesQueryFilterSerializer().deserialize(in, version);
             }
         }
 
@@ -122,10 +135,12 @@
         {
             int size = 1;
             if (filter instanceof SliceQueryFilter)
-                size += SliceQueryFilter.serializer.serializedSize((SliceQueryFilter)filter, version);
+                size += type.sliceQueryFilterSerializer().serializedSize((SliceQueryFilter)filter, version);
             else
-                size += NamesQueryFilter.serializer.serializedSize((NamesQueryFilter)filter, version);
+                size += type.namesQueryFilterSerializer().serializedSize((NamesQueryFilter)filter, version);
             return size;
         }
     }
+
+    public Iterator<RangeTombstone> getRangeTombstoneIterator(ColumnFamily source);
 }
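
The filter serializer above now takes the table's CellNameType and dispatches on a one-byte tag — 0 for a SliceQueryFilter, 1 for a NamesQueryFilter — with each branch delegating to the type-provided serializer. A minimal sketch of that tagged-dispatch pattern with two made-up filter variants (not the Cassandra classes):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInput;
    import java.io.DataInputStream;
    import java.io.DataOutput;
    import java.io.DataOutputStream;
    import java.io.IOException;

    final class TaggedFilterSerializerSketch
    {
        interface Filter {}
        static class SliceFilter implements Filter { final int count; SliceFilter(int count) { this.count = count; } }
        static class NamesFilter implements Filter { final String name; NamesFilter(String name) { this.name = name; } }

        // One byte identifies the variant, then the variant-specific fields follow.
        static void serialize(Filter filter, DataOutput out) throws IOException
        {
            if (filter instanceof SliceFilter)
            {
                out.writeByte(0);
                out.writeInt(((SliceFilter) filter).count);
            }
            else
            {
                out.writeByte(1);
                out.writeUTF(((NamesFilter) filter).name);
            }
        }

        static Filter deserialize(DataInput in) throws IOException
        {
            int b = in.readByte();
            if (b == 0)
                return new SliceFilter(in.readInt());
            assert b == 1;
            return new NamesFilter(in.readUTF());
        }

        public static void main(String[] args) throws IOException
        {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            serialize(new NamesFilter("col"), new DataOutputStream(bytes));
            Filter back = deserialize(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
            System.out.println(((NamesFilter) back).name); // prints "col"
        }
    }
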
diff --git a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java b/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
index 4ff8091..77a5dc7 100644
--- a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
@@ -18,9 +18,7 @@
 package org.apache.cassandra.db.filter;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.Comparator;
 import java.util.Iterator;
 import java.util.SortedSet;
@@ -28,58 +26,62 @@
 
 import org.apache.commons.lang3.StringUtils;
 import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.columniterator.SSTableNamesIterator;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 public class NamesQueryFilter implements IDiskAtomFilter
 {
-    public static final Serializer serializer = new Serializer();
-
-    public final SortedSet<ByteBuffer> columns;
+    public final SortedSet<CellName> columns;
 
     // If true, getLiveCount will always return either 0 or 1. This uses the fact that we know 
     // CQL3 will never use a name filter with cell names spanning multiple CQL3 rows.
     private final boolean countCQL3Rows;
 
-    public NamesQueryFilter(SortedSet<ByteBuffer> columns)
+    public NamesQueryFilter(SortedSet<CellName> columns)
     {
         this(columns, false);
     }
 
-    public NamesQueryFilter(SortedSet<ByteBuffer> columns, boolean countCQL3Rows)
+    public NamesQueryFilter(SortedSet<CellName> columns, boolean countCQL3Rows)
     {
         this.columns = columns;
         this.countCQL3Rows = countCQL3Rows;
     }
 
-    public NamesQueryFilter(ByteBuffer column, Comparator<ByteBuffer> comparator)
-    {
-        this(FBUtilities.singleton(column, comparator));
-    }
-
     public NamesQueryFilter cloneShallow()
     {
         // NQF is immutable as far as shallow cloning is concerned, so save the allocation.
         return this;
     }
 
-    public NamesQueryFilter withUpdatedColumns(SortedSet<ByteBuffer> newColumns)
+    public NamesQueryFilter withUpdatedColumns(SortedSet<CellName> newColumns)
     {
        return new NamesQueryFilter(newColumns, countCQL3Rows);
     }
 
-    public OnDiskAtomIterator getColumnFamilyIterator(DecoratedKey key, ColumnFamily cf)
+    @SuppressWarnings("unchecked")
+    public Iterator<Cell> getColumnIterator(ColumnFamily cf)
     {
         assert cf != null;
-        return new ByNameColumnIterator(columns.iterator(), cf, key);
+        return (Iterator<Cell>) (Iterator<?>) new ByNameColumnIterator(columns.iterator(), null, cf);
+    }
+
+    public OnDiskAtomIterator getColumnIterator(DecoratedKey key, ColumnFamily cf)
+    {
+        assert cf != null;
+        return new ByNameColumnIterator(columns.iterator(), key, cf);
     }
 
     public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, DecoratedKey key)
@@ -92,16 +94,16 @@
         return new SSTableNamesIterator(sstable, file, key, columns, indexEntry);
     }
 
-    public void collectReducedColumns(ColumnFamily container, Iterator<Column> reducedColumns, int gcBefore, long now)
+    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, int gcBefore, long now)
     {
         DeletionInfo.InOrderTester tester = container.inOrderDeletionTester();
         while (reducedColumns.hasNext())
-            container.addIfRelevant(reducedColumns.next(), tester, gcBefore);
+            container.maybeAppendColumn(reducedColumns.next(), tester, gcBefore);
     }
 
-    public Comparator<Column> getColumnComparator(AbstractType<?> comparator)
+    public Comparator<Cell> getColumnComparator(CellNameType comparator)
     {
-        return comparator.columnComparator;
+        return comparator.columnComparator(false);
     }
 
     @Override
@@ -129,19 +131,19 @@
             return cf.hasOnlyTombstones(now) ? 0 : 1;
 
         int count = 0;
-        for (Column column : cf)
+        for (Cell cell : cf)
         {
-            if (column.isLive(now))
+            if (cell.isLive(now))
                 count++;
         }
         return count;
     }
 
-    public boolean maySelectPrefix(Comparator<ByteBuffer> cmp, ByteBuffer prefix)
+    public boolean maySelectPrefix(CType type, Composite prefix)
     {
-        for (ByteBuffer column : columns)
+        for (CellName column : columns)
         {
-            if (ByteBufferUtil.isPrefix(prefix, column))
+            if (prefix.isPrefixOf(type, column))
                 return true;
         }
         return false;
@@ -152,12 +154,33 @@
         return true;
     }
 
-    public boolean countCQL3Rows()
+    public boolean isFullyCoveredBy(ColumnFamily cf, long now)
+    {
+        // cf will cover all the requested columns if the range it covers includes
+        // all said columns
+        CellName first = cf.iterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
+        CellName last = cf.reverseIterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
+
+        return cf.getComparator().compare(first, columns.first()) <= 0
+            && cf.getComparator().compare(columns.last(), last) <= 0;
+    }
+
+    public boolean isHeadFilter()
+    {
+        return false;
+    }
+
+    public boolean countCQL3Rows(CellNameType comparator)
     {
         return countCQL3Rows;
     }
 
-    public ColumnCounter columnCounter(AbstractType<?> comparator, long now)
+    public boolean countCQL3Rows()
+    {
+        return countCQL3Rows(null);
+    }
+
+    public ColumnCounter columnCounter(CellNameType comparator, long now)
     {
         return countCQL3Rows
              ? new ColumnCounter.GroupByPrefix(now, null, 0)
@@ -168,15 +191,27 @@
     {
         private final ColumnFamily cf;
         private final DecoratedKey key;
-        private final Iterator<ByteBuffer> iter;
+        private final Iterator<CellName> iter;
 
-        public ByNameColumnIterator(Iterator<ByteBuffer> iter, ColumnFamily cf, DecoratedKey key)
+        public ByNameColumnIterator(Iterator<CellName> iter, DecoratedKey key, ColumnFamily cf)
         {
             this.iter = iter;
             this.cf = cf;
             this.key = key;
         }
 
+        protected OnDiskAtom computeNext()
+        {
+            while (iter.hasNext())
+            {
+                CellName current = iter.next();
+                Cell cell = cf.getColumn(current);
+                if (cell != null)
+                    return cell;
+            }
+            return endOfData();
+        }
+
         public ColumnFamily getColumnFamily()
         {
             return cf;
@@ -187,44 +222,36 @@
             return key;
         }
 
-        protected OnDiskAtom computeNext()
-        {
-            while (iter.hasNext())
-            {
-                ByteBuffer current = iter.next();
-                Column column = cf.getColumn(current);
-                if (column != null)
-                    return column;
-            }
-            return endOfData();
-        }
-
         public void close() throws IOException { }
     }
 
     public static class Serializer implements IVersionedSerializer<NamesQueryFilter>
     {
-        public void serialize(NamesQueryFilter f, DataOutput out, int version) throws IOException
+        private CellNameType type;
+
+        public Serializer(CellNameType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(NamesQueryFilter f, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(f.columns.size());
-            for (ByteBuffer cName : f.columns)
+            ISerializer<CellName> serializer = type.cellSerializer();
+            for (CellName cName : f.columns)
             {
-                ByteBufferUtil.writeWithShortLength(cName, out);
+                serializer.serialize(cName, out);
             }
             out.writeBoolean(f.countCQL3Rows);
         }
 
         public NamesQueryFilter deserialize(DataInput in, int version) throws IOException
         {
-            throw new UnsupportedOperationException();
-        }
-
-        public NamesQueryFilter deserialize(DataInput in, int version, AbstractType comparator) throws IOException
-        {
             int size = in.readInt();
-            SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(comparator);
+            SortedSet<CellName> columns = new TreeSet<CellName>(type);
+            ISerializer<CellName> serializer = type.cellSerializer();
             for (int i = 0; i < size; ++i)
-                columns.add(ByteBufferUtil.readWithShortLength(in));
+                columns.add(serializer.deserialize(in));
             boolean countCQL3Rows = in.readBoolean();
             return new NamesQueryFilter(columns, countCQL3Rows);
         }
@@ -233,13 +260,40 @@
         {
             TypeSizes sizes = TypeSizes.NATIVE;
             int size = sizes.sizeof(f.columns.size());
-            for (ByteBuffer cName : f.columns)
-            {
-                int cNameSize = cName.remaining();
-                size += sizes.sizeof((short) cNameSize) + cNameSize;
-            }
+            ISerializer<CellName> serializer = type.cellSerializer();
+            for (CellName cName : f.columns)
+                size += serializer.serializedSize(cName, sizes);
             size += sizes.sizeof(f.countCQL3Rows);
             return size;
         }
     }
+
+    public Iterator<RangeTombstone> getRangeTombstoneIterator(final ColumnFamily source)
+    {
+        if (!source.deletionInfo().hasRanges())
+            return Iterators.<RangeTombstone>emptyIterator();
+
+        return new AbstractIterator<RangeTombstone>()
+        {
+            private final Iterator<CellName> names = columns.iterator();
+            private RangeTombstone lastFindRange;
+
+            protected RangeTombstone computeNext()
+            {
+                while (names.hasNext())
+                {
+                    CellName next = names.next();
+                    if (lastFindRange != null && lastFindRange.includes(source.getComparator(), next))
+                        return lastFindRange;
+
+                    // We keep the last range around since names are in sort order, so it's
+                    // possible it will match the next name too.
+                    lastFindRange = source.deletionInfo().rangeCovering(next);
+                    if (lastFindRange != null)
+                        return lastFindRange;
+                }
+                return endOfData();
+            }
+        };
+    }
 }
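
getRangeTombstoneIterator above walks the queried names in sorted order and, before searching the deletion info again, retries the last range tombstone it found, since a range that covered the previous name often covers the next one as well. The same caching idea over plain integer names and ranges, as a sketch:

    import java.util.List;

    final class RangeLookupSketch
    {
        // A closed range of names covered by one tombstone-like entry.
        static class Range
        {
            final int start, end;
            Range(int start, int end) { this.start = start; this.end = end; }
            boolean includes(int name) { return start <= name && name <= end; }
        }

        // Finds a covering range for each queried name (names in sorted order), retrying the last
        // hit first because a range covering the previous name may cover the next one too.
        static int countCoveredNames(List<Integer> sortedNames, List<Range> ranges)
        {
            int covered = 0;
            Range last = null;
            for (int name : sortedNames)
            {
                if (last != null && last.includes(name))
                {
                    covered++;
                    continue;
                }
                last = null;
                for (Range range : ranges)
                {
                    if (range.includes(name))
                    {
                        last = range;
                        covered++;
                        break;
                    }
                }
            }
            return covered;
        }

        public static void main(String[] args)
        {
            System.out.println(countCoveredNames(List.of(1, 2, 7), List.of(new Range(1, 3)))); // prints 2
        }
    }
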
diff --git a/src/java/org/apache/cassandra/db/filter/QueryFilter.java b/src/java/org/apache/cassandra/db/filter/QueryFilter.java
index 7136ae8..f58fa9f 100644
--- a/src/java/org/apache/cassandra/db/filter/QueryFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/QueryFilter.java
@@ -17,15 +17,23 @@
  */
 package org.apache.cassandra.db.filter;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.SortedSet;
 
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.RangeTombstone;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.utils.HeapAllocator;
 import org.apache.cassandra.utils.MergeIterator;
 
 public class QueryFilter
@@ -43,18 +51,10 @@
         this.timestamp = timestamp;
     }
 
-    public OnDiskAtomIterator getMemtableColumnIterator(Memtable memtable)
-    {
-        ColumnFamily cf = memtable.getColumnFamily(key);
-        if (cf == null)
-            return null;
-        return getColumnFamilyIterator(cf);
-    }
-
-    public OnDiskAtomIterator getColumnFamilyIterator(ColumnFamily cf)
+    public Iterator<Cell> getIterator(ColumnFamily cf)
     {
         assert cf != null;
-        return filter.getColumnFamilyIterator(key, cf);
+        return filter.getColumnIterator(cf);
     }
 
     public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable)
@@ -62,75 +62,90 @@
         return filter.getSSTableColumnIterator(sstable, key);
     }
 
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry)
-    {
-        return filter.getSSTableColumnIterator(sstable, file, key, indexEntry);
-    }
-
-    public void collateOnDiskAtom(final ColumnFamily returnCF, List<? extends Iterator<? extends OnDiskAtom>> toCollate, final int gcBefore)
+    public void collateOnDiskAtom(ColumnFamily returnCF,
+                                  List<? extends Iterator<? extends OnDiskAtom>> toCollate,
+                                  int gcBefore)
     {
         collateOnDiskAtom(returnCF, toCollate, filter, gcBefore, timestamp);
     }
 
-    public static void collateOnDiskAtom(final ColumnFamily returnCF, List<? extends Iterator<? extends OnDiskAtom>> toCollate, IDiskAtomFilter filter, int gcBefore, long timestamp)
+    public static void collateOnDiskAtom(ColumnFamily returnCF,
+                                         List<? extends Iterator<? extends OnDiskAtom>> toCollate,
+                                         IDiskAtomFilter filter,
+                                         int gcBefore,
+                                         long timestamp)
     {
-        List<Iterator<Column>> filteredIterators = new ArrayList<Iterator<Column>>(toCollate.size());
+        List<Iterator<Cell>> filteredIterators = new ArrayList<>(toCollate.size());
         for (Iterator<? extends OnDiskAtom> iter : toCollate)
             filteredIterators.add(gatherTombstones(returnCF, iter));
         collateColumns(returnCF, filteredIterators, filter, gcBefore, timestamp);
     }
 
-    /**
-     * When there is only a single source of atoms, we can skip the collate step
-     */
+    // When there is only a single source of atoms, we can skip the collate step
     public void collateOnDiskAtom(ColumnFamily returnCF, Iterator<? extends OnDiskAtom> toCollate, int gcBefore)
     {
-        Iterator<Column> columns = gatherTombstones(returnCF, toCollate);
-        filter.collectReducedColumns(returnCF, columns, gcBefore, timestamp);
+        filter.collectReducedColumns(returnCF, gatherTombstones(returnCF, toCollate), gcBefore, timestamp);
     }
 
-    public void collateColumns(final ColumnFamily returnCF, List<? extends Iterator<Column>> toCollate, int gcBefore)
+    public void collateColumns(ColumnFamily returnCF, List<? extends Iterator<Cell>> toCollate, int gcBefore)
     {
         collateColumns(returnCF, toCollate, filter, gcBefore, timestamp);
     }
 
-    public static void collateColumns(final ColumnFamily returnCF, List<? extends Iterator<Column>> toCollate, IDiskAtomFilter filter, int gcBefore, long timestamp)
+    public static void collateColumns(ColumnFamily returnCF,
+                                      List<? extends Iterator<Cell>> toCollate,
+                                      IDiskAtomFilter filter,
+                                      int gcBefore,
+                                      long timestamp)
     {
-        final Comparator<Column> fcomp = filter.getColumnComparator(returnCF.getComparator());
+        Comparator<Cell> comparator = filter.getColumnComparator(returnCF.getComparator());
+
+        Iterator<Cell> reduced = toCollate.size() == 1
+                               ? toCollate.get(0)
+                               : MergeIterator.get(toCollate, comparator, getReducer(comparator));
+
+        filter.collectReducedColumns(returnCF, reduced, gcBefore, timestamp);
+    }
+
+    private static MergeIterator.Reducer<Cell, Cell> getReducer(final Comparator<Cell> comparator)
+    {
         // define a 'reduced' iterator that merges columns w/ the same name, which
         // greatly simplifies computing liveColumns in the presence of tombstones.
-        MergeIterator.Reducer<Column, Column> reducer = new MergeIterator.Reducer<Column, Column>()
+        return new MergeIterator.Reducer<Cell, Cell>()
         {
-            Column current;
+            Cell current;
 
-            public void reduce(Column next)
+            public void reduce(Cell next)
             {
-                assert current == null || fcomp.compare(current, next) == 0;
-                current = current == null ? next : current.reconcile(next, HeapAllocator.instance);
+                assert current == null || comparator.compare(current, next) == 0;
+                current = current == null ? next : current.reconcile(next);
             }
 
-            protected Column getReduced()
+            protected Cell getReduced()
             {
                 assert current != null;
-                Column toReturn = current;
+                Cell toReturn = current;
                 current = null;
                 return toReturn;
             }
-        };
-        Iterator<Column> reduced = MergeIterator.get(toCollate, fcomp, reducer);
 
-        filter.collectReducedColumns(returnCF, reduced, gcBefore, timestamp);
+            @Override
+            public boolean trivialReduceIsTrivial()
+            {
+                return true;
+            }
+        };
     }
 
     /**
      * Given an iterator of on disk atom, returns an iterator that filters the tombstone range
      * markers adding them to {@code returnCF} and returns the normal column.
      */
-    public static Iterator<Column> gatherTombstones(final ColumnFamily returnCF, final Iterator<? extends OnDiskAtom> iter)
+    public static Iterator<Cell> gatherTombstones(final ColumnFamily returnCF, final Iterator<? extends OnDiskAtom> iter)
     {
-        return new Iterator<Column>()
+        return new Iterator<Cell>()
         {
-            private Column next;
+            private Cell next;
 
             public boolean hasNext()
             {
@@ -141,13 +156,13 @@
                 return next != null;
             }
 
-            public Column next()
+            public Cell next()
             {
                 if (next == null)
                     getNext();
 
                 assert next != null;
-                Column toReturn = next;
+                Cell toReturn = next;
                 next = null;
                 return toReturn;
             }
@@ -158,9 +173,9 @@
                 {
                     OnDiskAtom atom = iter.next();
 
-                    if (atom instanceof Column)
+                    if (atom instanceof Cell)
                     {
-                        next = (Column)atom;
+                        next = (Cell)atom;
                         break;
                     }
                     else
@@ -194,8 +209,8 @@
      */
     public static QueryFilter getSliceFilter(DecoratedKey key,
                                              String cfName,
-                                             ByteBuffer start,
-                                             ByteBuffer finish,
+                                             Composite start,
+                                             Composite finish,
                                              boolean reversed,
                                              int limit,
                                              long timestamp)
@@ -218,7 +233,7 @@
      * @param cfName column family to query
      * @param columns the column names to restrict the results to, sorted in comparator order
      */
-    public static QueryFilter getNamesFilter(DecoratedKey key, String cfName, SortedSet<ByteBuffer> columns, long timestamp)
+    public static QueryFilter getNamesFilter(DecoratedKey key, String cfName, SortedSet<CellName> columns, long timestamp)
     {
         return new QueryFilter(key, cfName, new NamesQueryFilter(columns), timestamp);
     }
@@ -233,4 +248,13 @@
     {
         return filter.shouldInclude(sstable);
     }
+
+    public void delete(DeletionInfo target, ColumnFamily source)
+    {
+        target.add(source.deletionInfo().getTopLevelDeletion());
+        // source is the CF currently in the memtable, and it can be large compared to what the filter selects,
+        // so only consider those range tombstones that the filter does select.
+        for (Iterator<RangeTombstone> iter = filter.getRangeTombstoneIterator(source); iter.hasNext(); )
+            target.add(iter.next(), source.getComparator());
+    }
 }
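
The collateColumns() refactor above boils down to: merge several name-sorted cell iterators, reconcile cells that share a name, and skip the merge entirely when only one source is supplied. Below is a minimal, standalone sketch of that pattern in plain Java; ToyCell, reconcile() and collate() are hypothetical stand-ins for Cassandra's Cell, Cell.reconcile() and the MergeIterator/Reducer machinery, not the actual classes.

import java.util.*;

/** Toy illustration (not Cassandra code) of merging sorted cell streams with reconcile-by-name. */
public class CollateSketch
{
    /** Hypothetical stand-in for a Cell: a name plus a write timestamp. */
    static final class ToyCell
    {
        final String name;
        final long timestamp;
        ToyCell(String name, long timestamp) { this.name = name; this.timestamp = timestamp; }
        public String toString() { return name + "@" + timestamp; }
    }

    /** Last-write-wins reconcile, standing in for Cell.reconcile(). */
    static ToyCell reconcile(ToyCell a, ToyCell b)
    {
        return a.timestamp >= b.timestamp ? a : b;
    }

    /** Holder pairing the head cell of a source with the source itself. */
    static final class Source
    {
        ToyCell head;
        final Iterator<ToyCell> rest;
        Source(ToyCell head, Iterator<ToyCell> rest) { this.head = head; this.rest = rest; }
    }

    /** Merge name-sorted iterators, reconciling same-name cells, with the same single-source fast path as the 2.1 code. */
    static List<ToyCell> collate(List<Iterator<ToyCell>> sources)
    {
        List<ToyCell> out = new ArrayList<ToyCell>();
        if (sources.size() == 1)
        {
            Iterator<ToyCell> only = sources.get(0);
            while (only.hasNext())
                out.add(only.next());
            return out;
        }

        PriorityQueue<Source> heap = new PriorityQueue<Source>(Math.max(1, sources.size()), new Comparator<Source>()
        {
            public int compare(Source a, Source b) { return a.head.name.compareTo(b.head.name); }
        });
        for (Iterator<ToyCell> it : sources)
            if (it.hasNext())
                heap.add(new Source(it.next(), it));

        ToyCell current = null;
        while (!heap.isEmpty())
        {
            Source s = heap.poll();
            ToyCell next = s.head;
            if (s.rest.hasNext())
            {
                s.head = s.rest.next();
                heap.add(s);
            }

            if (current == null)
                current = next;
            else if (current.name.equals(next.name))
                current = reconcile(current, next);   // same name: keep one reconciled cell
            else
            {
                out.add(current);                     // name changed: emit the reduced cell
                current = next;
            }
        }
        if (current != null)
            out.add(current);
        return out;
    }

    public static void main(String[] args)
    {
        List<ToyCell> memtable = Arrays.asList(new ToyCell("a", 2), new ToyCell("c", 1));
        List<ToyCell> sstable = Arrays.asList(new ToyCell("a", 1), new ToyCell("b", 1));
        System.out.println(collate(Arrays.asList(memtable.iterator(), sstable.iterator())));
        // prints [a@2, b@1, c@1]: the two "a" cells are reconciled to the newer write
    }
}
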
diff --git a/src/java/org/apache/cassandra/db/filter/QueryPath.java b/src/java/org/apache/cassandra/db/filter/QueryPath.java
deleted file mode 100644
index 26d15a1..0000000
--- a/src/java/org/apache/cassandra/db/filter/QueryPath.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-/**
- * This class is obsolete internally, but kept for wire compatibility with
- * older nodes. I.e. we kept it only for the serialization part.
- */
-public class QueryPath
-{
-    public final String columnFamilyName;
-    public final ByteBuffer superColumnName;
-    public final ByteBuffer columnName;
-
-    public QueryPath(String columnFamilyName, ByteBuffer superColumnName, ByteBuffer columnName)
-    {
-        this.columnFamilyName = columnFamilyName;
-        this.superColumnName = superColumnName;
-        this.columnName = columnName;
-    }
-
-    public QueryPath(String columnFamilyName, ByteBuffer superColumnName)
-    {
-        this(columnFamilyName, superColumnName, null);
-    }
-
-    @Override
-    public String toString()
-    {
-        return getClass().getSimpleName() + "(" +
-               "columnFamilyName='" + columnFamilyName + '\'' +
-               ", superColumnName='" + superColumnName + '\'' +
-               ", columnName='" + columnName + '\'' +
-               ')';
-    }
-
-    public void serialize(DataOutput out) throws IOException
-    {
-        assert !"".equals(columnFamilyName);
-        assert superColumnName == null || superColumnName.remaining() > 0;
-        assert columnName == null || columnName.remaining() > 0;
-        out.writeUTF(columnFamilyName == null ? "" : columnFamilyName);
-        ByteBufferUtil.writeWithShortLength(superColumnName == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : superColumnName, out);
-        ByteBufferUtil.writeWithShortLength(columnName == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : columnName, out);
-    }
-
-    public static QueryPath deserialize(DataInput din) throws IOException
-    {
-        String cfName = din.readUTF();
-        ByteBuffer scName = ByteBufferUtil.readWithShortLength(din);
-        ByteBuffer cName = ByteBufferUtil.readWithShortLength(din);
-        return new QueryPath(cfName.isEmpty() ? null : cfName,
-                             scName.remaining() == 0 ? null : scName,
-                             cName.remaining() == 0 ? null : cName);
-    }
-
-    public int serializedSize(TypeSizes typeSizes)
-    {
-        int size = 0;
-
-        if (columnFamilyName == null)
-            size += typeSizes.sizeof((short) 0);
-        else
-            size += typeSizes.sizeof(columnFamilyName);
-
-        if (superColumnName == null)
-        {
-            size += typeSizes.sizeof((short) 0);
-        }
-        else
-        {
-            int scNameSize = superColumnName.remaining();
-            size += typeSizes.sizeof((short) scNameSize);
-            size += scNameSize;
-        }
-
-        if (columnName == null)
-        {
-            size += typeSizes.sizeof((short) 0);
-        }
-        else
-        {
-            int cNameSize = columnName.remaining();
-            size += typeSizes.sizeof((short) cNameSize);
-            size += cNameSize;
-        }
-
-        return size;
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java b/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
index e0ccc2f..71d9095 100644
--- a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
@@ -17,12 +17,13 @@
  */
 package org.apache.cassandra.db.filter;
 
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.io.DataInput;
+import java.io.IOException;
 import java.util.*;
 
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -30,17 +31,19 @@
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.columniterator.SSTableSliceIterator;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.tracing.Tracing;
 
 public class SliceQueryFilter implements IDiskAtomFilter
 {
     private static final Logger logger = LoggerFactory.getLogger(SliceQueryFilter.class);
-    public static final Serializer serializer = new Serializer();
 
     public final ColumnSlice[] slices;
     public final boolean reversed;
@@ -50,14 +53,24 @@
     // Not serialized, just a hack for range slices to find the number of live columns counted, even when we group
     private ColumnCounter columnCounter;
 
-    public SliceQueryFilter(ByteBuffer start, ByteBuffer finish, boolean reversed, int count)
+    public SliceQueryFilter(Composite start, Composite finish, boolean reversed, int count)
     {
-        this(new ColumnSlice[] { new ColumnSlice(start, finish) }, reversed, count);
+        this(new ColumnSlice(start, finish), reversed, count);
     }
 
-    public SliceQueryFilter(ByteBuffer start, ByteBuffer finish, boolean reversed, int count, int compositesToGroup)
+    public SliceQueryFilter(Composite start, Composite finish, boolean reversed, int count, int compositesToGroup)
     {
-        this(new ColumnSlice[] { new ColumnSlice(start, finish) }, reversed, count, compositesToGroup);
+        this(new ColumnSlice(start, finish), reversed, count, compositesToGroup);
+    }
+
+    public SliceQueryFilter(ColumnSlice slice, boolean reversed, int count)
+    {
+        this(new ColumnSlice[]{ slice }, reversed, count);
+    }
+
+    public SliceQueryFilter(ColumnSlice slice, boolean reversed, int count, int compositesToGroup)
+    {
+        this(new ColumnSlice[]{ slice }, reversed, count, compositesToGroup);
     }
 
     /**
@@ -92,23 +105,21 @@
         return new SliceQueryFilter(newSlices, reversed, count, compositesToGroup);
     }
 
-    public SliceQueryFilter withUpdatedStart(ByteBuffer newStart, AbstractType<?> comparator)
+    public SliceQueryFilter withUpdatedStart(Composite newStart, CellNameType comparator)
     {
-        Comparator<ByteBuffer> cmp = reversed ? comparator.reverseComparator : comparator;
+        Comparator<Composite> cmp = reversed ? comparator.reverseComparator() : comparator;
 
-        List<ColumnSlice> newSlices = new ArrayList<ColumnSlice>();
+        List<ColumnSlice> newSlices = new ArrayList<>(slices.length);
         boolean pastNewStart = false;
-        for (int i = 0; i < slices.length; i++)
+        for (ColumnSlice slice : slices)
         {
-            ColumnSlice slice = slices[i];
-
             if (pastNewStart)
             {
                 newSlices.add(slice);
                 continue;
             }
 
-            if (slices[i].isBefore(cmp, newStart))
+            if (slice.isBefore(cmp, newStart))
                 continue;
 
             if (slice.includes(cmp, newStart))
@@ -121,15 +132,16 @@
         return withUpdatedSlices(newSlices.toArray(new ColumnSlice[newSlices.size()]));
     }
 
-    public SliceQueryFilter withUpdatedSlice(ByteBuffer start, ByteBuffer finish)
-    {
-        return new SliceQueryFilter(new ColumnSlice[]{ new ColumnSlice(start, finish) }, reversed, count, compositesToGroup);
-    }
-
-    public OnDiskAtomIterator getColumnFamilyIterator(final DecoratedKey key, final ColumnFamily cf)
+    public Iterator<Cell> getColumnIterator(ColumnFamily cf)
     {
         assert cf != null;
-        final Iterator<Column> filteredIter = reversed ? cf.reverseIterator(slices) : cf.iterator(slices);
+        return reversed ? cf.reverseIterator(slices) : cf.iterator(slices);
+    }
+
+    public OnDiskAtomIterator getColumnIterator(final DecoratedKey key, final ColumnFamily cf)
+    {
+        assert cf != null;
+        final Iterator<Cell> iter = getColumnIterator(cf);
 
         return new OnDiskAtomIterator()
         {
@@ -145,12 +157,12 @@
 
             public boolean hasNext()
             {
-                return filteredIter.hasNext();
+                return iter.hasNext();
             }
 
             public OnDiskAtom next()
             {
-                return filteredIter.next();
+                return iter.next();
             }
 
             public void close() throws IOException { }
@@ -172,24 +184,24 @@
         return new SSTableSliceIterator(sstable, file, key, slices, reversed, indexEntry);
     }
 
-    public Comparator<Column> getColumnComparator(AbstractType<?> comparator)
+    public Comparator<Cell> getColumnComparator(CellNameType comparator)
     {
-        return reversed ? comparator.columnReverseComparator : comparator.columnComparator;
+        return reversed ? comparator.columnReverseComparator() : comparator.columnComparator(false);
     }
 
-    public void collectReducedColumns(ColumnFamily container, Iterator<Column> reducedColumns, int gcBefore, long now)
+    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, int gcBefore, long now)
     {
         columnCounter = columnCounter(container.getComparator(), now);
         DeletionInfo.InOrderTester tester = container.deletionInfo().inOrderTester(reversed);
 
         while (reducedColumns.hasNext())
         {
-            Column column = reducedColumns.next();
+            Cell cell = reducedColumns.next();
             if (logger.isTraceEnabled())
                 logger.trace(String.format("collecting %s of %s: %s",
-                                           columnCounter.live(), count, column.getString(container.getComparator())));
+                                           columnCounter.live(), count, cell.getString(container.getComparator())));
 
-            columnCounter.count(column, tester);
+            columnCounter.count(cell, tester);
 
             if (columnCounter.live() > count)
                 break;
@@ -202,18 +214,17 @@
                 throw new TombstoneOverwhelmingException();
             }
 
-            container.addIfRelevant(column, tester, gcBefore);
+            container.maybeAppendColumn(cell, tester, gcBefore);
         }
 
         Tracing.trace("Read {} live and {} tombstoned cells", columnCounter.live(), columnCounter.ignored());
         if (respectTombstoneThresholds() && columnCounter.ignored() > DatabaseDescriptor.getTombstoneWarnThreshold())
         {
             StringBuilder sb = new StringBuilder();
-            AbstractType<?> type = container.metadata().comparator;
+            CellNameType type = container.metadata().comparator;
             for (ColumnSlice sl : slices)
             {
-                if (sl == null)
-                    continue;
+                assert sl != null;
 
                 sb.append('[');
                 sb.append(type.getString(sl.start));
@@ -237,30 +248,30 @@
         return columnCounter(cf.getComparator(), now).countAll(cf).live();
     }
 
-    public ColumnCounter columnCounter(AbstractType<?> comparator, long now)
+    public ColumnCounter columnCounter(CellNameType comparator, long now)
     {
         if (compositesToGroup < 0)
             return new ColumnCounter(now);
         else if (compositesToGroup == 0)
             return new ColumnCounter.GroupByPrefix(now, null, 0);
         else
-            return new ColumnCounter.GroupByPrefix(now, (CompositeType)comparator, compositesToGroup);
+            return new ColumnCounter.GroupByPrefix(now, comparator, compositesToGroup);
     }
 
     public void trim(ColumnFamily cf, int trimTo, long now)
     {
         ColumnCounter counter = columnCounter(cf.getComparator(), now);
 
-        Collection<Column> columns = reversed
+        Collection<Cell> cells = reversed
                                    ? cf.getReverseSortedColumns()
                                    : cf.getSortedColumns();
 
         DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester(reversed);
 
-        for (Iterator<Column> iter = columns.iterator(); iter.hasNext(); )
+        for (Iterator<Cell> iter = cells.iterator(); iter.hasNext(); )
         {
-            Column column = iter.next();
-            counter.count(column, tester);
+            Cell cell = iter.next();
+            counter.count(cell, tester);
 
             if (counter.live() > trimTo)
             {
@@ -274,17 +285,17 @@
         }
     }
 
-    public ByteBuffer start()
+    public Composite start()
     {
         return this.slices[0].start;
     }
 
-    public ByteBuffer finish()
+    public Composite finish()
     {
         return this.slices[slices.length - 1].finish;
     }
 
-    public void setStart(ByteBuffer start)
+    public void setStart(Composite start)
     {
         assert slices.length == 1;
         this.slices[0] = new ColumnSlice(start, this.slices[0].finish);
@@ -321,10 +332,10 @@
         count = newLimit;
     }
 
-    public boolean maySelectPrefix(Comparator<ByteBuffer> cmp, ByteBuffer prefix)
+    public boolean maySelectPrefix(CType type, Composite prefix)
     {
         for (ColumnSlice slice : slices)
-            if (slice.includes(cmp, prefix))
+            if (slice.includes(type, prefix))
                 return true;
         return false;
     }
@@ -333,22 +344,67 @@
     {
         List<ByteBuffer> minColumnNames = sstable.getSSTableMetadata().minColumnNames;
         List<ByteBuffer> maxColumnNames = sstable.getSSTableMetadata().maxColumnNames;
-        assert minColumnNames.size() == maxColumnNames.size();
-        AbstractType<?> comparator = sstable.metadata.comparator;
+        CellNameType comparator = sstable.metadata.comparator;
 
         if (minColumnNames.isEmpty() || maxColumnNames.isEmpty())
             return true;
 
-        return comparator.intersects(minColumnNames, maxColumnNames, this);
+        for (ColumnSlice slice : slices)
+            if (slice.intersects(minColumnNames, maxColumnNames, comparator, reversed))
+                return true;
+
+        return false;
+    }
+
+    public boolean isHeadFilter()
+    {
+        return slices.length == 1 && slices[0].start.isEmpty() && !reversed;
+    }
+
+    public boolean countCQL3Rows(CellNameType comparator)
+    {
+        // If the comparator is dense, a cell == a CQL3 row, so we're always counting
+        // CQL3 rows in particular. Otherwise, we do so only if we group the cells into CQL3 rows.
+        return comparator.isDense() || compositesToGroup >= 0;
+    }
+
+    public boolean isFullyCoveredBy(ColumnFamily cf, long now)
+    {
+        // cf is the beginning of a partition. It covers this filter if:
+        //   1) either this filter requests the head of the partition and requests no more
+        //      than what cf has to offer (note: we do need to use getLiveCount() for that,
+        //      as it knows whether the filter counts cells or CQL3 rows).
+        //   2) the start and finish bound of this filter are included in cf.
+        if (isHeadFilter() && count <= getLiveCount(cf, now))
+            return true;
+
+        if (start().isEmpty() || finish().isEmpty() || !cf.hasColumns())
+            return false;
+
+        Composite low = isReversed() ? finish() : start();
+        Composite high = isReversed() ? start() : finish();
+
+        CellName first = cf.iterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
+        CellName last = cf.reverseIterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
+
+        return cf.getComparator().compare(first, low) <= 0
+            && cf.getComparator().compare(high, last) <= 0;
     }
 
     public static class Serializer implements IVersionedSerializer<SliceQueryFilter>
     {
-        public void serialize(SliceQueryFilter f, DataOutput out, int version) throws IOException
+        private CType type;
+
+        public Serializer(CType type)
+        {
+            this.type = type;
+        }
+
+        public void serialize(SliceQueryFilter f, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(f.slices.length);
             for (ColumnSlice slice : f.slices)
-                ColumnSlice.serializer.serialize(slice, out, version);
+                type.sliceSerializer().serialize(slice, out, version);
             out.writeBoolean(f.reversed);
             int count = f.count;
             out.writeInt(count);
@@ -361,7 +417,7 @@
             ColumnSlice[] slices;
             slices = new ColumnSlice[in.readInt()];
             for (int i = 0; i < slices.length; i++)
-                slices[i] = ColumnSlice.serializer.deserialize(in, version);
+                slices[i] = type.sliceSerializer().deserialize(in, version);
             boolean reversed = in.readBoolean();
             int count = in.readInt();
             int compositesToGroup = -1;
@@ -377,7 +433,7 @@
             int size = 0;
             size += sizes.sizeof(f.slices.length);
             for (ColumnSlice slice : f.slices)
-                size += ColumnSlice.serializer.serializedSize(slice, version);
+                size += type.sliceSerializer().serializedSize(slice, version);
             size += sizes.sizeof(f.reversed);
             size += sizes.sizeof(f.count);
 
@@ -385,4 +441,43 @@
             return size;
         }
     }
+
+    public Iterator<RangeTombstone> getRangeTombstoneIterator(final ColumnFamily source)
+    {
+        final DeletionInfo delInfo = source.deletionInfo();
+        if (!delInfo.hasRanges() || slices.length == 0)
+            return Iterators.<RangeTombstone>emptyIterator();
+
+        return new AbstractIterator<RangeTombstone>()
+        {
+            private int sliceIdx = 0;
+            private Iterator<RangeTombstone> sliceIter = currentRangeIter();
+
+            protected RangeTombstone computeNext()
+            {
+                while (true)
+                {
+                    if (sliceIter.hasNext())
+                        return sliceIter.next();
+
+                    if (!nextSlice())
+                        return endOfData();
+
+                    sliceIter = currentRangeIter();
+                }
+            }
+
+            private Iterator<RangeTombstone> currentRangeIter()
+            {
+                ColumnSlice slice = slices[reversed ? (slices.length - 1 - sliceIdx) : sliceIdx];
+                return reversed ? delInfo.rangeIterator(slice.finish, slice.start)
+                                : delInfo.rangeIterator(slice.start, slice.finish);
+            }
+
+            private boolean nextSlice()
+            {
+                return ++sliceIdx < slices.length;
+            }
+        };
+    }
 }
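
The new isFullyCoveredBy() check asks whether an in-memory partition fragment already answers the filter: either the filter reads the head of the partition and its count is satisfied by the fragment's live data, or both slice bounds fall between the fragment's first and last cell. A rough standalone sketch of that decision follows, assuming plain sorted strings in place of Composite cell names; all names and types here are hypothetical, not Cassandra's.

import java.util.Arrays;
import java.util.List;

/** Toy sketch (not Cassandra code) of the "fully covered" test added to SliceQueryFilter. */
public class CoverageSketch
{
    static boolean isFullyCoveredBy(String start, String finish, boolean reversed, int count,
                                    List<String> sortedLiveNames)
    {
        // 1) a head filter (empty start, not reversed) is covered once the fragment
        //    already holds at least 'count' live entries
        boolean headFilter = start.isEmpty() && !reversed;
        if (headFilter && count <= sortedLiveNames.size())
            return true;

        // 2) otherwise both bounds must fall inside [first, last] of the fragment
        if (start.isEmpty() || finish.isEmpty() || sortedLiveNames.isEmpty())
            return false;

        String low = reversed ? finish : start;
        String high = reversed ? start : finish;
        String first = sortedLiveNames.get(0);
        String last = sortedLiveNames.get(sortedLiveNames.size() - 1);

        return first.compareTo(low) <= 0 && high.compareTo(last) <= 0;
    }

    public static void main(String[] args)
    {
        List<String> names = Arrays.asList("b", "c", "d");
        System.out.println(isFullyCoveredBy("b", "d", false, 10, names)); // true: bounds inside fragment
        System.out.println(isFullyCoveredBy("a", "d", false, 10, names)); // false: 'a' precedes the first cell
        System.out.println(isFullyCoveredBy("", "", false, 2, names));    // true: head filter, count satisfied
    }
}
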
diff --git a/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java
index 87e87cb..d8c03fb 100644
--- a/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java
+++ b/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java
@@ -18,14 +18,19 @@
 package org.apache.cassandra.db.index;
 
 import java.nio.ByteBuffer;
+import java.util.concurrent.Future;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.thrift.IndexExpression;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.LocalToken;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 /**
  * Implements a secondary index for a column family using a second column family
@@ -47,23 +52,34 @@
 
         columnDef = columnDefs.iterator().next();
 
-        AbstractType indexComparator = SecondaryIndex.getIndexComparator(baseCfs.metadata, columnDef);
+        CellNameType indexComparator = SecondaryIndex.getIndexComparator(baseCfs.metadata, columnDef);
         CFMetaData indexedCfMetadata = CFMetaData.newIndexMetadata(baseCfs.metadata, columnDef, indexComparator);
         indexCfs = ColumnFamilyStore.createColumnFamilyStore(baseCfs.keyspace,
                                                              indexedCfMetadata.cfName,
-                                                             new LocalPartitioner(columnDef.getValidator()),
+                                                             new LocalPartitioner(getIndexKeyComparator()),
                                                              indexedCfMetadata);
     }
 
+    protected AbstractType<?> getIndexKeyComparator()
+    {
+        return columnDef.type;
+    }
+
+    @Override
+    public DecoratedKey getIndexKeyFor(ByteBuffer value)
+    {
+        return new BufferDecoratedKey(new LocalToken(getIndexKeyComparator(), value), value);
+    }
+
     @Override
     String indexTypeForGrouping()
     {
         return "_internal_";
     }
 
-    protected abstract ByteBuffer makeIndexColumnName(ByteBuffer rowKey, Column column);
+    protected abstract CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell);
 
-    protected abstract ByteBuffer getIndexedValue(ByteBuffer rowKey, Column column);
+    protected abstract ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell);
 
     protected abstract AbstractType getExpressionComparator();
 
@@ -71,51 +87,52 @@
     {
         return String.format("'%s.%s %s %s'",
                              baseCfs.name,
-                             getExpressionComparator().getString(expr.column_name),
-                             expr.op,
-                             baseCfs.metadata.getColumnDefinition(expr.column_name).getValidator().getString(expr.value));
+                             getExpressionComparator().getString(expr.column),
+                             expr.operator,
+                             baseCfs.metadata.getColumnDefinition(expr.column).type.getString(expr.value));
     }
 
-    public void delete(ByteBuffer rowKey, Column column)
+    public void delete(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
     {
-        if (column.isMarkedForDelete(System.currentTimeMillis()))
+        if (!cell.isLive())
             return;
 
-        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, column));
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, cell));
         int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
-        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata);
-        ByteBuffer name = makeIndexColumnName(rowKey, column);
-        assert name.remaining() > 0 && name.remaining() <= Column.MAX_NAME_LENGTH : name.remaining();
-        cfi.addTombstone(name, localDeletionTime, column.timestamp());
-        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater);
+        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata, false, 1);
+        cfi.addTombstone(makeIndexColumnName(rowKey, cell), localDeletionTime, cell.timestamp());
+        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
         if (logger.isDebugEnabled())
             logger.debug("removed index entry for cleaned-up value {}:{}", valueKey, cfi);
     }
 
-    public void insert(ByteBuffer rowKey, Column column)
+    public void insert(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
     {
-        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, column));
-        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata);
-        ByteBuffer name = makeIndexColumnName(rowKey, column);
-        assert name.remaining() > 0 && name.remaining() <= Column.MAX_NAME_LENGTH : name.remaining();
-        if (column instanceof ExpiringColumn)
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, cell));
+        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata, false, 1);
+        CellName name = makeIndexColumnName(rowKey, cell);
+        if (cell instanceof ExpiringCell)
         {
-            ExpiringColumn ec = (ExpiringColumn)column;
-            cfi.addColumn(new ExpiringColumn(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ec.timestamp(), ec.getTimeToLive(), ec.getLocalDeletionTime()));
+            ExpiringCell ec = (ExpiringCell) cell;
+            cfi.addColumn(new BufferExpiringCell(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ec.timestamp(), ec.getTimeToLive(), ec.getLocalDeletionTime()));
         }
         else
         {
-            cfi.addColumn(new Column(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, column.timestamp()));
+            cfi.addColumn(new BufferCell(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, cell.timestamp()));
         }
         if (logger.isDebugEnabled())
-            logger.debug("applying index row {} in {}", indexCfs.metadata.getKeyValidator().getString(valueKey.key), cfi);
+            logger.debug("applying index row {} in {}", indexCfs.metadata.getKeyValidator().getString(valueKey.getKey()), cfi);
 
-        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater);
+        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
     }
 
-    public void update(ByteBuffer rowKey, Column col)
+    public void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup)
     {
-        insert(rowKey, col);
+        // insert the new value before removing the old one, so we never have a period
+        // where the row is invisible to queries on both the old and the new value (the opposite seems preferable); see CASSANDRA-5540
+        insert(rowKey, col, opGroup);
+        if (SecondaryIndexManager.shouldCleanupOldValue(oldCol, col))
+            delete(rowKey, oldCol, opGroup);
     }
 
     public void removeIndex(ByteBuffer columnName)
@@ -125,7 +142,13 @@
 
     public void forceBlockingFlush()
     {
-        indexCfs.forceBlockingFlush();
+        Future<?> wait;
+        // we synchronise on the baseCfs to make sure we are ordered correctly with other flushes to the base CFS
+        synchronized (baseCfs.getDataTracker())
+        {
+            wait = indexCfs.forceFlush();
+        }
+        FBUtilities.waitOnFuture(wait);
     }
 
     public void invalidate()
@@ -148,14 +171,14 @@
         return indexCfs.name;
     }
 
-    public long getLiveSize()
-    {
-        return indexCfs.getMemtableDataSize();
-    }
-
     public void reload()
     {
         indexCfs.metadata.reloadSecondaryIndexMetadata(baseCfs.metadata);
         indexCfs.reload();
     }
+    
+    public long estimateResultRows()
+    {
+        return getIndexCfs().getMeanColumns();
+    } 
 }
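
The update() above deliberately inserts the new index entry before deleting the old one (CASSANDRA-5540), and only deletes when shouldCleanupOldValue() says the old entry genuinely differs. Here is a toy in-memory version of that ordering, with a plain value-to-row-keys map standing in for the index column family; all names are illustrative only.

import java.util.HashSet;
import java.util.Set;
import java.util.TreeMap;

/** Toy sketch (not Cassandra code) of the insert-before-delete ordering in update(). */
public class IndexUpdateSketch
{
    /** Sorted value -> row keys map, standing in for the index column family. */
    static final TreeMap<String, Set<String>> index = new TreeMap<String, Set<String>>();

    static void insert(String rowKey, String value)
    {
        Set<String> keys = index.get(value);
        if (keys == null)
        {
            keys = new HashSet<String>();
            index.put(value, keys);
        }
        keys.add(rowKey);
    }

    static void delete(String rowKey, String value)
    {
        Set<String> keys = index.get(value);
        if (keys != null)
            keys.remove(rowKey);
    }

    /**
     * Insert the new entry first so a concurrent reader never finds the row under
     * neither value; only clean up the old entry when it really differs
     * (a stand-in for shouldCleanupOldValue()).
     */
    static void update(String rowKey, String oldValue, String newValue)
    {
        insert(rowKey, newValue);
        if (!oldValue.equals(newValue))
            delete(rowKey, oldValue);
    }

    public static void main(String[] args)
    {
        insert("k1", "red");
        update("k1", "red", "blue");
        System.out.println(index); // {blue=[k1], red=[]}
    }
}
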
diff --git a/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java
index e77bd0f..79087d2 100644
--- a/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java
+++ b/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java
@@ -19,7 +19,8 @@
 
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
@@ -29,12 +30,12 @@
 public abstract class PerColumnSecondaryIndex extends SecondaryIndex
 {
     /**
-     * Delete a column from the index
+     * Delete a column from the index.
      *
      * @param rowKey the underlying row key which is indexed
      * @param col all the column info
      */
-    public abstract void delete(ByteBuffer rowKey, Column col);
+    public abstract void delete(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup);
 
     /**
      * insert a column to the index
@@ -42,24 +43,24 @@
      * @param rowKey the underlying row key which is indexed
      * @param col all the column info
      */
-    public abstract void insert(ByteBuffer rowKey, Column col);
+    public abstract void insert(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup);
 
     /**
      * update a column from the index
      *
      * @param rowKey the underlying row key which is indexed
+     * @param oldCol the previous column info
      * @param col all the column info
      */
-    public abstract void update(ByteBuffer rowKey, Column col);
+    public abstract void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup);
 
     public String getNameForSystemKeyspace(ByteBuffer column)
     {
         return getIndexName();
     }
 
-    @Override
-    public boolean validate(Column column)
+    public boolean validate(Cell cell)
     {
-        return column.value().remaining() < FBUtilities.MAX_UNSIGNED_SHORT;
+        return cell.value().remaining() < FBUtilities.MAX_UNSIGNED_SHORT;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java
index 0419d83..d73d056 100644
--- a/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java
+++ b/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java
@@ -20,7 +20,8 @@
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -43,9 +44,8 @@
      *
      * @param key
      */
-    public abstract void delete(DecoratedKey key);
+    public abstract void delete(DecoratedKey key, OpOrder.Group opGroup);
 
-    @Override
     public String getNameForSystemKeyspace(ByteBuffer columnName)
     {
         try
@@ -58,8 +58,7 @@
         }
     }
 
-    @Override
-    public boolean validate(Column column)
+    public boolean validate(Cell cell)
     {
         return true;
     }
diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndex.java b/src/java/org/apache/cassandra/db/index/SecondaryIndex.java
index 64266c4..4d50fa6 100644
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndex.java
+++ b/src/java/org/apache/cassandra/db/index/SecondaryIndex.java
@@ -18,8 +18,13 @@
 package org.apache.cassandra.db.index;
 
 import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Future;
+import java.util.concurrent.FutureTask;
 
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -27,22 +32,27 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.index.keys.KeysIndex;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
 import org.apache.cassandra.db.index.composites.CompositesIndex;
+import org.apache.cassandra.db.index.keys.KeysIndex;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.LocalByPartionerType;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.dht.LocalToken;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.ReducingKeyIterator;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 /**
  * Abstract base class for different types of secondary indexes.
@@ -128,13 +138,13 @@
     public void setIndexBuilt()
     {
         for (ColumnDefinition columnDef : columnDefs)
-            SystemKeyspace.setIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name));
+            SystemKeyspace.setIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name.bytes));
     }
 
     public void setIndexRemoved()
     {
         for (ColumnDefinition columnDef : columnDefs)
-            SystemKeyspace.setIndexRemoved(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name));
+            SystemKeyspace.setIndexRemoved(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name.bytes));
     }
 
     /**
@@ -146,16 +156,11 @@
     protected abstract SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns);
 
     /**
-     * Forces this indexes in memory data to disk
+     * Forces this index's in-memory data to disk
      */
     public abstract void forceBlockingFlush();
 
     /**
-     * Get current amount of memory this index is consuming (in bytes)
-     */
-    public abstract long getLiveSize();
-
-    /**
      * Allow access to the underlying column family store if there is one
      * @return the underlying column family store or null
      */
@@ -204,7 +209,7 @@
         {
             SSTableReader.releaseReferences(sstables);
         }
-        logger.info("Index build of " + getIndexName() + " complete");
+        logger.info("Index build of {} complete", getIndexName());
     }
 
 
@@ -220,7 +225,7 @@
         boolean allAreBuilt = true;
         for (ColumnDefinition cdef : columnDefs)
         {
-            if (!SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(cdef.name)))
+            if (!SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(cdef.name.bytes)))
             {
                 allAreBuilt = false;
                 break;
@@ -271,7 +276,7 @@
         Iterator<ColumnDefinition> it = columnDefs.iterator();
         while (it.hasNext())
         {
-            if (it.next().name.equals(name))
+            if (it.next().name.bytes.equals(name))
                 it.remove();
         }
     }
@@ -284,25 +289,14 @@
     public DecoratedKey getIndexKeyFor(ByteBuffer value)
     {
         // FIXME: this implies one column definition per index
-        ByteBuffer name = columnDefs.iterator().next().name;
-        return new DecoratedKey(new LocalToken(baseCfs.metadata.getColumnDefinition(name).getValidator(), value), value);
+        ByteBuffer name = columnDefs.iterator().next().name.bytes;
+        return new BufferDecoratedKey(new LocalToken(baseCfs.metadata.getColumnDefinition(name).type, value), value);
     }
 
     /**
-     * Returns true if the provided column name is indexed by this secondary index.
-     *
-     * The default implement checks whether the name is one the columnDef name,
-     * but this should be overriden but subclass if needed.
+     * Returns true if the provided cell name is indexed by this secondary index.
      */
-    public boolean indexes(ByteBuffer name)
-    {
-        for (ColumnDefinition columnDef : columnDefs)
-        {
-            if (baseCfs.getComparator().compare(columnDef.name, name) == 0)
-                return true;
-        }
-        return false;
-    }
+    public abstract boolean indexes(CellName name);
 
     /**
      * This is the primary way to create a secondary index instance for a CF column.
@@ -350,7 +344,9 @@
         return index;
     }
 
-    public abstract boolean validate(Column column);
+    public abstract boolean validate(Cell cell);
+
+    public abstract long estimateResultRows();
 
     /**
      * Returns the index comparator for index backed by CFS, or null.
@@ -358,12 +354,12 @@
      * Note: it would be cleaner to have this be a member method. However we need this when opening indexes
      * sstables, but by then the CFS won't be fully initiated, so the SecondaryIndex object won't be accessible.
      */
-    public static AbstractType<?> getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cdef)
+    public static CellNameType getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cdef)
     {
         switch (cdef.getIndexType())
         {
             case KEYS:
-                return keyComparator;
+                return new SimpleDenseCellNameType(keyComparator);
             case COMPOSITES:
                 return CompositesIndex.getIndexComparator(baseMetadata, cdef);
             case CUSTOM:
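
The new abstract estimateResultRows() above gives each index a rough cardinality estimate; the SecondaryIndexManager.search() change further down uses it to route a query to the most selective searcher instead of refusing to search across index types. A small sketch of that selection, assuming a hypothetical ToySearcher carrying a precomputed estimate (not Cassandra's SecondaryIndexSearcher API):

import java.util.Arrays;
import java.util.List;

/** Toy sketch (not Cassandra code) of picking the most selective index searcher. */
public class SelectivitySketch
{
    /** Hypothetical searcher: a name and the estimated number of matching rows. */
    static final class ToySearcher
    {
        final String name;
        final long estimatedRows;
        ToySearcher(String name, long estimatedRows) { this.name = name; this.estimatedRows = estimatedRows; }
    }

    /** Return the searcher whose index expects to touch the fewest rows. */
    static ToySearcher mostSelective(List<ToySearcher> searchers)
    {
        ToySearcher best = null;
        long bestEstimate = Long.MAX_VALUE;
        for (ToySearcher s : searchers)
        {
            if (s.estimatedRows <= bestEstimate)
            {
                bestEstimate = s.estimatedRows;
                best = s;
            }
        }
        return best;
    }

    public static void main(String[] args)
    {
        List<ToySearcher> searchers = Arrays.asList(new ToySearcher("by_country", 100000),
                                                    new ToySearcher("by_email", 3));
        System.out.println(mostSelective(searchers).name); // by_email
    }
}
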
diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java
index b75f917..c2d481b 100644
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java
+++ b/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java
@@ -18,23 +18,43 @@
 package org.apache.cassandra.db.index;
 
 import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.Future;
 
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.config.IndexType;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IndexExpression;
+import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.filter.ExtendedFilter;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.io.sstable.ReducingKeyIterator;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 /**
  * Manages all the indexes associated with a given CFS
@@ -46,11 +66,11 @@
 
     public static final Updater nullUpdater = new Updater()
     {
-        public void insert(Column column) { }
+        public void insert(Cell cell) { }
 
-        public void update(Column oldColumn, Column column) { }
+        public void update(Cell oldCell, Cell cell) { }
 
-        public void remove(Column current) { }
+        public void remove(Cell current) { }
 
         public void updateRowLevelIndexes() {}
     };
@@ -60,6 +80,7 @@
      */
     private final ConcurrentNavigableMap<ByteBuffer, SecondaryIndex> indexesByColumn;
 
+
     /**
      * Keeps a single instance of a SecondaryIndex for many columns when the index type
      * has isRowLevelIndex() == true
@@ -68,11 +89,13 @@
      */
     private final ConcurrentMap<Class<? extends SecondaryIndex>, SecondaryIndex> rowLevelIndexMap;
 
+
     /**
      * Keeps all secondary index instances, either per-column or per-row
      */
     private final Set<SecondaryIndex> allIndexes;
 
+
     /**
      * The underlying column family containing the source data for these indexes
      */
@@ -105,7 +128,7 @@
 
         // TODO: allow all ColumnDefinition type
         for (ColumnDefinition cdef : baseCfs.metadata.allColumns())
-            if (cdef.getIndexType() != null && !indexedColumnNames.contains(cdef.name))
+            if (cdef.getIndexType() != null && !indexedColumnNames.contains(cdef.name.bytes))
                 addIndexedColumn(cdef);
 
         for (SecondaryIndex index : allIndexes)
@@ -143,10 +166,10 @@
 
         flushIndexesBlocking();
 
-        logger.info("Index build of " + idxNames + " complete");
+        logger.info("Index build of {} complete", idxNames);
     }
 
-    public boolean indexes(ByteBuffer name, Set<SecondaryIndex> indexes)
+    public boolean indexes(CellName name, Set<SecondaryIndex> indexes)
     {
         boolean matching = false;
         for (SecondaryIndex index : indexes)
@@ -160,7 +183,7 @@
         return matching;
     }
 
-    public Set<SecondaryIndex> indexFor(ByteBuffer name, Set<SecondaryIndex> indexes)
+    public Set<SecondaryIndex> indexFor(CellName name, Set<SecondaryIndex> indexes)
     {
         Set<SecondaryIndex> matching = null;
         for (SecondaryIndex index : indexes)
@@ -175,35 +198,31 @@
         return matching == null ? Collections.<SecondaryIndex>emptySet() : matching;
     }
 
-    public boolean indexes(Column column)
+    public boolean indexes(Cell cell)
     {
-        return indexes(column.name());
+        return indexes(cell.name());
     }
 
-    public boolean indexes(ByteBuffer name)
+    public boolean indexes(CellName name)
     {
         return indexes(name, allIndexes);
     }
 
-    public Set<SecondaryIndex> indexFor(ByteBuffer name)
+    public Set<SecondaryIndex> indexFor(CellName name)
     {
         return indexFor(name, allIndexes);
     }
 
     /**
-     * @return true if the indexes can handle the clause.
+     * @return true if at least one of the indexes can handle the clause.
      */
     public boolean hasIndexFor(List<IndexExpression> clause)
     {
         if (clause == null || clause.isEmpty())
             return false;
 
-        List<SecondaryIndexSearcher> searchers = getIndexSearchersForQuery(clause);
-        if (searchers.isEmpty())
-            return false;
-
-        for (SecondaryIndexSearcher searcher : searchers)
-            if (searcher.isIndexing(clause))
+        for (SecondaryIndexSearcher searcher : getIndexSearchersForQuery(clause))
+            if (searcher.canHandleIndexClause(clause))
                 return true;
 
         return false;
@@ -248,8 +267,7 @@
      */
     public synchronized Future<?> addIndexedColumn(ColumnDefinition cdef)
     {
-
-        if (indexesByColumn.containsKey(cdef.name))
+        if (indexesByColumn.containsKey(cdef.name.bytes))
             return null;
 
         assert cdef.getIndexType() != null;
@@ -295,14 +313,14 @@
         // so we don't have to lock everything while we do the build. it's up to
         // the operator to wait
         // until the index is actually built before using in queries.
-        indexesByColumn.put(cdef.name, index);
+        indexesByColumn.put(cdef.name.bytes, index);
 
         // Add to all indexes set:
         allIndexes.add(index);
 
         // if we're just linking in the index to indexedColumns on an
         // already-built index post-restart, we're done
-        if (index.isIndexBuilt(cdef.name))
+        if (index.isIndexBuilt(cdef.name.bytes))
             return null;
 
         return index.buildIndexAsync();
@@ -332,8 +350,22 @@
      */
     public void flushIndexesBlocking()
     {
+        // dispatch flushes for all CFS-backed indexes
+        List<Future<?>> wait = new ArrayList<>();
+        synchronized (baseCfs.getDataTracker())
+        {
+            for (SecondaryIndex index : allIndexes)
+                if (index.getIndexCfs() != null)
+                    wait.add(index.getIndexCfs().forceFlush());
+        }
+
+        // blockingFlush any non-CFS-backed indexes
         for (SecondaryIndex index : allIndexes)
-            index.forceBlockingFlush();
+            if (index.getIndexCfs() == null)
+                index.forceBlockingFlush();
+
+        // wait for the CFS-backed index flushes to complete
+        FBUtilities.waitOnFutures(wait);
     }
 
     /**
@@ -401,23 +433,12 @@
     }
 
     /**
-     * @return total current ram size of all indexes
-     */
-    public long getTotalLiveSize()
-    {
-        long total = 0;
-        for (SecondaryIndex index : getIndexes())
-            total += index.getLiveSize();
-        return total;
-    }
-
-    /**
      * When building an index against existing data, add the given row to the index
      *
      * @param key the row key
      * @param cf the current rows data
      */
-    public void indexRow(ByteBuffer key, ColumnFamily cf)
+    public void indexRow(ByteBuffer key, ColumnFamily cf, OpOrder.Group opGroup)
     {
         // Update entire row only once per row level index
         Set<Class<? extends SecondaryIndex>> appliedRowLevelIndexes = null;
@@ -434,9 +455,9 @@
             }
             else
             {
-                for (Column column : cf)
-                    if (column.isLive(System.currentTimeMillis()) && index.indexes(column.name()))
-                        ((PerColumnSecondaryIndex) index).insert(key, column);
+                for (Cell cell : cf)
+                    if (cell.isLive() && index.indexes(cell.name()))
+                        ((PerColumnSecondaryIndex) index).insert(key, cell, opGroup);
             }
         }
     }
@@ -447,25 +468,25 @@
      * @param key the row key
      * @param indexedColumnsInRow all column names in row
      */
-    public void deleteFromIndexes(DecoratedKey key, List<Column> indexedColumnsInRow)
+    public void deleteFromIndexes(DecoratedKey key, List<Cell> indexedColumnsInRow, OpOrder.Group opGroup)
     {
         // Update entire row only once per row level index
         Set<Class<? extends SecondaryIndex>> cleanedRowLevelIndexes = null;
 
-        for (Column column : indexedColumnsInRow)
+        for (Cell cell : indexedColumnsInRow)
         {
-            for (SecondaryIndex index : indexFor(column.name()))
+            for (SecondaryIndex index : indexFor(cell.name()))
             {
                 if (index instanceof PerRowSecondaryIndex)
                 {
                     if (cleanedRowLevelIndexes == null)
                         cleanedRowLevelIndexes = new HashSet<>();
                     if (cleanedRowLevelIndexes.add(index.getClass()))
-                        ((PerRowSecondaryIndex) index).delete(key);
+                        ((PerRowSecondaryIndex) index).delete(key, opGroup);
                 }
                 else
                 {
-                    ((PerColumnSecondaryIndex) index).delete(key.key, column);
+                    ((PerColumnSecondaryIndex) index).delete(key.getKey(), cell, opGroup);
                 }
             }
         }
@@ -478,19 +499,19 @@
      * can get updated. Note: only a CF backed by AtomicSortedColumns implements
      * this behaviour fully, other types simply ignore the index updater.
      */
-    public Updater updaterFor(DecoratedKey key, ColumnFamily cf)
+    public Updater updaterFor(DecoratedKey key, ColumnFamily cf, OpOrder.Group opGroup)
     {
         return (indexesByColumn.isEmpty() && rowLevelIndexMap.isEmpty())
                 ? nullUpdater
-                : new StandardUpdater(key, cf);
+                : new StandardUpdater(key, cf, opGroup);
     }
 
     /**
      * Updated closure with only the modified row key.
      */
-    public Updater updaterFor(DecoratedKey key)
+    public Updater gcUpdaterFor(DecoratedKey key)
     {
-        return updaterFor(key, null);
+        return new GCUpdater(key);
     }
 
     /**
@@ -498,14 +519,14 @@
      * @param clause the query clause
      * @return the searchers needed to query the index
      */
-    private List<SecondaryIndexSearcher> getIndexSearchersForQuery(List<IndexExpression> clause)
+    public List<SecondaryIndexSearcher> getIndexSearchersForQuery(List<IndexExpression> clause)
     {
         Map<String, Set<ByteBuffer>> groupByIndexType = new HashMap<>();
 
         //Group columns by type
         for (IndexExpression ix : clause)
         {
-            SecondaryIndex index = getIndexForColumn(ix.column_name);
+            SecondaryIndex index = getIndexForColumn(ix.column);
 
             if (index == null)
                 continue;
@@ -518,7 +539,7 @@
                 groupByIndexType.put(index.indexTypeForGrouping(), columns);
             }
 
-            columns.add(ix.column_name);
+            columns.add(ix.column);
         }
 
         List<SecondaryIndexSearcher> indexSearchers = new ArrayList<>(groupByIndexType.size());
@@ -531,8 +552,53 @@
     }
 
     /**
+     * Validates the expressions in the provided clause, grouped by index type. It will throw an
+     * {@link org.apache.cassandra.exceptions.InvalidRequestException} if any of the expressions is not valid for its index implementation.
+     * @param clause the query clause
+     * @throws org.apache.cassandra.exceptions.InvalidRequestException in case of validation errors
+     */
+    public void validateIndexSearchersForQuery(List<IndexExpression> clause) throws InvalidRequestException
+    {
+        // Group by index type
+        Map<String, Set<IndexExpression>> expressionsByIndexType = new HashMap<>();
+        Map<String, Set<ByteBuffer>> columnsByIndexType = new HashMap<>();
+        for (IndexExpression indexExpression : clause)
+        {
+            SecondaryIndex index = getIndexForColumn(indexExpression.column);
+
+            if (index == null)
+                continue;
+
+            String canonicalIndexName = index.getClass().getCanonicalName();
+            Set<IndexExpression> expressions = expressionsByIndexType.get(canonicalIndexName);
+            Set<ByteBuffer> columns = columnsByIndexType.get(canonicalIndexName);
+            if (expressions == null)
+            {
+                expressions = new HashSet<>();
+                columns = new HashSet<>();
+                expressionsByIndexType.put(canonicalIndexName, expressions);
+                columnsByIndexType.put(canonicalIndexName, columns);
+            }
+
+            expressions.add(indexExpression);
+            columns.add(indexExpression.column);
+        }
+
+        // Validate
+        for (Map.Entry<String, Set<IndexExpression>> expressions : expressionsByIndexType.entrySet())
+        {
+            Set<ByteBuffer> columns = columnsByIndexType.get(expressions.getKey());
+            SecondaryIndex secondaryIndex = getIndexForColumn(columns.iterator().next());
+            SecondaryIndexSearcher searcher = secondaryIndex.createSecondaryIndexSearcher(columns);
+            for (IndexExpression expression : expressions.getValue())
+            {
+                searcher.validate(expression);
+            }
+        }
+    }
+
+    /**
      * Performs a search across a number of column indexes
-     * TODO: add support for querying across index types
      *
      * @param filter the column range to restrict to
      * @return found indexed rows
@@ -544,11 +610,20 @@
         if (indexSearchers.isEmpty())
             return Collections.emptyList();
 
-        //We currently don't support searching across multiple index types
-        if (indexSearchers.size() > 1)
-            throw new RuntimeException("Unable to search across multiple secondary index types");
+        SecondaryIndexSearcher mostSelective = null;
+        long bestEstimate = Long.MAX_VALUE;
+        for (SecondaryIndexSearcher searcher : indexSearchers)
+        {
+            SecondaryIndex highestSelectivityIndex = searcher.highestSelectivityIndex(filter.getClause());
+            long estimate = highestSelectivityIndex.estimateResultRows();
+            if (estimate <= bestEstimate)
+            {
+                bestEstimate = estimate;
+                mostSelective = searcher;
+            }
+        }
 
-        return indexSearchers.get(0).search(filter);
+        return mostSelective.search(filter);
     }
 
     public Set<SecondaryIndex> getIndexesByNames(Set<String> idxNames)
@@ -572,103 +647,155 @@
             index.setIndexRemoved();
     }
 
-    public boolean validate(Column column)
+    public boolean validate(Cell cell)
     {
-        SecondaryIndex index = getIndexForColumn(column.name());
-        return index == null || index.validate(column);
+        for (SecondaryIndex index : indexFor(cell.name()))
+        {
+            if (!index.validate(cell))
+                return false;
+        }
+        return true;
+    }
+
+    static boolean shouldCleanupOldValue(Cell oldCell, Cell newCell)
+    {
+        // If any one of name/value/timestamp are different, then we
+        // should delete from the index. If not, then we can infer that
+        // at least one of the cells is an ExpiringColumn and that the
+        // difference is in the expiry time. In this case, we don't want to
+        // delete the old value from the index as the tombstone we insert
+        // will just hide the inserted value.
+        // Completely identical cells (including expiring columns with
+        // identical ttl & localExpirationTime) will not get this far due
+        // to the oldCell.equals(cell) check in StandardUpdater.update
+        return !oldCell.name().equals(newCell.name())
+            || !oldCell.value().equals(newCell.value())
+            || oldCell.timestamp() != newCell.timestamp();
     }
 
     public static interface Updater
     {
         /** called when constructing the index against pre-existing data */
-        public void insert(Column column);
+        public void insert(Cell cell);
 
         /** called when updating the index from a memtable */
-        public void update(Column oldColumn, Column column);
+        public void update(Cell oldCell, Cell cell);
 
         /** called when lazy-updating the index during compaction (CASSANDRA-2897) */
-        public void remove(Column current);
+        public void remove(Cell current);
 
         /** called after memtable updates are complete (CASSANDRA-5397) */
         public void updateRowLevelIndexes();
     }
 
-    private class StandardUpdater implements Updater
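+    /**
+     * Updater that only supports removal of index entries: insert and update throw
+     * {@link UnsupportedOperationException}.
+     */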
+    private final class GCUpdater implements Updater
     {
         private final DecoratedKey key;
-        private final ColumnFamily cf;
 
-        public StandardUpdater(DecoratedKey key, ColumnFamily cf)
+        public GCUpdater(DecoratedKey key)
         {
             this.key = key;
-            this.cf = cf;
         }
 
-        public void insert(Column column)
+        public void insert(Cell cell)
         {
-            if (column.isMarkedForDelete(System.currentTimeMillis()))
-                return;
-
-            for (SecondaryIndex index : indexFor(column.name()))
-                if (index instanceof PerColumnSecondaryIndex)
-                    ((PerColumnSecondaryIndex) index).insert(key.key, column);
+            throw new UnsupportedOperationException();
         }
 
-        public void update(Column oldColumn, Column column)
+        public void update(Cell oldCell, Cell newCell)
         {
-            if (oldColumn.equals(column))
+            throw new UnsupportedOperationException();
+        }
+
+        public void remove(Cell cell)
+        {
+            if (!cell.isLive())
                 return;
-            
-            for (SecondaryIndex index : indexFor(column.name()))
+
+            for (SecondaryIndex index : indexFor(cell.name()))
             {
                 if (index instanceof PerColumnSecondaryIndex)
                 {
-                    // insert the new value before removing the old one, so we never have a period
-                    // where the row is invisible to both queries (the opposite seems preferable); see CASSANDRA-5540
-                    if (!column.isMarkedForDelete(System.currentTimeMillis()))
-                        ((PerColumnSecondaryIndex) index).insert(key.key, column);
-
-                    // Usually we want to delete the old value from the index, except when
-                    // name/value/timestamp are all equal, but the columns themselves
-                    // are not (as is the case when overwriting expiring columns with
-                    // identical values and ttl) Then, we don't want to delete as the
-                    // tombstone will hide the new value we just inserted; see CASSANDRA-7268
-                    if (shouldCleanupOldValue(oldColumn, column))
-                        ((PerColumnSecondaryIndex) index).delete(key.key, oldColumn);
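+                    // There is no enclosing write operation here, so open a short-lived
+                    // write op group just for this index deletion.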
+                    try (OpOrder.Group opGroup = baseCfs.keyspace.writeOrder.start())
+                    {
+                        ((PerColumnSecondaryIndex) index).delete(key.getKey(), cell, opGroup);
+                    }
                 }
             }
         }
 
-        public void remove(Column column)
-        {
-            if (column.isMarkedForDelete(System.currentTimeMillis()))
-                return;
-
-            for (SecondaryIndex index : indexFor(column.name()))
-                if (index instanceof PerColumnSecondaryIndex)
-                   ((PerColumnSecondaryIndex) index).delete(key.key, column);
-        }
-
         public void updateRowLevelIndexes()
         {
             for (SecondaryIndex index : rowLevelIndexMap.values())
-                ((PerRowSecondaryIndex) index).index(key.key, cf);
+                ((PerRowSecondaryIndex) index).index(key.getKey(), null);
+        }
+    }
+
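+    /**
+     * Updater applied on regular writes: keeps the per-column and row-level indexes in sync
+     * with the base table under the supplied OpOrder.Group.
+     */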
+    private final class StandardUpdater implements Updater
+    {
+        private final DecoratedKey key;
+        private final ColumnFamily cf;
+        private final OpOrder.Group opGroup;
+
+        public StandardUpdater(DecoratedKey key, ColumnFamily cf, OpOrder.Group opGroup)
+        {
+            this.key = key;
+            this.cf = cf;
+            this.opGroup = opGroup;
         }
 
-        private boolean shouldCleanupOldValue(Column oldColumn, Column newColumn)
+        public void insert(Cell cell)
         {
-            // If any one of name/value/timestamp are different, then we
-            // should delete from the index. If not, then we can infer that
-            // at least one of the columns is an ExpiringColumn and that the
-            // difference is in the expiry time. In this case, we don't want to
-            // delete the old value from the index as the tombstone we insert
-            // will just hide the inserted value.
-            // Completely identical columns (including expiring columns with
-            // identical ttl & localExpirationTime) will not get this far due
-            // to the oldColumn.equals(newColumn) in StandardUpdater.update
-            return !oldColumn.name().equals(newColumn.name())
-                || !oldColumn.value().equals(newColumn.value())
-                || oldColumn.timestamp() != newColumn.timestamp();
+            if (!cell.isLive())
+                return;
+
+            for (SecondaryIndex index : indexFor(cell.name()))
+                if (index instanceof PerColumnSecondaryIndex)
+                    ((PerColumnSecondaryIndex) index).insert(key.getKey(), cell, opGroup);
         }
+
+        public void update(Cell oldCell, Cell cell)
+        {
+            if (oldCell.equals(cell))
+                return;
+            
+            for (SecondaryIndex index : indexFor(cell.name()))
+            {
+                if (index instanceof PerColumnSecondaryIndex)
+                {
+                    if (cell.isLive())
+                    {
+                        ((PerColumnSecondaryIndex) index).update(key.getKey(), oldCell, cell, opGroup);
+                    }
+                    else
+                    {
+                        // Usually we want to delete the old value from the index, except when
+                        // name/value/timestamp are all equal, but the cells themselves
+                        // are not (as is the case when overwriting expiring columns with
+                        // identical values and ttl). In that case we don't want to delete,
+                        // as the tombstone would hide the new value we just inserted; see CASSANDRA-7268
+                        if (shouldCleanupOldValue(oldCell, cell))
+                            ((PerColumnSecondaryIndex) index).delete(key.getKey(), oldCell, opGroup);
+                    }
+                }
+            }
+        }
+
+        public void remove(Cell cell)
+        {
+            if (!cell.isLive())
+                return;
+
+            for (SecondaryIndex index : indexFor(cell.name()))
+                if (index instanceof PerColumnSecondaryIndex)
+                   ((PerColumnSecondaryIndex) index).delete(key.getKey(), cell, opGroup);
+        }
+
+        public void updateRowLevelIndexes()
+        {
+            for (SecondaryIndex index : rowLevelIndexMap.values())
+                ((PerRowSecondaryIndex) index).index(key.getKey(), cf);
+        }
+
     }
 }
diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java b/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java
index e93efd1..1239c29 100644
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java
+++ b/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java
@@ -22,8 +22,7 @@
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -40,14 +39,40 @@
         this.baseCfs = indexManager.baseCfs;
     }
 
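+    /**
+     * @return the index of the most selective predicate in the clause, or null if the clause
+     * contains no predicate this searcher can use.
+     */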
+    public SecondaryIndex highestSelectivityIndex(List<IndexExpression> clause)
+    {
+        IndexExpression expr = highestSelectivityPredicate(clause);
+        return expr == null ? null : indexManager.getIndexForColumn(expr.column);
+    }
+
     public abstract List<Row> search(ExtendedFilter filter);
 
     /**
-     * @return true this index is able to handle given clauses.
+     * @return true if this index is able to handle the given index expressions.
      */
-    public boolean isIndexing(List<IndexExpression> clause)
+    public boolean canHandleIndexClause(List<IndexExpression> clause)
     {
-        return highestSelectivityPredicate(clause) != null;
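+        // A searcher can handle the clause if at least one expression targets one of its columns
+        // with an operator that allows index queries and the column has a built index CF.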
+        for (IndexExpression expression : clause)
+        {
+            if (!columns.contains(expression.column) || !expression.operator.allowsIndexQuery())
+                continue;
+
+            SecondaryIndex index = indexManager.getIndexForColumn(expression.column);
+            if (index != null && index.getIndexCfs() != null)
+                return true;
+        }
+        return false;
+    }
+    
+    /**
+     * Validates the specified {@link IndexExpression}. It will throw an {@link org.apache.cassandra.exceptions.InvalidRequestException}
+     * if the provided expression is not valid for the index implementation.
+     *
+     * @param indexExpression An {@link IndexExpression} to be validated
+     * @throws org.apache.cassandra.exceptions.InvalidRequestException in case of validation errors
+     */
+    public void validate(IndexExpression indexExpression) throws InvalidRequestException
+    {
     }
 
     protected IndexExpression highestSelectivityPredicate(List<IndexExpression> clause)
@@ -59,11 +84,11 @@
         for (IndexExpression expression : clause)
         {
             // skip columns belonging to a different index type
-            if (!columns.contains(expression.column_name))
+            if (!columns.contains(expression.column))
                 continue;
 
-            SecondaryIndex index = indexManager.getIndexForColumn(expression.column_name);
-            if (index == null || index.getIndexCfs() == null || expression.op != IndexOperator.EQ)
+            SecondaryIndex index = indexManager.getIndexForColumn(expression.column);
+            if (index == null || index.getIndexCfs() == null || !expression.operator.allowsIndexQuery())
                 continue;
             int columns = index.getIndexCfs().getMeanColumns();
             candidates.put(index, columns);
@@ -78,7 +103,7 @@
             Tracing.trace("No applicable indexes found");
         else
             Tracing.trace("Candidate index mean cardinalities are {}. Scanning with {}.",
-                          FBUtilities.toString(candidates), indexManager.getIndexForColumn(best.column_name).getIndexName());
+                          FBUtilities.toString(candidates), indexManager.getIndexForColumn(best.column).getIndexName());
 
         return best;
     }
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java
index 0720e83..f69f716 100644
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java
@@ -22,14 +22,18 @@
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.index.AbstractSimplePerColumnSecondaryIndex;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 
 /**
@@ -37,9 +41,9 @@
  */
 public abstract class CompositesIndex extends AbstractSimplePerColumnSecondaryIndex
 {
-    private volatile CompositeType indexComparator;
+    private volatile CellNameType indexComparator;
 
-    protected CompositeType getIndexComparator()
+    protected CellNameType getIndexComparator()
     {
         // Yes, this is racy, but doing this more than once is not a big deal, we just want to avoid doing it every time
         // More seriously, we should fix that whole SecondaryIndex API so this can be a final and avoid all that non-sense.
@@ -53,9 +57,24 @@
 
     public static CompositesIndex create(ColumnDefinition cfDef)
     {
-        switch (cfDef.type)
+        if (cfDef.type.isCollection())
         {
-            case CLUSTERING_KEY:
+            switch (((CollectionType)cfDef.type).kind)
+            {
+                case LIST:
+                    return new CompositesIndexOnCollectionValue();
+                case SET:
+                    return new CompositesIndexOnCollectionKey();
+                case MAP:
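+                    // the "index_keys" option means the map keys are indexed; otherwise the map values are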
+                    return cfDef.getIndexOptions().containsKey("index_keys")
+                         ? new CompositesIndexOnCollectionKey()
+                         : new CompositesIndexOnCollectionValue();
+            }
+        }
+
+        switch (cfDef.kind)
+        {
+            case CLUSTERING_COLUMN:
                 return new CompositesIndexOnClusteringKey();
             case REGULAR:
                 return new CompositesIndexOnRegular();
@@ -68,11 +87,26 @@
     }
 
     // Check SecondaryIndex.getIndexComparator if you want to know why this is static
-    public static CompositeType getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cfDef)
+    public static CellNameType getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cfDef)
     {
-        switch (cfDef.type)
+        if (cfDef.type.isCollection())
         {
-            case CLUSTERING_KEY:
+            switch (((CollectionType)cfDef.type).kind)
+            {
+                case LIST:
+                    return CompositesIndexOnCollectionValue.buildIndexComparator(baseMetadata, cfDef);
+                case SET:
+                    return CompositesIndexOnCollectionKey.buildIndexComparator(baseMetadata, cfDef);
+                case MAP:
+                    return cfDef.getIndexOptions().containsKey("index_keys")
+                         ? CompositesIndexOnCollectionKey.buildIndexComparator(baseMetadata, cfDef)
+                         : CompositesIndexOnCollectionValue.buildIndexComparator(baseMetadata, cfDef);
+            }
+        }
+
+        switch (cfDef.kind)
+        {
+            case CLUSTERING_COLUMN:
                 return CompositesIndexOnClusteringKey.buildIndexComparator(baseMetadata, cfDef);
             case REGULAR:
                 return CompositesIndexOnRegular.buildIndexComparator(baseMetadata, cfDef);
@@ -84,39 +118,32 @@
         throw new AssertionError();
     }
 
-    protected ByteBuffer makeIndexColumnName(ByteBuffer rowKey, Column column)
+    protected CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell)
     {
-        return makeIndexColumnNameBuilder(rowKey, column.name()).build();
+        return getIndexComparator().create(makeIndexColumnPrefix(rowKey, cell.name()), null);
     }
 
-    protected abstract ColumnNameBuilder makeIndexColumnNameBuilder(ByteBuffer rowKey, ByteBuffer columnName);
+    protected abstract Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName);
 
-    public abstract IndexedEntry decodeEntry(DecoratedKey indexedValue, Column indexEntry);
+    public abstract IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry);
 
     public abstract boolean isStale(IndexedEntry entry, ColumnFamily data, long now);
 
-    public void delete(IndexedEntry entry)
+    public void delete(IndexedEntry entry, OpOrder.Group opGroup)
     {
         int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
         ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata);
         cfi.addTombstone(entry.indexEntry, localDeletionTime, entry.timestamp);
-        indexCfs.apply(entry.indexValue, cfi, SecondaryIndexManager.nullUpdater);
+        indexCfs.apply(entry.indexValue, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
         if (logger.isDebugEnabled())
             logger.debug("removed index entry for cleaned-up value {}:{}", entry.indexValue, cfi);
-
     }
 
-    protected AbstractType getExpressionComparator()
+    protected AbstractType<?> getExpressionComparator()
     {
         return baseCfs.metadata.getColumnDefinitionComparator(columnDef);
     }
 
-    protected CompositeType getBaseComparator()
-    {
-        assert baseCfs.getComparator() instanceof CompositeType;
-        return (CompositeType)baseCfs.getComparator();
-    }
-
     public SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
     {
         return new CompositesSearcher(baseCfs.indexManager, columns);
@@ -127,10 +154,15 @@
         ColumnDefinition columnDef = columnDefs.iterator().next();
         Map<String, String> options = new HashMap<String, String>(columnDef.getIndexOptions());
 
-        // We take no options though we used to have one called "prefix_size",
-        // so skip it silently for backward compatibility sake.
+        // We used to have an option called "prefix_size", so skip it silently for backward compatibility's sake.
         options.remove("prefix_size");
 
+        if (columnDef.type.isCollection())
+        {
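+            // collection indexes use index_keys/index_values to select what is indexed,
+            // so don't treat them as unknown options below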
+            options.remove("index_values");
+            options.remove("index_keys");
+        }
+
         if (!options.isEmpty())
             throw new ConfigurationException("Unknown options provided for COMPOSITES index: " + options.keySet());
     }
@@ -138,29 +170,31 @@
     public static class IndexedEntry
     {
         public final DecoratedKey indexValue;
-        public final ByteBuffer indexEntry;
+        public final CellName indexEntry;
         public final long timestamp;
 
         public final ByteBuffer indexedKey;
-        public final ColumnNameBuilder indexedEntryNameBuilder;
+        public final Composite indexedEntryPrefix;
+        public final ByteBuffer indexedEntryCollectionKey; // may be null
 
-        public IndexedEntry(DecoratedKey indexValue, ByteBuffer indexEntry, long timestamp, ByteBuffer indexedKey, ColumnNameBuilder indexedEntryNameBuilder)
+        public IndexedEntry(DecoratedKey indexValue, CellName indexEntry, long timestamp, ByteBuffer indexedKey, Composite indexedEntryPrefix)
+        {
+            this(indexValue, indexEntry, timestamp, indexedKey, indexedEntryPrefix, null);
+        }
+
+        public IndexedEntry(DecoratedKey indexValue,
+                            CellName indexEntry,
+                            long timestamp,
+                            ByteBuffer indexedKey,
+                            Composite indexedEntryPrefix,
+                            ByteBuffer indexedEntryCollectionKey)
         {
             this.indexValue = indexValue;
             this.indexEntry = indexEntry;
             this.timestamp = timestamp;
             this.indexedKey = indexedKey;
-            this.indexedEntryNameBuilder = indexedEntryNameBuilder;
-        }
-
-        public ByteBuffer indexedEntryStart()
-        {
-            return indexedEntryNameBuilder.build();
-        }
-
-        public ByteBuffer indexedEntryEnd()
-        {
-            return indexedEntryNameBuilder.buildAsEndOfRange();
+            this.indexedEntryPrefix = indexedEntryPrefix;
+            this.indexedEntryCollectionKey = indexedEntryCollectionKey;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java
index 954f380..d967971 100644
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java
@@ -23,13 +23,13 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.marshal.*;
 
 /**
- * Index on a CLUSTERING_KEY column definition.
+ * Index on a CLUSTERING_COLUMN column definition.
  *
  * A cell indexed by this index will have the general form:
  *   ck_0 ... ck_n c_name : v
@@ -47,62 +47,55 @@
  */
 public class CompositesIndexOnClusteringKey extends CompositesIndex
 {
-    public static CompositeType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
+    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
     {
         // Index cell names are rk ck_0 ... ck_{i-1} ck_{i+1} ck_n, so n
         // components total (where n is the number of clustering keys)
-        int ckCount = baseMetadata.clusteringKeyColumns().size();
+        int ckCount = baseMetadata.clusteringColumns().size();
         List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(ckCount);
-        List<AbstractType<?>> ckTypes = baseMetadata.comparator.getComponents();
         types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < columnDef.componentIndex; i++)
-            types.add(ckTypes.get(i));
-        for (int i = columnDef.componentIndex + 1; i < ckCount; i++)
-            types.add(ckTypes.get(i));
-        return CompositeType.getInstance(types);
+        for (int i = 0; i < columnDef.position(); i++)
+            types.add(baseMetadata.clusteringColumns().get(i).type);
+        for (int i = columnDef.position() + 1; i < ckCount; i++)
+            types.add(baseMetadata.clusteringColumns().get(i).type);
+        return new CompoundDenseCellNameType(types);
     }
 
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Column column)
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
     {
-        CompositeType baseComparator = (CompositeType)baseCfs.getComparator();
-        ByteBuffer[] components = baseComparator.split(column.name());
-        return components[columnDef.componentIndex];
+        return cell.name().get(columnDef.position());
     }
 
-    protected ColumnNameBuilder makeIndexColumnNameBuilder(ByteBuffer rowKey, ByteBuffer columnName)
+    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName)
     {
-        int ckCount = baseCfs.metadata.clusteringKeyColumns().size();
-        CompositeType baseComparator = (CompositeType)baseCfs.getComparator();
-        ByteBuffer[] components = baseComparator.split(columnName);
-        CompositeType.Builder builder = getIndexComparator().builder();
+        int count = Math.min(baseCfs.metadata.clusteringColumns().size(), columnName.size());
+        CBuilder builder = getIndexComparator().prefixBuilder();
         builder.add(rowKey);
-
-        for (int i = 0; i < Math.min(components.length, columnDef.componentIndex); i++)
-            builder.add(components[i]);
-        for (int i = columnDef.componentIndex + 1; i < Math.min(components.length, ckCount); i++)
-            builder.add(components[i]);
-        return builder;
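+        // copy the clustering components around the indexed position; the indexed component itself
+        // becomes the index row key, so it is skipped here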
+        for (int i = 0; i < Math.min(columnDef.position(), count); i++)
+            builder.add(columnName.get(i));
+        for (int i = columnDef.position() + 1; i < count; i++)
+            builder.add(columnName.get(i));
+        return builder.build();
     }
 
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Column indexEntry)
+    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
     {
-        int ckCount = baseCfs.metadata.clusteringKeyColumns().size();
-        ByteBuffer[] components = getIndexComparator().split(indexEntry.name());
+        int ckCount = baseCfs.metadata.clusteringColumns().size();
 
-        ColumnNameBuilder builder = getBaseComparator().builder();
-        for (int i = 0; i < columnDef.componentIndex; i++)
-            builder.add(components[i + 1]);
+        CBuilder builder = baseCfs.getComparator().builder();
+        for (int i = 0; i < columnDef.position(); i++)
+            builder.add(indexEntry.name().get(i + 1));
 
-        builder.add(indexedValue.key);
+        builder.add(indexedValue.getKey());
 
-        for (int i = columnDef.componentIndex + 1; i < ckCount; i++)
-            builder.add(components[i]);
+        for (int i = columnDef.position() + 1; i < ckCount; i++)
+            builder.add(indexEntry.name().get(i));
 
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), components[0], builder);
+        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
     }
 
     @Override
-    public boolean indexes(ByteBuffer name)
+    public boolean indexes(CellName name)
     {
         // For now, assume this is only used in CQL3 when we know name has enough component.
         return true;
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java
new file mode 100644
index 0000000..c252546
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.index.composites;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CBuilder;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
+import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.marshal.*;
+
+/**
+ * Index on the collection element of the cell name of a collection.
+ *
+ * A cell indexed by this index will have the general form:
+ *   ck_0 ... ck_n c_name [col_elt] : v
+ * where ck_i are the clustering keys, c_name the CQL3 column name, col_elt the
+ * collection element that we want to index (which may or may not be there depending
+ * on whether c_name is the collection we're indexing) and v the cell value.
+ *
+ * Such a cell is indexed if c_name is the indexed collection (in which case we are guaranteed to have
+ * col_elt). The index entry will be:
+ *   - row key will be col_elt value (getIndexedValue()).
+ *   - cell name will be 'rk ck_0 ... ck_n' where rk is the row key of the initial cell.
+ */
+public class CompositesIndexOnCollectionKey extends CompositesIndex
+{
+    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
+    {
+        int count = 1 + baseMetadata.clusteringColumns().size(); // row key + clustering prefix
+        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(count);
+        types.add(SecondaryIndex.keyComparator);
+        for (int i = 0; i < count - 1; i++)
+            types.add(baseMetadata.comparator.subtype(i));
+        return new CompoundDenseCellNameType(types);
+    }
+
+    @Override
+    protected AbstractType<?> getIndexKeyComparator()
+    {
+        return ((CollectionType)columnDef.type).nameComparator();
+    }
+
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
+    {
+        return cell.name().get(columnDef.position() + 1);
+    }
+
+    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
+    {
+        int count = 1 + baseCfs.metadata.clusteringColumns().size();
+        CBuilder builder = getIndexComparator().builder();
+        builder.add(rowKey);
+        for (int i = 0; i < Math.min(cellName.size(), count - 1); i++)
+            builder.add(cellName.get(i));
+        return builder.build();
+    }
+
+    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
+    {
+        int count = 1 + baseCfs.metadata.clusteringColumns().size();
+        CBuilder builder = baseCfs.getComparator().builder();
+        for (int i = 0; i < count - 1; i++)
+            builder.add(indexEntry.name().get(i + 1));
+        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
+    }
+
+    @Override
+    public boolean indexes(CellName name)
+    {
+        // We only index cells whose CQL3 column name is that of the collection being indexed
+        AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
+        return name.size() > columnDef.position()
+            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
+    }
+
+    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
+    {
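+        // rebuild the base-table cell name for the indexed element; the entry is stale if that
+        // cell is gone or no longer live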
+        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef, entry.indexValue.getKey());
+        Cell cell = data.getColumn(name);
+        return cell == null || !cell.isLive(now);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java
new file mode 100644
index 0000000..7a8c552
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.index.composites;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CBuilder;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
+import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.marshal.*;
+
+/**
+ * Index the value of a collection cell.
+ *
+ * This is a lot like an index on REGULAR, except that we also need to make
+ * the collection key part of the index entry so that:
+ *   1) we don't have to scan the whole collection at query time to know whether the
+ *   entry is stale and whether it still satisfies the query.
+ *   2) if a collection contains the same value multiple times, we need one entry per
+ *   occurrence so that if we delete only one of them, we only delete the entry
+ *   corresponding to that occurrence.
+ */
+public class CompositesIndexOnCollectionValue extends CompositesIndex
+{
+    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
+    {
+        int prefixSize = columnDef.position();
+        List<AbstractType<?>> types = new ArrayList<>(prefixSize + 2);
+        types.add(SecondaryIndex.keyComparator);
+        for (int i = 0; i < prefixSize; i++)
+            types.add(baseMetadata.comparator.subtype(i));
+        types.add(((CollectionType)columnDef.type).nameComparator()); // collection key
+        return new CompoundDenseCellNameType(types);
+    }
+
+    @Override
+    protected AbstractType<?> getIndexKeyComparator()
+    {
+        return ((CollectionType)columnDef.type).valueComparator();
+    }
+
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
+    {
+        return cell.value();
+    }
+
+    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
+    {
+        CBuilder builder = getIndexComparator().prefixBuilder();
+        builder.add(rowKey);
+        for (int i = 0; i < Math.min(columnDef.position(), cellName.size()); i++)
+            builder.add(cellName.get(i));
+
+        // When indexing, cellName is a full name including the collection
+        // key. When searching, restricted clustering columns are included
+        // but the collection key is not. In this case, don't try to add an
+        // element to the builder for it, as it will just end up null and
+        // error out when retrieving cells from the index cf (CASSANDRA-7525)
+        if (cellName.size() >= columnDef.position() + 1)
+            builder.add(cellName.get(columnDef.position() + 1));
+        return builder.build();
+    }
+
+    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
+    {
+        int prefixSize = columnDef.position();
+        CellName name = indexEntry.name();
+        CBuilder builder = baseCfs.getComparator().builder();
+        for (int i = 0; i < prefixSize; i++)
+            builder.add(name.get(i + 1));
+        return new IndexedEntry(indexedValue, name, indexEntry.timestamp(), name.get(0), builder.build(), name.get(prefixSize + 1));
+    }
+
+    @Override
+    public boolean indexes(CellName name)
+    {
+        AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
+        return name.size() > columnDef.position()
+            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
+    }
+
+    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
+    {
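+        // the entry is stale if the base cell is gone, dead, or its value no longer matches
+        // the indexed value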
+        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef, entry.indexedEntryCollectionKey);
+        Cell cell = data.getColumn(name);
+        return cell == null || !cell.isLive(now) || ((CollectionType) columnDef.type).valueComparator().compare(entry.indexValue.getKey(), cell.value()) != 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java
index 4e2c580..b791545 100644
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java
@@ -23,8 +23,8 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.marshal.*;
 
@@ -48,48 +48,45 @@
  */
 public class CompositesIndexOnPartitionKey extends CompositesIndex
 {
-    public static CompositeType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
+    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
     {
-        int ckCount = baseMetadata.clusteringKeyColumns().size();
+        int ckCount = baseMetadata.clusteringColumns().size();
         List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(ckCount + 1);
         types.add(SecondaryIndex.keyComparator);
-        types.addAll(baseMetadata.comparator.getComponents());
-        return CompositeType.getInstance(types);
+        for (int i = 0; i < ckCount; i++)
+            types.add(baseMetadata.comparator.subtype(i));
+        return new CompoundDenseCellNameType(types);
     }
 
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Column column)
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
     {
         CompositeType keyComparator = (CompositeType)baseCfs.metadata.getKeyValidator();
         ByteBuffer[] components = keyComparator.split(rowKey);
-        return components[columnDef.componentIndex];
+        return components[columnDef.position()];
     }
 
-    protected ColumnNameBuilder makeIndexColumnNameBuilder(ByteBuffer rowKey, ByteBuffer columnName)
+    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName)
     {
-        int ckCount = baseCfs.metadata.clusteringKeyColumns().size();
-        CompositeType baseComparator = (CompositeType)baseCfs.getComparator();
-        ByteBuffer[] components = baseComparator.split(columnName);
-        CompositeType.Builder builder = getIndexComparator().builder();
+        int count = Math.min(baseCfs.metadata.clusteringColumns().size(), columnName.size());
+        CBuilder builder = getIndexComparator().prefixBuilder();
         builder.add(rowKey);
-        for (int i = 0; i < ckCount; i++)
-            builder.add(components[i]);
-        return builder;
+        for (int i = 0; i < count; i++)
+            builder.add(columnName.get(i));
+        return builder.build();
     }
 
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Column indexEntry)
+    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
     {
-        int ckCount = baseCfs.metadata.clusteringKeyColumns().size();
-        ByteBuffer[] components = getIndexComparator().split(indexEntry.name());
-
-        ColumnNameBuilder builder = getBaseComparator().builder();
+        int ckCount = baseCfs.metadata.clusteringColumns().size();
+        CBuilder builder = baseCfs.getComparator().builder();
         for (int i = 0; i < ckCount; i++)
-            builder.add(components[i + 1]);
+            builder.add(indexEntry.name().get(i + 1));
 
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), components[0], builder);
+        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
     }
 
     @Override
-    public boolean indexes(ByteBuffer name)
+    public boolean indexes(CellName name)
     {
         // Since a partition key is always full, we always index it
         return true;
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java
index 7159c23..b9dc07f 100644
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java
@@ -23,8 +23,8 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.marshal.*;
 
@@ -47,58 +47,50 @@
  */
 public class CompositesIndexOnRegular extends CompositesIndex
 {
-    public static CompositeType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
+    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
     {
-        int prefixSize = columnDef.componentIndex;
+        int prefixSize = columnDef.position();
         List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(prefixSize + 1);
         types.add(SecondaryIndex.keyComparator);
         for (int i = 0; i < prefixSize; i++)
-            types.add(((CompositeType)baseMetadata.comparator).types.get(i));
-        return CompositeType.getInstance(types);
+            types.add(baseMetadata.comparator.subtype(i));
+        return new CompoundDenseCellNameType(types);
     }
 
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Column column)
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
     {
-        return column.value();
+        return cell.value();
     }
 
-    protected ColumnNameBuilder makeIndexColumnNameBuilder(ByteBuffer rowKey, ByteBuffer columnName)
+    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
     {
-        CompositeType baseComparator = (CompositeType)baseCfs.getComparator();
-        ByteBuffer[] components = baseComparator.split(columnName);
-        CompositeType.Builder builder = getIndexComparator().builder();
+        CBuilder builder = getIndexComparator().prefixBuilder();
         builder.add(rowKey);
-        for (int i = 0; i < Math.min(columnDef.componentIndex, components.length); i++)
-            builder.add(components[i]);
-        return builder;
+        for (int i = 0; i < Math.min(columnDef.position(), cellName.size()); i++)
+            builder.add(cellName.get(i));
+        return builder.build();
     }
 
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Column indexEntry)
+    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
     {
-        ByteBuffer[] components = getIndexComparator().split(indexEntry.name());
-        CompositeType.Builder builder = getBaseComparator().builder();
-        for (int i = 0; i < columnDef.componentIndex; i++)
-            builder.add(components[i + 1]);
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), components[0], builder);
+        CBuilder builder = baseCfs.getComparator().builder();
+        for (int i = 0; i < columnDef.position(); i++)
+            builder.add(indexEntry.name().get(i + 1));
+        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
     }
 
     @Override
-    public boolean indexes(ByteBuffer name)
+    public boolean indexes(CellName name)
     {
-        ByteBuffer[] components = getBaseComparator().split(name);
         AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-        return components.length > columnDef.componentIndex
-            && comp.compare(components[columnDef.componentIndex], columnDef.name) == 0;
+        return name.size() > columnDef.position()
+            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
     }
 
     public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
     {
-        ByteBuffer bb = entry.indexedEntryNameBuilder.copy().add(columnDef.name).build();
-        Column liveColumn = data.getColumn(bb);
-        if (liveColumn == null || liveColumn.isMarkedForDelete(now))
-            return true;
-
-        ByteBuffer liveValue = liveColumn.value();
-        return columnDef.getValidator().compare(entry.indexValue.key, liveValue) != 0;
+        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef);
+        Cell cell = data.getColumn(name);
+        return cell == null || !cell.isLive(now) || columnDef.type.compare(entry.indexValue.getKey(), cell.value()) != 0;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java b/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java
index eb618f4..5c1abc9 100644
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java
+++ b/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java
@@ -24,15 +24,27 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cql3.ColumnNameBuilder;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IndexExpression;
+import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.Composites;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.ExtendedFilter;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.thrift.IndexExpression;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public class CompositesSearcher extends SecondaryIndexSearcher
 {
@@ -47,35 +59,41 @@
     public List<Row> search(ExtendedFilter filter)
     {
         assert filter.getClause() != null && !filter.getClause().isEmpty();
-        return baseCfs.filter(getIndexedIterator(filter), filter);
+        final IndexExpression primary = highestSelectivityPredicate(filter.getClause());
+        final CompositesIndex index = (CompositesIndex)indexManager.getIndexForColumn(primary.column);
+        // TODO: this should perhaps not open and maintain a writeOp for the full duration, but instead only *try* to delete stale entries, without blocking if there's no room
+        // As it stands, we open a writeOp and keep it open for the duration to ensure that, should this CF get flushed to make room, we don't block the reclamation of any room being made.
+        try (OpOrder.Group writeOp = baseCfs.keyspace.writeOrder.start(); OpOrder.Group baseOp = baseCfs.readOrdering.start(); OpOrder.Group indexOp = index.getIndexCfs().readOrdering.start())
+        {
+            return baseCfs.filter(getIndexedIterator(writeOp, filter, primary, index), filter);
+        }
     }
 
-    private ByteBuffer makePrefix(CompositesIndex index, ByteBuffer key, ExtendedFilter filter, boolean isStart)
+    private Composite makePrefix(CompositesIndex index, ByteBuffer key, ExtendedFilter filter, boolean isStart)
     {
         if (key.remaining() == 0)
-            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+            return Composites.EMPTY;
 
-        ColumnNameBuilder builder;
+        Composite prefix;
         IDiskAtomFilter columnFilter = filter.columnFilter(key);
         if (columnFilter instanceof SliceQueryFilter)
         {
             SliceQueryFilter sqf = (SliceQueryFilter)columnFilter;
-            builder = index.makeIndexColumnNameBuilder(key, isStart ? sqf.start() : sqf.finish());
+            Composite columnName = isStart ? sqf.start() : sqf.finish();
+            prefix = columnName.isEmpty() ? index.getIndexComparator().make(key) : index.makeIndexColumnPrefix(key, columnName);
         }
         else
         {
-            builder = index.getIndexComparator().builder().add(key);
+            prefix = index.getIndexComparator().make(key);
         }
-        return isStart ? builder.build() : builder.buildAsEndOfRange();
+        return isStart ? prefix.start() : prefix.end();
     }
 
-    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final ExtendedFilter filter)
+    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final OpOrder.Group writeOp, final ExtendedFilter filter, final IndexExpression primary, final CompositesIndex index)
     {
         // Start with the most-restrictive indexed clause, then apply remaining clauses
         // to each row matching that clause.
         // TODO: allow merge join instead of just one index + loop
-        final IndexExpression primary = highestSelectivityPredicate(filter.getClause());
-        final CompositesIndex index = (CompositesIndex)indexManager.getIndexForColumn(primary.column_name);
         assert index != null;
         assert index.getIndexCfs() != null;
         final DecoratedKey indexKey = index.getIndexKeyFor(primary.value);
@@ -90,19 +108,19 @@
          * indexed row.
          */
         final AbstractBounds<RowPosition> range = filter.dataRange.keyRange();
-        ByteBuffer startKey = range.left instanceof DecoratedKey ? ((DecoratedKey)range.left).key : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        ByteBuffer endKey = range.right instanceof DecoratedKey ? ((DecoratedKey)range.right).key : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        ByteBuffer startKey = range.left instanceof DecoratedKey ? ((DecoratedKey)range.left).getKey() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        ByteBuffer endKey = range.right instanceof DecoratedKey ? ((DecoratedKey)range.right).getKey() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
 
-        final CompositeType baseComparator = (CompositeType)baseCfs.getComparator();
-        final CompositeType indexComparator = (CompositeType)index.getIndexCfs().getComparator();
+        final CellNameType baseComparator = baseCfs.getComparator();
+        final CellNameType indexComparator = index.getIndexCfs().getComparator();
 
-        final ByteBuffer startPrefix = makePrefix(index, startKey, filter, true);
-        final ByteBuffer endPrefix = makePrefix(index, endKey, filter, false);
+        final Composite startPrefix = makePrefix(index, startKey, filter, true);
+        final Composite endPrefix = makePrefix(index, endKey, filter, false);
 
         return new ColumnFamilyStore.AbstractScanIterator()
         {
-            private ByteBuffer lastSeenPrefix = startPrefix;
-            private Deque<Column> indexColumns;
+            private Composite lastSeenPrefix = startPrefix;
+            private Deque<Cell> indexCells;
             private int columnsRead = Integer.MAX_VALUE;
             private int limit = filter.currentLimit();
             private int columnsCount = 0;
@@ -134,6 +152,7 @@
                  */
                 DecoratedKey currentKey = null;
                 ColumnFamily data = null;
+                Composite previousPrefix = null;
 
                 while (true)
                 {
@@ -142,7 +161,7 @@
                     if (columnsCount >= limit)
                         return makeReturn(currentKey, data);
 
-                    if (indexColumns == null || indexColumns.isEmpty())
+                    if (indexCells == null || indexCells.isEmpty())
                     {
                         if (columnsRead < rowsPerQuery)
                         {
@@ -162,34 +181,34 @@
                                                                              rowsPerQuery,
                                                                              filter.timestamp);
                         ColumnFamily indexRow = index.getIndexCfs().getColumnFamily(indexFilter);
-                        if (indexRow == null || indexRow.getColumnCount() == 0)
+                        if (indexRow == null || !indexRow.hasColumns())
                             return makeReturn(currentKey, data);
 
-                        Collection<Column> sortedColumns = indexRow.getSortedColumns();
-                        columnsRead = sortedColumns.size();
-                        indexColumns = new ArrayDeque<>(sortedColumns);
-                        Column firstColumn = sortedColumns.iterator().next();
+                        Collection<Cell> sortedCells = indexRow.getSortedColumns();
+                        columnsRead = sortedCells.size();
+                        indexCells = new ArrayDeque<>(sortedCells);
+                        Cell firstCell = sortedCells.iterator().next();
 
                         // Paging is racy, so it is possible the first column of a page is not the last seen one.
-                        if (lastSeenPrefix != startPrefix && lastSeenPrefix.equals(firstColumn.name()))
+                        if (lastSeenPrefix != startPrefix && lastSeenPrefix.equals(firstCell.name()))
                         {
                             // skip the row we already saw w/ the last page of results
-                            indexColumns.poll();
-                            logger.trace("Skipping {}", indexComparator.getString(firstColumn.name()));
+                            indexCells.poll();
+                            logger.trace("Skipping {}", indexComparator.getString(firstCell.name()));
                         }
                     }
 
-                    while (!indexColumns.isEmpty() && columnsCount <= limit)
+                    while (!indexCells.isEmpty() && columnsCount <= limit)
                     {
-                        Column column = indexColumns.poll();
-                        lastSeenPrefix = column.name();
-                        if (column.isMarkedForDelete(filter.timestamp))
+                        Cell cell = indexCells.poll();
+                        lastSeenPrefix = cell.name();
+                        if (!cell.isLive(filter.timestamp))
                         {
-                            logger.trace("skipping {}", column.name());
+                            logger.trace("skipping {}", cell.name());
                             continue;
                         }
 
-                        CompositesIndex.IndexedEntry entry = index.decodeEntry(indexKey, column);
+                        CompositesIndex.IndexedEntry entry = index.decodeEntry(indexKey, cell);
                         DecoratedKey dk = baseCfs.partitioner.decorateKey(entry.indexedKey);
 
                         // Are we done for this row?
@@ -203,7 +222,7 @@
                             currentKey = dk;
 
                             // We're done with the previous row, return it if it had data, continue otherwise
-                            indexColumns.addFirst(column);
+                            indexCells.addFirst(cell);
                             if (data == null)
                                 continue;
                             else
@@ -221,57 +240,66 @@
                             }
                             else
                             {
-                                logger.debug("Skipping entry {} before assigned scan range", dk.token);
+                                logger.debug("Skipping entry {} before assigned scan range", dk.getToken());
                                 continue;
                             }
                         }
 
-                        // Check if this entry cannot be a hit due to the original column filter
-                        ByteBuffer start = entry.indexedEntryStart();
-                        if (!filter.columnFilter(dk.key).maySelectPrefix(baseComparator, start))
+                        // Check if this entry cannot be a hit due to the original cell filter
+                        Composite start = entry.indexedEntryPrefix;
+                        if (!filter.columnFilter(dk.getKey()).maySelectPrefix(baseComparator, start))
                             continue;
 
-                        logger.trace("Adding index hit to current row for {}", indexComparator.getString(column.name()));
+                        // If we've recorded the previous prefix, it means we're dealing with an index on a collection value. In
+                        // that case, there can be multiple index prefixes for the same CQL3 row, and we only want to add that
+                        // CQL3 row once (because requesting the data multiple times would be inefficient, but more importantly
+                        // because we shouldn't count its columns multiple times with the lastCounted() call at the end of this
+                        // method).
+                        if (previousPrefix != null && previousPrefix.equals(start))
+                            continue;
+                        else
+                            previousPrefix = null;
+
+                        logger.trace("Adding index hit to current row for {}", indexComparator.getString(cell.name()));
 
                         // We always query the whole CQL3 row. In the case where the original filter was a name filter this might be
                         // slightly wasteful, but this probably doesn't matter in practice and it simplifies things.
-                        ColumnSlice dataSlice = new ColumnSlice(start, entry.indexedEntryEnd());
-                        ColumnSlice[] slices;
-                        if (baseCfs.metadata.hasStaticColumns())
-                        {
-                            // If the table has static columns, we must fetch them too as they may need to be returned too.
-                            // Note that this is potentially wasteful for 2 reasons:
-                            //  1) we will retrieve the static parts for each indexed row, even if we have more than one row in
-                            //     the same partition. If we were to group data queries to rows on the same slice, which would
-                            //     speed up things in general, we would also optimize here since we would fetch static columns only
-                            //     once for each group.
-                            //  2) at this point we don't know if the user asked for static columns or not, so we might be fetching
-                            //     them for nothing. We would however need to ship the list of "CQL3 columns selected" with getRangeSlice
-                            //     to be able to know that.
-                            // TODO: we should improve both point above
-                            ColumnSlice staticSlice = new ColumnSlice(ByteBufferUtil.EMPTY_BYTE_BUFFER, baseCfs.metadata.getStaticColumnNameBuilder().buildAsEndOfRange());
-                            slices = new ColumnSlice[]{ staticSlice, dataSlice };
-                        }
-                        else
-                        {
-                            slices = new ColumnSlice[]{ dataSlice };
-                        }
-                        SliceQueryFilter dataFilter = new SliceQueryFilter(slices, false, Integer.MAX_VALUE, baseCfs.metadata.clusteringKeyColumns().size());
+                        ColumnSlice dataSlice = new ColumnSlice(start, entry.indexedEntryPrefix.end());
+                        // If the table has static columns, we must fetch them too as they may need to be returned too.
+                        // Note that this is potentially wasteful for 2 reasons:
+                        //  1) we will retrieve the static parts for each indexed row, even if we have more than one row in
+                        //     the same partition. If we were to group data queries to rows on the same slice, which would
+                        //     speed up things in general, we would also optimize here since we would fetch static columns only
+                        //     once for each group.
+                        //  2) at this point we don't know if the user asked for static columns or not, so we might be fetching
+                        //     them for nothing. We would however need to ship the list of "CQL3 columns selected" with getRangeSlice
+                        //     to be able to know that.
+                        // TODO: we should improve both points above
+                        ColumnSlice[] slices = baseCfs.metadata.hasStaticColumns()
+                                             ? new ColumnSlice[]{ baseCfs.metadata.comparator.staticPrefix().slice(), dataSlice }
+                                             : new ColumnSlice[]{ dataSlice };
+                        SliceQueryFilter dataFilter = new SliceQueryFilter(slices, false, Integer.MAX_VALUE, baseCfs.metadata.clusteringColumns().size());
                         ColumnFamily newData = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, dataFilter, filter.timestamp));
                         if (newData == null || index.isStale(entry, newData, filter.timestamp))
                         {
-                            index.delete(entry);
+                            index.delete(entry, writeOp);
                             continue;
                         }
 
-                        assert newData != null : "An entry with not data should have been considered stale";
+                        assert newData != null : "An entry with no data should have been considered stale";
 
-                        if (!filter.isSatisfiedBy(dk, newData, entry.indexedEntryNameBuilder))
+                        // We know the entry is not stale, so it satisfies the primary clause. Whether or not the data
+                        // satisfies the other clauses, there is no point in re-checking the same CQL3 row if we run into
+                        // another collection value entry for this row.
+                        if (entry.indexedEntryCollectionKey != null)
+                            previousPrefix = start;
+
+                        if (!filter.isSatisfiedBy(dk, newData, entry.indexedEntryPrefix, entry.indexedEntryCollectionKey))
                             continue;
 
                         if (data == null)
-                            data = TreeMapBackedSortedColumns.factory.create(baseCfs.metadata);
-                        data.resolve(newData);
+                            data = ArrayBackedSortedColumns.factory.create(baseCfs.metadata);
+                        data.addAll(newData);
                         columnsCount += dataFilter.lastCounted();
                     }
                  }
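
The previousPrefix bookkeeping above exists because an index on a collection value can yield several index cells for the same CQL3 row, and that row should only be fetched and counted once. A minimal standalone sketch of the deduplication idea, using plain strings in place of Composite prefixes (all names here are illustrative, not Cassandra APIs):

    import java.util.ArrayDeque;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Deque;
    import java.util.List;

    public class PrefixDedupSketch
    {
        public static void main(String[] args)
        {
            // The same CQL3 row can show up more than once in the index scan
            // (e.g. a list containing the queried value at two positions).
            Deque<String> indexCellPrefixes = new ArrayDeque<>(Arrays.asList("pk1", "pk1", "pk2"));

            List<String> rowsToQuery = new ArrayList<>();
            String previousPrefix = null;

            while (!indexCellPrefixes.isEmpty())
            {
                String prefix = indexCellPrefixes.poll();

                // Same idea as previousPrefix in the searcher: skip an index hit whose
                // row prefix we just handled, so each CQL3 row is queried and counted once.
                if (prefix.equals(previousPrefix))
                    continue;
                previousPrefix = prefix;

                rowsToQuery.add(prefix);
            }

            System.out.println(rowsToQuery); // [pk1, pk2]
        }
    }
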
diff --git a/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java b/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java
index cbb91a6..e771d99 100644
--- a/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java
+++ b/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java
@@ -20,8 +20,10 @@
 import java.nio.ByteBuffer;
 import java.util.Set;
 
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.Column;
 import org.apache.cassandra.db.index.AbstractSimplePerColumnSecondaryIndex;
 import org.apache.cassandra.db.index.SecondaryIndexSearcher;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -37,14 +39,14 @@
  */
 public class KeysIndex extends AbstractSimplePerColumnSecondaryIndex
 {
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Column column)
+    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
     {
-        return column.value();
+        return cell.value();
     }
 
-    protected ByteBuffer makeIndexColumnName(ByteBuffer rowKey, Column column)
+    protected CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell)
     {
-        return rowKey;
+        return CellNames.simpleDense(rowKey);
     }
 
     public SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
@@ -54,12 +56,8 @@
 
     public boolean isIndexEntryStale(ByteBuffer indexedValue, ColumnFamily data, long now)
     {
-        Column liveColumn = data.getColumn(columnDef.name);
-        if (liveColumn == null || liveColumn.isMarkedForDelete(now))
-            return true;
-
-        ByteBuffer liveValue = liveColumn.value();
-        return columnDef.getValidator().compare(indexedValue, liveValue) != 0;
+        Cell cell = data.getColumn(data.getComparator().makeCellName(columnDef.name.bytes));
+        return cell == null || !cell.isLive(now) || columnDef.type.compare(indexedValue, cell.value()) != 0;
     }
 
     public void validateOptions() throws ConfigurationException
@@ -67,8 +65,15 @@
         // no options used
     }
 
+    public boolean indexes(CellName name)
+    {
+        // This considers the full cellName directly
+        AbstractType<?> comparator = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
+        return comparator.compare(columnDef.name.bytes, name.toByteBuffer()) == 0;
+    }
+
     protected AbstractType getExpressionComparator()
     {
-        return baseCfs.getComparator();
+        return baseCfs.getComparator().asAbstractType();
     }
 }
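
The rewritten isIndexEntryStale boils down to: an entry is stale when the base row no longer carries a live cell for the indexed column, or when that cell's value no longer matches what the index recorded. A minimal sketch of the same predicate using plain JDK types (CellStub and the map-based row are illustrative stand-ins, not Cassandra classes):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Objects;

    public class StaleEntrySketch
    {
        // Stand-in for a cell: a value plus a liveness check against "now".
        static final class CellStub
        {
            final String value;
            final long deletedAt; // Long.MAX_VALUE means never deleted

            CellStub(String value, long deletedAt) { this.value = value; this.deletedAt = deletedAt; }

            boolean isLive(long now) { return now < deletedAt; }
        }

        // Same shape as KeysIndex.isIndexEntryStale: stale if the cell is missing,
        // dead at the query timestamp, or carries a different value than the index.
        static boolean isIndexEntryStale(String indexedValue, Map<String, CellStub> row, String column, long now)
        {
            CellStub cell = row.get(column);
            return cell == null || !cell.isLive(now) || !Objects.equals(indexedValue, cell.value);
        }

        public static void main(String[] args)
        {
            Map<String, CellStub> row = new HashMap<>();
            row.put("state", new CellStub("TX", Long.MAX_VALUE));

            System.out.println(isIndexEntryStale("TX", row, "state", 10)); // false: still matches
            System.out.println(isIndexEntryStale("CA", row, "state", 10)); // true: value changed
        }
    }
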
diff --git a/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java b/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java
index 5d82ba0..4055b7c 100644
--- a/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java
+++ b/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java
@@ -28,15 +28,17 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.composites.Composites;
 import org.apache.cassandra.db.filter.ExtendedFilter;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.index.*;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public class KeysSearcher extends SecondaryIndexSearcher
 {
@@ -51,17 +53,22 @@
     public List<Row> search(ExtendedFilter filter)
     {
         assert filter.getClause() != null && !filter.getClause().isEmpty();
-        return baseCfs.filter(getIndexedIterator(filter), filter);
+        final IndexExpression primary = highestSelectivityPredicate(filter.getClause());
+        final SecondaryIndex index = indexManager.getIndexForColumn(primary.column);
+        // TODO: this should perhaps not open and maintain a writeOp for the full duration, but instead only *try* to delete stale entries, without blocking if there's no room
+        // as it stands, we open a writeOp and keep it open for the duration, so that if this CF gets flushed to make room we don't block the reclamation of any room being made
+        try (OpOrder.Group writeOp = baseCfs.keyspace.writeOrder.start(); OpOrder.Group baseOp = baseCfs.readOrdering.start(); OpOrder.Group indexOp = index.getIndexCfs().readOrdering.start())
+        {
+            return baseCfs.filter(getIndexedIterator(writeOp, filter, primary, index), filter);
+        }
     }
 
-    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final ExtendedFilter filter)
+    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final OpOrder.Group writeOp, final ExtendedFilter filter, final IndexExpression primary, final SecondaryIndex index)
     {
 
         // Start with the most-restrictive indexed clause, then apply remaining clauses
         // to each row matching that clause.
         // TODO: allow merge join instead of just one index + loop
-        final IndexExpression primary = highestSelectivityPredicate(filter.getClause());
-        final SecondaryIndex index = indexManager.getIndexForColumn(primary.column_name);
         assert index != null;
         assert index.getIndexCfs() != null;
         final DecoratedKey indexKey = index.getIndexKeyFor(primary.value);
@@ -77,13 +84,16 @@
          * indexed row.
          */
         final AbstractBounds<RowPosition> range = filter.dataRange.keyRange();
-        final ByteBuffer startKey = range.left instanceof DecoratedKey ? ((DecoratedKey)range.left).key : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        final ByteBuffer endKey = range.right instanceof DecoratedKey ? ((DecoratedKey)range.right).key : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        CellNameType type = index.getIndexCfs().getComparator();
+        final Composite startKey = range.left instanceof DecoratedKey ? type.make(((DecoratedKey)range.left).getKey()) : Composites.EMPTY;
+        final Composite endKey = range.right instanceof DecoratedKey ? type.make(((DecoratedKey)range.right).getKey()) : Composites.EMPTY;
+
+        final CellName primaryColumn = baseCfs.getComparator().cellFromByteBuffer(primary.column);
 
         return new ColumnFamilyStore.AbstractScanIterator()
         {
-            private ByteBuffer lastSeenKey = startKey;
-            private Iterator<Column> indexColumns;
+            private Composite lastSeenKey = startKey;
+            private Iterator<Cell> indexColumns;
             private int columnsRead = Integer.MAX_VALUE;
 
             protected Row computeNext()
@@ -103,7 +113,7 @@
 
                         if (logger.isTraceEnabled() && (index instanceof AbstractSimplePerColumnSecondaryIndex))
                             logger.trace("Scanning index {} starting with {}",
-                                         ((AbstractSimplePerColumnSecondaryIndex)index).expressionString(primary), index.getBaseCfs().metadata.getKeyValidator().getString(startKey));
+                                         ((AbstractSimplePerColumnSecondaryIndex)index).expressionString(primary), index.getBaseCfs().metadata.getKeyValidator().getString(startKey.toByteBuffer()));
 
                         QueryFilter indexFilter = QueryFilter.getSliceFilter(indexKey,
                                                                              index.getIndexCfs().name,
@@ -120,19 +130,19 @@
                             return endOfData();
                         }
 
-                        Collection<Column> sortedColumns = indexRow.getSortedColumns();
-                        columnsRead = sortedColumns.size();
-                        indexColumns = sortedColumns.iterator();
-                        Column firstColumn = sortedColumns.iterator().next();
+                        Collection<Cell> sortedCells = indexRow.getSortedColumns();
+                        columnsRead = sortedCells.size();
+                        indexColumns = sortedCells.iterator();
+                        Cell firstCell = sortedCells.iterator().next();
 
                         // Paging is racy, so it is possible the first column of a page is not the last seen one.
-                        if (lastSeenKey != startKey && lastSeenKey.equals(firstColumn.name()))
+                        if (lastSeenKey != startKey && lastSeenKey.equals(firstCell.name()))
                         {
                             // skip the row we already saw w/ the last page of results
                             indexColumns.next();
-                            logger.trace("Skipping {}", baseCfs.metadata.getKeyValidator().getString(firstColumn.name()));
+                            logger.trace("Skipping {}", baseCfs.metadata.getKeyValidator().getString(firstCell.name().toByteBuffer()));
                         }
-                        else if (range instanceof Range && indexColumns.hasNext() && firstColumn.name().equals(startKey))
+                        else if (range instanceof Range && indexColumns.hasNext() && firstCell.name().equals(startKey))
                         {
                             // skip key excluded by range
                             indexColumns.next();
@@ -142,15 +152,15 @@
 
                     while (indexColumns.hasNext())
                     {
-                        Column column = indexColumns.next();
-                        lastSeenKey = column.name();
-                        if (column.isMarkedForDelete(filter.timestamp))
+                        Cell cell = indexColumns.next();
+                        lastSeenKey = cell.name();
+                        if (!cell.isLive(filter.timestamp))
                         {
-                            logger.trace("skipping {}", column.name());
+                            logger.trace("skipping {}", cell.name());
                             continue;
                         }
 
-                        DecoratedKey dk = baseCfs.partitioner.decorateKey(lastSeenKey);
+                        DecoratedKey dk = baseCfs.partitioner.decorateKey(lastSeenKey.toByteBuffer());
                         if (!range.right.isMinimum(baseCfs.partitioner) && range.right.compareTo(dk) < 0)
                         {
                             logger.trace("Reached end of assigned scan range");
@@ -158,15 +168,15 @@
                         }
                         if (!range.contains(dk))
                         {
-                            logger.trace("Skipping entry {} outside of assigned scan range", dk.token);
+                            logger.trace("Skipping entry {} outside of assigned scan range", dk.getToken());
                             continue;
                         }
 
                         logger.trace("Returning index hit for {}", dk);
-                        ColumnFamily data = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, filter.columnFilter(lastSeenKey), filter.timestamp));
-                        // While the column family we'll get in the end should contains the primary clause column, the initialFilter may not have found it and can thus be null
+                        ColumnFamily data = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, filter.columnFilter(lastSeenKey.toByteBuffer()), filter.timestamp));
+                        // While the column family we'll get in the end should contain the primary clause cell, the initialFilter may not have found it and can thus be null
                         if (data == null)
-                            data = TreeMapBackedSortedColumns.factory.create(baseCfs.metadata);
+                            data = ArrayBackedSortedColumns.factory.create(baseCfs.metadata);
 
                         // as in CFS.filter - extend the filter to ensure we include the columns
                         // from the index expressions, just in case they weren't included in the initialFilter
@@ -175,14 +185,14 @@
                         {
                             ColumnFamily cf = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, extraFilter, filter.timestamp));
                             if (cf != null)
-                                data.addAll(cf, HeapAllocator.instance);
+                                data.addAll(cf);
                         }
 
-                        if (((KeysIndex)index).isIndexEntryStale(indexKey.key, data, filter.timestamp))
+                        if (((KeysIndex)index).isIndexEntryStale(indexKey.getKey(), data, filter.timestamp))
                         {
                             // delete the index entry w/ its own timestamp
-                            Column dummyColumn = new Column(primary.column_name, indexKey.key, column.timestamp());
-                            ((PerColumnSecondaryIndex)index).delete(dk.key, dummyColumn);
+                            Cell dummyCell = new BufferCell(primaryColumn, indexKey.getKey(), cell.timestamp());
+                            ((PerColumnSecondaryIndex)index).delete(dk.getKey(), dummyCell, writeOp);
                             continue;
                         }
                         return new Row(dk, data);
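
The try-with-resources block added to search() starts one write group and two read groups and guarantees they are all closed even if the scan throws; the write group only exists so that stale index entries can be deleted while the search runs. A small sketch of that resource pattern with an AutoCloseable stand-in (GroupStub is illustrative, not the real OpOrder.Group):

    public class OpOrderGroupSketch
    {
        // Illustrative stand-in for OpOrder.Group: something that must be started
        // before an operation and closed when the operation finishes.
        static final class GroupStub implements AutoCloseable
        {
            private final String name;
            GroupStub(String name) { this.name = name; System.out.println("start " + name); }
            @Override public void close() { System.out.println("close " + name); }
        }

        static void searchLikeAbove()
        {
            try (GroupStub writeOp = new GroupStub("writeOp");
                 GroupStub baseRead = new GroupStub("baseRead");
                 GroupStub indexRead = new GroupStub("indexRead"))
            {
                // run the index scan here; stale entries can be deleted under writeOp
                System.out.println("scanning with " + writeOp.name + ", " + baseRead.name + ", " + indexRead.name);
            }
            // all three groups are closed (in reverse order) even if the scan throws
        }

        public static void main(String[] args)
        {
            searchLikeAbove();
        }
    }
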
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCommutativeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCommutativeType.java
deleted file mode 100644
index 01db148..0000000
--- a/src/java/org/apache/cassandra/db/marshal/AbstractCommutativeType.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.marshal;
-
-import java.nio.ByteBuffer;
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public abstract class AbstractCommutativeType extends AbstractType<Long>
-{
-    public boolean isCommutative()
-    {
-        return true;
-    }
-
-    @Override
-    public Long compose(ByteBuffer bytes)
-    {
-        return CounterContext.instance().total(bytes);
-    }
-
-    @Override
-    public ByteBuffer decompose(Long value)
-    {
-        return ByteBufferUtil.bytes(value);
-    }
-
-    /**
-     * create commutative column
-     */
-    public abstract Column createColumn(ByteBuffer name, ByteBuffer value, long timestamp);
-}
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
index de58158..4e830abb 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
@@ -37,10 +37,8 @@
 {
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1 == null || !o1.hasRemaining())
-            return o2 == null || !o2.hasRemaining() ? 0 : -1;
-        if (o2 == null || !o2.hasRemaining())
-            return 1;
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         ByteBuffer bb1 = o1.duplicate();
         ByteBuffer bb2 = o2.duplicate();
@@ -141,7 +139,7 @@
      * Escapes all occurrences of the ':' character from the input, replacing them by "\:".
      * Furthermore, if the last character is '\' or '!', a '!' is appended.
      */
-    static String escape(String input)
+    public static String escape(String input)
     {
         if (input.isEmpty())
             return input;
@@ -207,7 +205,7 @@
             byte b = bb.get();
             if (b != 0)
             {
-                sb.append(":!");
+                sb.append(b < 0 ? ":_" : ":!");
                 break;
             }
             ++i;
@@ -222,6 +220,7 @@
         List<ParsedComparator> comparators = new ArrayList<ParsedComparator>(parts.size());
         int totalLength = 0, i = 0;
         boolean lastByteIsOne = false;
+        boolean lastByteIsMinusOne = false;
 
         for (String part : parts)
         {
@@ -230,6 +229,11 @@
                 lastByteIsOne = true;
                 break;
             }
+            else if (part.equals("_"))
+            {
+                lastByteIsMinusOne = true;
+                break;
+            }
 
             ParsedComparator p = parseComparator(i, part);
             AbstractType<?> type = p.getAbstractType();
@@ -254,6 +258,8 @@
         }
         if (lastByteIsOne)
             bb.put(bb.limit() - 1, (byte)1);
+        else if (lastByteIsMinusOne)
+            bb.put(bb.limit() - 1, (byte)-1);
 
         bb.rewind();
         return bb;
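
The getString()/fromString() changes above extend the textual form of a composite's trailing end-of-component byte: 0 gets no suffix, 1 renders as "!", and a negative byte (new in this patch) renders as "_". A small sketch of just that mapping (the helper names are illustrative):

    public class EndOfComponentSketch
    {
        // Text suffix appended by getString() for a non-zero trailing end-of-component byte.
        static String suffixFor(byte endOfComponent)
        {
            if (endOfComponent == 0)
                return "";
            return endOfComponent < 0 ? ":_" : ":!";
        }

        // Inverse mapping used by fromString() when the last ':'-separated part is "!" or "_".
        static byte endOfComponentFor(String lastPart)
        {
            if ("!".equals(lastPart))
                return (byte) 1;
            if ("_".equals(lastPart))
                return (byte) -1;
            return (byte) 0;
        }

        public static void main(String[] args)
        {
            System.out.println(suffixFor((byte) -1));   // :_
            System.out.println(endOfComponentFor("_")); // -1
        }
    }
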
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index e92f272..4e1f2a3 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
@@ -25,17 +25,9 @@
 import java.util.Map;
 
 import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.db.RangeTombstone;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
 
 /**
  * Specifies a Comparator for a specific type of ByteBuffer.
@@ -47,78 +39,10 @@
  */
 public abstract class AbstractType<T> implements Comparator<ByteBuffer>
 {
-    public final Comparator<IndexInfo> indexComparator;
-    public final Comparator<IndexInfo> indexReverseComparator;
-    public final Comparator<Column> columnComparator;
-    public final Comparator<Column> columnReverseComparator;
-    public final Comparator<OnDiskAtom> onDiskAtomComparator;
     public final Comparator<ByteBuffer> reverseComparator;
 
     protected AbstractType()
     {
-        indexComparator = new Comparator<IndexInfo>()
-        {
-            public int compare(IndexInfo o1, IndexInfo o2)
-            {
-                return AbstractType.this.compare(o1.lastName, o2.lastName);
-            }
-        };
-        indexReverseComparator = new Comparator<IndexInfo>()
-        {
-            public int compare(IndexInfo o1, IndexInfo o2)
-            {
-                return AbstractType.this.compare(o1.firstName, o2.firstName);
-            }
-        };
-        columnComparator = new Comparator<Column>()
-        {
-            public int compare(Column c1, Column c2)
-            {
-                return AbstractType.this.compare(c1.name(), c2.name());
-            }
-        };
-        columnReverseComparator = new Comparator<Column>()
-        {
-            public int compare(Column c1, Column c2)
-            {
-                return AbstractType.this.compare(c2.name(), c1.name());
-            }
-        };
-        onDiskAtomComparator = new Comparator<OnDiskAtom>()
-        {
-            public int compare(OnDiskAtom c1, OnDiskAtom c2)
-            {
-                int comp = AbstractType.this.compare(c1.name(), c2.name());
-                if (comp != 0)
-                    return comp;
-
-                if (c1 instanceof RangeTombstone)
-                {
-                    if (c2 instanceof RangeTombstone)
-                    {
-                        RangeTombstone t1 = (RangeTombstone)c1;
-                        RangeTombstone t2 = (RangeTombstone)c2;
-                        int comp2 = AbstractType.this.compare(t1.max, t2.max);
-                        if (comp2 == 0)
-                            return t1.data.compareTo(t2.data);
-                        else
-                            return comp2;
-                    }
-                    else
-                    {
-                        return -1;
-                    }
-                }
-                else if (c2 instanceof RangeTombstone)
-                {
-                    return 1;
-                }
-                else
-                {
-                    return 0;
-                }
-            }
-        };
         reverseComparator = new Comparator<ByteBuffer>()
         {
             public int compare(ByteBuffer o1, ByteBuffer o2)
@@ -171,6 +95,19 @@
         getSerializer().validate(bytes);
     }
 
+    /**
+     * Validate a cell value. Unlike {@linkplain #validate(java.nio.ByteBuffer)},
+     * this is passed the cell value itself so that its content can be checked.
+     * For most types this is the same as validate(), but collections override it.
+     *
+     * @param cellValue ByteBuffer representing cell value
+     * @throws MarshalException
+     */
+    public void validateCellValue(ByteBuffer cellValue) throws MarshalException
+    {
+        validate(cellValue);
+    }
+
     /* Most of our internal type should override that. */
     public CQL3Type asCQL3Type()
     {
@@ -179,12 +116,6 @@
 
     public abstract TypeSerializer<T> getSerializer();
 
-    /** @deprecated use reverseComparator field instead */
-    public Comparator<ByteBuffer> getReverseComparator()
-    {
-        return reverseComparator;
-    }
-
     /* convenience method */
     public String getString(Collection<ByteBuffer> names)
     {
@@ -196,18 +127,7 @@
         return builder.toString();
     }
 
-    /* convenience method */
-    public String getColumnsString(Iterable<Column> columns)
-    {
-        StringBuilder builder = new StringBuilder();
-        for (Column column : columns)
-        {
-            builder.append(column.getString(this)).append(",");
-        }
-        return builder.toString();
-    }
-
-    public boolean isCommutative()
+    public boolean isCounter()
     {
         return false;
     }
@@ -267,6 +187,15 @@
     }
 
     /**
+     * @return true IFF the byte representation of this type can be compared unsigned
+     * and always returns the same result as calling this object's compare or compareCollectionMembers methods
+     */
+    public boolean isByteOrderComparable()
+    {
+        return false;
+    }
+
+    /**
      * An alternative comparison function used by CollectionsType in conjunction with CompositeType.
      *
      * This comparator is only called to compare components of a CompositeType. It gets the value of the
@@ -324,25 +253,4 @@
     {
         return getClass().getName();
     }
-
-    protected boolean intersects(ByteBuffer minColName, ByteBuffer maxColName, ByteBuffer sliceStart, ByteBuffer sliceEnd)
-    {
-        return (sliceStart.equals(ByteBufferUtil.EMPTY_BYTE_BUFFER) || compare(maxColName, sliceStart) >= 0)
-               && (sliceEnd.equals(ByteBufferUtil.EMPTY_BYTE_BUFFER) || compare(sliceEnd, minColName) >= 0);
-    }
-
-    public boolean intersects(List<ByteBuffer> minColumnNames, List<ByteBuffer> maxColumnNames, SliceQueryFilter filter)
-    {
-        assert minColumnNames.size() == 1;
-
-        for (ColumnSlice slice : filter.slices)
-        {
-            ByteBuffer start = filter.isReversed() ? slice.finish : slice.start;
-            ByteBuffer finish = filter.isReversed() ? slice.start : slice.finish;
-
-            if (intersects(minColumnNames.get(0), maxColumnNames.get(0), start, finish))
-                return true;
-        }
-        return false;
-    }
 }
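
isByteOrderComparable(), added above, lets a type advertise that its compare() always agrees with unsigned byte-wise comparison of the raw encoding; it defaults to false, and types such as BytesType and AsciiType opt in below. A self-contained sketch of the property being promised (compareUnsigned here is a simplified stand-in for ByteBufferUtil.compareUnsigned):

    import java.nio.ByteBuffer;
    import java.util.Comparator;

    public class ByteOrderPropertySketch
    {
        // Simplified unsigned lexicographic comparison of two buffers.
        static int compareUnsigned(ByteBuffer a, ByteBuffer b)
        {
            ByteBuffer x = a.duplicate(), y = b.duplicate();
            while (x.hasRemaining() && y.hasRemaining())
            {
                int cmp = (x.get() & 0xff) - (y.get() & 0xff);
                if (cmp != 0)
                    return cmp;
            }
            return x.remaining() - y.remaining();
        }

        // The property a type claims by returning true from isByteOrderComparable():
        // its own compare() and raw unsigned byte order always rank values the same way.
        static boolean agreesWithByteOrder(Comparator<ByteBuffer> type, ByteBuffer v1, ByteBuffer v2)
        {
            return Integer.signum(type.compare(v1, v2)) == Integer.signum(compareUnsigned(v1, v2));
        }

        public static void main(String[] args)
        {
            Comparator<ByteBuffer> blobLike = ByteOrderPropertySketch::compareUnsigned;
            System.out.println(agreesWithByteOrder(blobLike,
                    ByteBuffer.wrap(new byte[]{ 1 }), ByteBuffer.wrap(new byte[]{ (byte) 0xff }))); // true
        }
    }
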
diff --git a/src/java/org/apache/cassandra/db/marshal/AsciiType.java b/src/java/org/apache/cassandra/db/marshal/AsciiType.java
index fdccfcd..891a8ed 100644
--- a/src/java/org/apache/cassandra/db/marshal/AsciiType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AsciiType.java
@@ -27,6 +27,7 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.AsciiSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class AsciiType extends AbstractType<String>
 {
@@ -45,7 +46,7 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        return BytesType.bytesCompare(o1, o2);
+        return ByteBufferUtil.compareUnsigned(o1, o2);
     }
 
     public ByteBuffer fromString(String source)
@@ -73,4 +74,9 @@
     {
         return AsciiSerializer.instance;
     }
+
+    public boolean isByteOrderComparable()
+    {
+        return true;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
index d5f44ce..70d7559 100644
--- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
@@ -37,10 +37,8 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if ((o1 == null) || (o1.remaining() != 1))
-            return ((o2 == null) || (o2.remaining() != 1)) ? 0 : -1;
-        if ((o2 == null) || (o2.remaining() != 1))
-            return 1;
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         // False is 0, True is anything else, makes False sort before True.
         byte b1 = o1.get(o1.position());
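
The two-line guard introduced across these compare() methods replaces the old null/empty branches with a single expression: an empty buffer still sorts before anything non-empty, and two empty buffers compare equal. A small self-contained check of that ordering (the fallback comparison is a placeholder, not a real type's logic):

    import java.nio.ByteBuffer;

    public class EmptyBufferGuardSketch
    {
        static int compareWithEmptyGuard(ByteBuffer o1, ByteBuffer o2)
        {
            // Same guard as in the patch: empty sorts first, two empties are equal.
            if (!o1.hasRemaining() || !o2.hasRemaining())
                return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;

            // A real type compares decoded values here; signed lexicographic order
            // is just a placeholder for the sketch.
            return o1.compareTo(o2);
        }

        public static void main(String[] args)
        {
            ByteBuffer empty = ByteBuffer.allocate(0);
            ByteBuffer one = ByteBuffer.wrap(new byte[]{ 1 });

            System.out.println(compareWithEmptyGuard(empty, one) < 0);                     // true
            System.out.println(compareWithEmptyGuard(one, empty) > 0);                     // true
            System.out.println(compareWithEmptyGuard(empty, ByteBuffer.allocate(0)) == 0); // true
        }
    }
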
diff --git a/src/java/org/apache/cassandra/db/marshal/BytesType.java b/src/java/org/apache/cassandra/db/marshal/BytesType.java
index 7907c2d..a6a672c 100644
--- a/src/java/org/apache/cassandra/db/marshal/BytesType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BytesType.java
@@ -34,14 +34,6 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        return BytesType.bytesCompare(o1, o2);
-    }
-
-    public static int bytesCompare(ByteBuffer o1, ByteBuffer o2)
-    {
-        if (o1 == null)
-            return o2 == null ? 0 : -1;
-
         return ByteBufferUtil.compareUnsigned(o1, o2);
     }
 
@@ -72,6 +64,11 @@
         return true;
     }
 
+    public boolean isByteOrderComparable()
+    {
+        return true;
+    }
+
     public CQL3Type asCQL3Type()
     {
         return CQL3Type.Native.BLOB;
diff --git a/src/java/org/apache/cassandra/db/marshal/CollectionType.java b/src/java/org/apache/cassandra/db/marshal/CollectionType.java
index 29b77a0..8a5fe5c 100644
--- a/src/java/org/apache/cassandra/db/marshal/CollectionType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CollectionType.java
@@ -24,10 +24,10 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
 
 /**
  * The abstract validator that is the base for maps, sets and lists.
@@ -58,7 +58,16 @@
 
     protected abstract void appendToStringBuilder(StringBuilder sb);
 
-    public abstract ByteBuffer serialize(List<Pair<ByteBuffer, Column>> columns);
+    public abstract List<ByteBuffer> serializedValues(List<Cell> cells);
+
+    @Override
+    public abstract CollectionSerializer<T> getSerializer();
+
+    @Override
+    public void validateCellValue(ByteBuffer cellValue) throws MarshalException
+    {
+        valueComparator().validate(cellValue);
+    }
 
     @Override
     public String toString()
@@ -85,9 +94,20 @@
         }
     }
 
-    public void validate(ByteBuffer bytes)
+    @Override
+    public boolean isCompatibleWith(AbstractType<?> previous)
     {
-        valueComparator().validate(bytes);
+        if (this == previous)
+            return true;
+
+        if (!getClass().equals(previous.getClass()))
+            return false;
+
+        CollectionType tprev = (CollectionType) previous;
+        // The name is part of the Cell name, so we need sorting compatibility, i.e. isCompatibleWith().
+        // But the value is the Cell value, so isValueCompatibleWith() is enough.
+        return this.nameComparator().isCompatibleWith(tprev.nameComparator())
+            && this.valueComparator().isValueCompatibleWith(tprev.valueComparator());
     }
 
     public boolean isCollection()
@@ -95,35 +115,21 @@
         return true;
     }
 
-    // Utilitary method
-    protected static ByteBuffer pack(List<ByteBuffer> buffers, int elements, int size)
+    protected List<Cell> enforceLimit(List<Cell> cells, int version)
     {
-        ByteBuffer result = ByteBuffer.allocate(2 + size);
-        result.putShort((short)elements);
-        for (ByteBuffer bb : buffers)
-        {
-            result.putShort((short)bb.remaining());
-            result.put(bb.duplicate());
-        }
-        return (ByteBuffer)result.flip();
-    }
-
-    protected List<Pair<ByteBuffer, Column>> enforceLimit(List<Pair<ByteBuffer, Column>> columns)
-    {
-        if (columns.size() <= MAX_ELEMENTS)
-            return columns;
+        if (version >= 3 || cells.size() <= MAX_ELEMENTS)
+            return cells;
 
         logger.error("Detected collection with {} elements, more than the {} limit. Only the first {} elements will be returned to the client. "
-                   + "Please see http://cassandra.apache.org/doc/cql3/CQL.html#collections for more details.", columns.size(), MAX_ELEMENTS, MAX_ELEMENTS);
-        return columns.subList(0, MAX_ELEMENTS);
+                   + "Please see http://cassandra.apache.org/doc/cql3/CQL.html#collections for more details.", cells.size(), MAX_ELEMENTS, MAX_ELEMENTS);
+        return cells.subList(0, MAX_ELEMENTS);
     }
 
-    public static ByteBuffer pack(List<ByteBuffer> buffers, int elements)
+    public ByteBuffer serializeForNativeProtocol(List<Cell> cells, int version)
     {
-        int size = 0;
-        for (ByteBuffer bb : buffers)
-            size += 2 + bb.remaining();
-        return pack(buffers, elements, size);
+        cells = enforceLimit(cells, version);
+        List<ByteBuffer> values = serializedValues(cells);
+        return CollectionSerializer.pack(values, cells.size(), version);
     }
 
     public CQL3Type asCQL3Type()
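
serializeForNativeProtocol() above chains enforceLimit(), serializedValues() and CollectionSerializer.pack(). The removed static pack() shows the pre-v3 wire shape: an element count followed by length-prefixed values. A standalone sketch of that shape (illustrative only; the real CollectionSerializer widens these fields for protocol v3):

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.List;

    public class CollectionPackSketch
    {
        // Pre-v3 style packing: unsigned-short element count, then each value as
        // an unsigned-short length followed by its bytes (mirrors the removed pack()).
        static ByteBuffer pack(List<ByteBuffer> values)
        {
            int size = 0;
            for (ByteBuffer bb : values)
                size += 2 + bb.remaining();

            ByteBuffer result = ByteBuffer.allocate(2 + size);
            result.putShort((short) values.size());
            for (ByteBuffer bb : values)
            {
                result.putShort((short) bb.remaining());
                result.put(bb.duplicate());
            }
            result.flip();
            return result;
        }

        public static void main(String[] args)
        {
            ByteBuffer packed = pack(Arrays.asList(ByteBuffer.wrap(new byte[]{ 1 }),
                                                   ByteBuffer.wrap(new byte[]{ 2, 3 })));
            System.out.println(packed.remaining()); // 2 + (2 + 1) + (2 + 2) = 9
        }
    }
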
diff --git a/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java b/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java
index a4f7857..a28b874 100644
--- a/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java
@@ -121,7 +121,8 @@
         // We are compatible if we have all the definitions previous have (but we can have more).
         for (Map.Entry<ByteBuffer, CollectionType> entry : prev.defined.entrySet())
         {
-            if (!entry.getValue().isCompatibleWith(defined.get(entry.getKey())))
+            CollectionType newType = defined.get(entry.getKey());
+            if (newType == null || !newType.isCompatibleWith(entry.getValue()))
                 return false;
         }
         return true;
diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
index 946ba24..1115dff 100644
--- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
@@ -27,11 +27,9 @@
 
 import com.google.common.collect.ImmutableList;
 
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.Relation;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.serializers.MarshalException;
@@ -112,7 +110,7 @@
         return ct;
     }
 
-    private CompositeType(List<AbstractType<?>> types)
+    protected CompositeType(List<AbstractType<?>> types)
     {
         this.types = ImmutableList.copyOf(types);
     }
@@ -170,6 +168,23 @@
         return build(serialized);
     }
 
+    // Overriding the one of AbstractCompositeType because we can do a tad better
+    @Override
+    public ByteBuffer[] split(ByteBuffer name)
+    {
+        // Assume all components; we'll truncate the array afterwards if need be, but
+        // most names will be complete.
+        ByteBuffer[] l = new ByteBuffer[types.size()];
+        ByteBuffer bb = name.duplicate();
+        int i = 0;
+        while (bb.remaining() > 0)
+        {
+            l[i++] = ByteBufferUtil.readBytesWithShortLength(bb);
+            bb.get(); // skip end-of-component
+        }
+        return i == l.length ? l : Arrays.copyOfRange(l, 0, i);
+    }
+
     // Extract component idx from bb. Return null if there is not enough component.
     public static ByteBuffer extractComponent(ByteBuffer bb, int idx)
     {
@@ -260,60 +275,6 @@
         return true;
     }
 
-    @Override
-    public boolean intersects(List<ByteBuffer> minColumnNames, List<ByteBuffer> maxColumnNames, SliceQueryFilter filter)
-    {
-        assert minColumnNames.size() == maxColumnNames.size();
-
-        // If any of the slices in the filter intersect, return true
-        outer:
-        for (ColumnSlice slice : filter.slices)
-        {
-            ByteBuffer[] start = split(filter.isReversed() ? slice.finish : slice.start);
-            ByteBuffer[] finish = split(filter.isReversed() ? slice.start : slice.finish);
-
-            if (compare(start, maxColumnNames, true) > 0 || compare(finish, minColumnNames, false) < 0)
-                continue;  // slice does not intersect
-
-            // We could safely return true here, but there's a minor optimization: if the first component is restricted
-            // to a single value, we can check that the second component falls within the min/max for that component
-            // (and repeat for all components).
-            for (int i = 0; i < minColumnNames.size(); i++)
-            {
-                AbstractType<?> t = types.get(i);
-                ByteBuffer s = i < start.length ? start[i] : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-                ByteBuffer f = i < finish.length ? finish[i] : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-                // we already know the first component falls within its min/max range (otherwise we wouldn't get here)
-                if (i > 0 && !t.intersects(minColumnNames.get(i), maxColumnNames.get(i), s, f))
-                    continue outer;
-
-                // if this component isn't equal in the start and finish, we don't need to check any more
-                if (i >= start.length || i >= finish.length || t.compare(s, f) != 0)
-                    break;
-            }
-            return true;
-        }
-
-        // none of the slices intersected
-        return false;
-    }
-
-    /** Helper method for intersects() */
-    private int compare(ByteBuffer[] sliceBounds, List<ByteBuffer> sstableBounds, boolean isSliceStart)
-    {
-        for (int i = 0; i < sstableBounds.size(); i++)
-        {
-            if (i >= sliceBounds.length)
-                return isSliceStart ? -1 : 1;
-
-            int comparison = types.get(i).compare(sliceBounds[i], sstableBounds.get(i));
-            if (comparison != 0)
-                return comparison;
-        }
-        return 0;
-    }
-
     private static class StaticParsedComparator implements ParsedComparator
     {
         final AbstractType<?> type;
@@ -371,7 +332,7 @@
         return out;
     }
 
-    public static class Builder implements ColumnNameBuilder
+    public static class Builder
     {
         private final CompositeType composite;
 
@@ -418,6 +379,11 @@
             return this;
         }
 
+        public Builder add(ColumnIdentifier name)
+        {
+            return add(name.bytes);
+        }
+
         public int componentCount()
         {
             return components.size();
diff --git a/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java b/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java
index 6a77458..2bcb4db 100644
--- a/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java
@@ -20,22 +20,41 @@
 import java.nio.ByteBuffer;
 
 import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.CounterSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public class CounterColumnType extends AbstractCommutativeType
+public class CounterColumnType extends AbstractType<Long>
 {
     public static final CounterColumnType instance = new CounterColumnType();
 
     CounterColumnType() {} // singleton
 
+    public boolean isCounter()
+    {
+        return true;
+    }
+
+    public boolean isByteOrderComparable()
+    {
+        throw new AssertionError();
+    }
+
+    @Override
+    public Long compose(ByteBuffer bytes)
+    {
+        return CounterContext.instance().total(bytes);
+    }
+
+    @Override
+    public ByteBuffer decompose(Long value)
+    {
+        return ByteBufferUtil.bytes(value);
+    }
+
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1 == null)
-            return o2 == null ?  0 : -1;
-
         return ByteBufferUtil.compareUnsigned(o1, o2);
     }
 
@@ -44,14 +63,6 @@
         return ByteBufferUtil.bytesToHex(bytes);
     }
 
-    /**
-     * create commutative column
-     */
-    public Column createColumn(ByteBuffer name, ByteBuffer value, long timestamp)
-    {
-        return new CounterUpdateColumn(name, value, timestamp);
-    }
-
     public ByteBuffer fromString(String source)
     {
         return ByteBufferUtil.hexToBytes(source);
diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java
index bf25d88..882e4cf 100644
--- a/src/java/org/apache/cassandra/db/marshal/DateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DateType.java
@@ -19,7 +19,6 @@
 
 import java.nio.ByteBuffer;
 import java.text.ParseException;
-import java.text.SimpleDateFormat;
 import java.util.Date;
 
 import org.slf4j.Logger;
@@ -42,14 +41,8 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         return ByteBufferUtil.compareUnsigned(o1, o2);
     }
@@ -81,6 +74,11 @@
         return false;
     }
 
+    public boolean isByteOrderComparable()
+    {
+        return true;
+    }
+
     @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
index 2ac8480..b7e481d 100644
--- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
@@ -32,18 +32,12 @@
 
     DecimalType() {} // singleton
 
-    public int compare(ByteBuffer bb0, ByteBuffer bb1)
+    public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (bb0.remaining() == 0)
-        {
-            return bb1.remaining() == 0 ? 0 : -1;
-        }
-        if (bb1.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
-        return compose(bb0).compareTo(compose(bb1));
+        return compose(o1).compareTo(compose(o2));
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
index 35b33d6..af11a36 100644
--- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
@@ -33,14 +33,8 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         return compose(o1).compareTo(compose(o2));
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
index cddbd1d..d3b0b90 100644
--- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
@@ -22,6 +22,9 @@
 import java.util.HashMap;
 import java.util.Map;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.TypeSerializer;
@@ -49,6 +52,8 @@
  */
 public class DynamicCompositeType extends AbstractCompositeType
 {
+    private static final Logger logger = LoggerFactory.getLogger(DynamicCompositeType.class);
+
     private final Map<Byte, AbstractType<?>> aliases;
 
     // interning instances
@@ -202,13 +207,25 @@
                 throw new MarshalException("Not enough bytes to read comparator name of component " + i);
 
             ByteBuffer value = ByteBufferUtil.readBytes(bb, header);
+            String valueStr = null;
             try
             {
-                comparator = TypeParser.parse(ByteBufferUtil.string(value));
+                valueStr = ByteBufferUtil.string(value);
+                comparator = TypeParser.parse(valueStr);
+            }
+            catch (CharacterCodingException ce)
+            {
+                // ByteBufferUtil.string failed; log it here, and we'll throw an exception
+                // below since comparator == null
+                logger.error("Failed with [{}] when decoding the byte buffer in ByteBufferUtil.string()",
+                             ce.toString());
             }
             catch (Exception e)
             {
-                // we'll deal with this below since comparator == null
+                // Parsing failed; log it here, and we'll throw an exception
+                // below since comparator == null
+                logger.error("Failed to parse value string \"{}\" with exception: [{}]",
+                             valueStr, e.toString());
             }
         }
         else
@@ -385,5 +402,10 @@
         {
             throw new UnsupportedOperationException();
         }
+
+        public boolean isByteOrderComparable()
+        {
+            return false;
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java
index bee226e..9364928 100644
--- a/src/java/org/apache/cassandra/db/marshal/FloatType.java
+++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java
@@ -34,14 +34,8 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         return compose(o1).compareTo(compose(o2));
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/InetAddressType.java b/src/java/org/apache/cassandra/db/marshal/InetAddressType.java
index c29e8ac..0473ee8 100644
--- a/src/java/org/apache/cassandra/db/marshal/InetAddressType.java
+++ b/src/java/org/apache/cassandra/db/marshal/InetAddressType.java
@@ -66,4 +66,9 @@
     {
         return InetAddressSerializer.instance;
     }
+
+    public boolean isByteOrderComparable()
+    {
+        return true;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
index 2f37721..976c7a8 100644
--- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
@@ -35,20 +35,13 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         int diff = o1.get(o1.position()) - o2.get(o2.position());
         if (diff != 0)
             return diff;
 
-
         return ByteBufferUtil.compareUnsigned(o1, o2);
     }
 
@@ -81,4 +74,5 @@
     {
         return Int32Serializer.instance;
     }
+
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
index 303169f..634194f 100644
--- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
@@ -36,14 +36,8 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         return UUIDGen.getUUID(o1).compareTo(UUIDGen.getUUID(o2));
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java
index 4b45bd7..171e179 100644
--- a/src/java/org/apache/cassandra/db/marshal/ListType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ListType.java
@@ -20,12 +20,12 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.ListSerializer;
-import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ListType<T> extends CollectionType<List<T>>
@@ -73,7 +73,7 @@
         return elements;
     }
 
-    public TypeSerializer<List<T>> getSerializer()
+    public ListSerializer<T> getSerializer()
     {
         return serializer;
     }
@@ -87,21 +87,19 @@
     static int compareListOrSet(AbstractType<?> elementsComparator, ByteBuffer o1, ByteBuffer o2)
     {
         // Note that this is only used if the collection is inside an UDT
-        if (o1 == null || !o1.hasRemaining())
-            return o2 == null || !o2.hasRemaining() ? 0 : -1;
-        if (o2 == null || !o2.hasRemaining())
-            return 1;
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         ByteBuffer bb1 = o1.duplicate();
         ByteBuffer bb2 = o2.duplicate();
 
-        int size1 = ByteBufferUtil.readShortLength(bb1);
-        int size2 = ByteBufferUtil.readShortLength(bb2);
+        int size1 = CollectionSerializer.readCollectionSize(bb1, 3);
+        int size2 = CollectionSerializer.readCollectionSize(bb2, 3);
 
         for (int i = 0; i < Math.min(size1, size2); i++)
         {
-            ByteBuffer v1 = ByteBufferUtil.readBytesWithShortLength(bb1);
-            ByteBuffer v2 = ByteBufferUtil.readBytesWithShortLength(bb2);
+            ByteBuffer v1 = CollectionSerializer.readValue(bb1, 3);
+            ByteBuffer v2 = CollectionSerializer.readValue(bb2, 3);
             int cmp = elementsComparator.compare(v1, v2);
             if (cmp != 0)
                 return cmp;
@@ -115,17 +113,11 @@
         sb.append(getClass().getName()).append(TypeParser.stringifyTypeParameters(Collections.<AbstractType<?>>singletonList(elements)));
     }
 
-    public ByteBuffer serialize(List<Pair<ByteBuffer, Column>> columns)
+    public List<ByteBuffer> serializedValues(List<Cell> cells)
     {
-        columns = enforceLimit(columns);
-
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(columns.size());
-        int size = 0;
-        for (Pair<ByteBuffer, Column> p : columns)
-        {
-            bbs.add(p.right.value());
-            size += 2 + p.right.value().remaining();
-        }
-        return pack(bbs, columns.size(), size);
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size());
+        for (Cell c : cells)
+            bbs.add(c.value());
+        return bbs;
     }
 }
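
compareListOrSet (and MapType.compare below) now read the serialized collection through CollectionSerializer at protocol version 3 where, as I understand the v3 native-protocol framing, the element count and each element length are 4-byte ints instead of the 2-byte shorts the old ByteBufferUtil helpers assumed. A hedged, standalone sketch of that layout; the reader below is illustrative, not Cassandra's:

    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import java.util.List;

    public class V3CollectionSketch
    {
        // Assumed v3 layout: [int count][int len][bytes]... (older protocol versions
        // used unsigned shorts for both the count and each length).
        static List<ByteBuffer> readElements(ByteBuffer serialized)
        {
            ByteBuffer input = serialized.duplicate();
            int count = input.getInt();
            List<ByteBuffer> elements = new ArrayList<>(count);
            for (int i = 0; i < count; i++)
            {
                int len = input.getInt();
                ByteBuffer element = input.slice();
                element.limit(len);
                input.position(input.position() + len);
                elements.add(element);
            }
            return elements;
        }

        public static void main(String[] args)
        {
            // Two int32 elements, 7 and 9.
            ByteBuffer bb = ByteBuffer.allocate(4 + 2 * (4 + 4));
            bb.putInt(2).putInt(4).putInt(7).putInt(4).putInt(9).flip();
            for (ByteBuffer e : readElements(bb))
                System.out.println(e.getInt()); // 7, then 9
        }
    }
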
diff --git a/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java b/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java
index a686b90..60bce9de 100644
--- a/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java
@@ -63,7 +63,7 @@
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
         // o1 and o2 can be empty so we need to use RowPosition, not DecoratedKey
-        return RowPosition.forKey(o1, partitioner).compareTo(RowPosition.forKey(o2, partitioner));
+        return RowPosition.ForKey.get(o1, partitioner).compareTo(RowPosition.ForKey.get(o2, partitioner));
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java
index efe7223..feedaeb 100644
--- a/src/java/org/apache/cassandra/db/marshal/LongType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LongType.java
@@ -38,14 +38,8 @@
 
     public static int compareLongs(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         int diff = o1.get(o1.position()) - o2.get(o2.position());
         if (diff != 0)
@@ -89,4 +83,5 @@
     {
         return LongSerializer.instance;
     }
+
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java
index 08f795f..dbf6721 100644
--- a/src/java/org/apache/cassandra/db/marshal/MapType.java
+++ b/src/java/org/apache/cassandra/db/marshal/MapType.java
@@ -20,9 +20,10 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MapSerializer;
 import org.apache.cassandra.utils.Pair;
@@ -80,27 +81,25 @@
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
         // Note that this is only used if the collection is inside an UDT
-        if (o1 == null || !o1.hasRemaining())
-            return o2 == null || !o2.hasRemaining() ? 0 : -1;
-        if (o2 == null || !o2.hasRemaining())
-            return 1;
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
 
         ByteBuffer bb1 = o1.duplicate();
         ByteBuffer bb2 = o2.duplicate();
 
-        int size1 = ByteBufferUtil.readShortLength(bb1);
-        int size2 = ByteBufferUtil.readShortLength(bb2);
+        int size1 = CollectionSerializer.readCollectionSize(bb1, 3);
+        int size2 = CollectionSerializer.readCollectionSize(bb2, 3);
 
         for (int i = 0; i < Math.min(size1, size2); i++)
         {
-            ByteBuffer k1 = ByteBufferUtil.readBytesWithShortLength(bb1);
-            ByteBuffer k2 = ByteBufferUtil.readBytesWithShortLength(bb2);
+            ByteBuffer k1 = CollectionSerializer.readValue(bb1, 3);
+            ByteBuffer k2 = CollectionSerializer.readValue(bb2, 3);
             int cmp = keys.compare(k1, k2);
             if (cmp != 0)
                 return cmp;
 
-            ByteBuffer v1 = ByteBufferUtil.readBytesWithShortLength(bb1);
-            ByteBuffer v2 = ByteBufferUtil.readBytesWithShortLength(bb2);
+            ByteBuffer v1 = CollectionSerializer.readValue(bb1, 3);
+            ByteBuffer v2 = CollectionSerializer.readValue(bb2, 3);
             cmp = values.compare(v1, v2);
             if (cmp != 0)
                 return cmp;
@@ -110,31 +109,29 @@
     }
 
     @Override
-    public TypeSerializer<Map<K, V>> getSerializer()
+    public MapSerializer<K, V> getSerializer()
     {
         return serializer;
     }
 
+    public boolean isByteOrderComparable()
+    {
+        return keys.isByteOrderComparable();
+    }
+
     protected void appendToStringBuilder(StringBuilder sb)
     {
         sb.append(getClass().getName()).append(TypeParser.stringifyTypeParameters(Arrays.asList(keys, values)));
     }
 
-    /**
-     * Creates the same output than serialize, but from the internal representation.
-     */
-    public ByteBuffer serialize(List<Pair<ByteBuffer, Column>> columns)
+    public List<ByteBuffer> serializedValues(List<Cell> cells)
     {
-        columns = enforceLimit(columns);
-
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(2 * columns.size());
-        int size = 0;
-        for (Pair<ByteBuffer, Column> p : columns)
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size() * 2);
+        for (Cell c : cells)
         {
-            bbs.add(p.left);
-            bbs.add(p.right.value());
-            size += 4 + p.left.remaining() + p.right.value().remaining();
+            bbs.add(c.name().collectionElement());
+            bbs.add(c.value());
         }
-        return pack(bbs, columns.size(), size);
+        return bbs;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
index ffb0229..1323dc6 100644
--- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
@@ -50,7 +50,7 @@
             type = new ReversedType<T>(baseType);
             instances.put(baseType, type);
         }
-        return (ReversedType<T>) type;
+        return type;
     }
 
     private ReversedType(AbstractType<T> baseType)
diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java
index 614ecc7..d2f7f12 100644
--- a/src/java/org/apache/cassandra/db/marshal/SetType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SetType.java
@@ -20,12 +20,11 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.SetSerializer;
-import org.apache.cassandra.utils.Pair;
 
 public class SetType<T> extends CollectionType<Set<T>>
 {
@@ -78,27 +77,26 @@
         return ListType.compareListOrSet(elements, o1, o2);
     }
 
-    public TypeSerializer<Set<T>> getSerializer()
+    public SetSerializer<T> getSerializer()
     {
         return serializer;
     }
 
+    public boolean isByteOrderComparable()
+    {
+        return elements.isByteOrderComparable();
+    }
+
     protected void appendToStringBuilder(StringBuilder sb)
     {
         sb.append(getClass().getName()).append(TypeParser.stringifyTypeParameters(Collections.<AbstractType<?>>singletonList(elements)));
     }
 
-    public ByteBuffer serialize(List<Pair<ByteBuffer, Column>> columns)
+    public List<ByteBuffer> serializedValues(List<Cell> cells)
     {
-        columns = enforceLimit(columns);
-
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(columns.size());
-        int size = 0;
-        for (Pair<ByteBuffer, Column> p : columns)
-        {
-            bbs.add(p.left);
-            size += 2 + p.left.remaining();
-        }
-        return pack(bbs, columns.size(), size);
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size());
+        for (Cell c : cells)
+            bbs.add(c.name().collectionElement());
+        return bbs;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
index fa82f06..88dc211 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
@@ -41,14 +41,9 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        if (o1.remaining() == 0)
-        {
-            return o2.remaining() == 0 ? 0 : -1;
-        }
-        if (o2.remaining() == 0)
-        {
-            return 1;
-        }
+        if (!o1.hasRemaining() || !o2.hasRemaining())
+            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
+
         int res = compareTimestampBytes(o1, o2);
         if (res != 0)
             return res;
diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java
index 12457e9..42aaba1 100644
--- a/src/java/org/apache/cassandra/db/marshal/TupleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java
@@ -72,8 +72,7 @@
         ByteBuffer bb1 = o1.duplicate();
         ByteBuffer bb2 = o2.duplicate();
 
-        int i = 0;
-        while (bb1.remaining() > 0 && bb2.remaining() > 0)
+        for (int i = 0; bb1.remaining() > 0 && bb2.remaining() > 0; i++)
         {
             AbstractType<?> comparator = types.get(i);
 
@@ -95,8 +94,6 @@
             int cmp = comparator.compare(value1, value2);
             if (cmp != 0)
                 return cmp;
-
-            ++i;
         }
 
         if (bb1.remaining() == 0)
@@ -120,9 +117,10 @@
                 throw new MarshalException(String.format("Not enough bytes to read size of %dth component", i));
 
             int size = input.getInt();
-            // We don't handle null just yet, but we should fix that soon (CASSANDRA-7206)
+
+            // size < 0 means null value
             if (size < 0)
-                throw new MarshalException("Nulls are not yet supported inside tuple values");
+                continue;
 
             if (input.remaining() < size)
                 throw new MarshalException(String.format("Not enough bytes to read %dth component", i));
@@ -158,13 +156,20 @@
     {
         int totalLength = 0;
         for (ByteBuffer component : components)
-            totalLength += 4 + component.remaining();
+            totalLength += 4 + (component == null ? 0 : component.remaining());
 
         ByteBuffer result = ByteBuffer.allocate(totalLength);
         for (ByteBuffer component : components)
         {
-            result.putInt(component.remaining());
-            result.put(component.duplicate());
+            if (component == null)
+            {
+                result.putInt(-1);
+            }
+            else
+            {
+                result.putInt(component.remaining());
+                result.put(component.duplicate());
+            }
         }
         result.rewind();
         return result;
@@ -183,12 +188,17 @@
             if (i > 0)
                 sb.append(":");
 
+            AbstractType<?> type = type(i);
             int size = input.getInt();
-            assert size >= 0; // We don't support nulls yet, but we will likely do with #7206 and we'll need
-                              // a way to represent it as a string (without it conflicting with a user value)
+            if (size < 0)
+            {
+                sb.append("@");
+                continue;
+            }
+
             ByteBuffer field = ByteBufferUtil.readBytes(input, size);
-            // We use ':' as delimiter so escape it if it's in the generated string
-            sb.append(field == null ? "null" : type(i).getString(value).replaceAll(":", "\\\\:"));
+            // We use ':' as delimiter, and @ to represent null, so escape them in the generated string
+            sb.append(type.getString(field).replaceAll(":", "\\\\:").replaceAll("@", "\\\\@"));
         }
         return sb.toString();
     }
@@ -196,15 +206,19 @@
     public ByteBuffer fromString(String source)
     {
         // Split the input on non-escaped ':' characters
-        List<String> strings = AbstractCompositeType.split(source);
-        ByteBuffer[] components = new ByteBuffer[strings.size()];
-        for (int i = 0; i < strings.size(); i++)
+        List<String> fieldStrings = AbstractCompositeType.split(source);
+        ByteBuffer[] fields = new ByteBuffer[fieldStrings.size()];
+        for (int i = 0; i < fieldStrings.size(); i++)
         {
-            // TODO: we'll need to handle null somehow here once we support them
-            String str = strings.get(i).replaceAll("\\\\:", ":");
-            components[i] = type(i).fromString(str);
+            String fieldString = fieldStrings.get(i);
+            // We use @ to represent nulls
+            if (fieldString.equals("@"))
+                continue;
+
+            AbstractType<?> type = type(i);
+            fields[i] = type.fromString(fieldString.replaceAll("\\\\:", ":").replaceAll("\\\\@", "@"));
         }
-        return buildValue(components);
+        return buildValue(fields);
     }
 
     public TypeSerializer<ByteBuffer> getSerializer()
@@ -271,9 +285,14 @@
     }
 
     @Override
+    public CQL3Type asCQL3Type()
+    {
+        return CQL3Type.Tuple.create(this);
+    }
+
+    @Override
     public String toString()
     {
         return getClass().getName() + TypeParser.stringifyTypeParameters(types);
     }
 }
-
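
With null support inside tuples, each component is framed as a 4-byte length followed by its bytes, and a null component is written as length -1 with no bytes; the string form renders null as '@' and escapes literal ':' and '@'. A standalone sketch mirroring the framing side of the patch (it re-implements the layout rather than calling into TupleType):

    import java.nio.ByteBuffer;

    public class TupleFramingSketch
    {
        // Each component is [int length][bytes]; a null component is written as
        // length -1 with no payload, matching the patched buildValue.
        static ByteBuffer build(ByteBuffer... components)
        {
            int total = 0;
            for (ByteBuffer c : components)
                total += 4 + (c == null ? 0 : c.remaining());

            ByteBuffer out = ByteBuffer.allocate(total);
            for (ByteBuffer c : components)
            {
                if (c == null)
                {
                    out.putInt(-1);
                }
                else
                {
                    out.putInt(c.remaining());
                    out.put(c.duplicate());
                }
            }
            out.flip();
            return out;
        }

        public static void main(String[] args)
        {
            ByteBuffer value = build(ByteBuffer.wrap("abc".getBytes()), null);
            System.out.println(value.remaining()); // 4 + 3 + 4 = 11
        }
    }
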
diff --git a/src/java/org/apache/cassandra/db/marshal/TypeParser.java b/src/java/org/apache/cassandra/db/marshal/TypeParser.java
index 1330a40..1b83180 100644
--- a/src/java/org/apache/cassandra/db/marshal/TypeParser.java
+++ b/src/java/org/apache/cassandra/db/marshal/TypeParser.java
@@ -32,6 +32,7 @@
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
 
 /**
  * Parse a string containing an Type definition.
@@ -258,16 +259,7 @@
                 return map;
             }
 
-            String bbHex = readNextIdentifier();
-            ByteBuffer bb = null;
-            try
-            {
-                 bb = ByteBufferUtil.hexToBytes(bbHex);
-            }
-            catch (NumberFormatException e)
-            {
-                throwSyntaxError(e.getMessage());
-            }
+            ByteBuffer bb = fromHex(readNextIdentifier());
 
             skipBlank();
             if (str.charAt(idx) != ':')
@@ -292,6 +284,62 @@
         throw new SyntaxException(String.format("Syntax error parsing '%s' at char %d: unexpected end of string", str, idx));
     }
 
+    private ByteBuffer fromHex(String hex) throws SyntaxException
+    {
+        try
+        {
+            return ByteBufferUtil.hexToBytes(hex);
+        }
+        catch (NumberFormatException e)
+        {
+            throwSyntaxError(e.getMessage());
+            return null;
+        }
+    }
+
+    public Pair<Pair<String, ByteBuffer>, List<Pair<ByteBuffer, AbstractType>>> getUserTypeParameters() throws SyntaxException, ConfigurationException
+    {
+
+        if (isEOS() || str.charAt(idx) != '(')
+            throw new IllegalStateException();
+
+        ++idx; // skipping '('
+
+        skipBlankAndComma();
+        String keyspace = readNextIdentifier();
+        skipBlankAndComma();
+        ByteBuffer typeName = fromHex(readNextIdentifier());
+        List<Pair<ByteBuffer, AbstractType>> defs = new ArrayList<>();
+
+        while (skipBlankAndComma())
+        {
+            if (str.charAt(idx) == ')')
+            {
+                ++idx;
+                return Pair.create(Pair.create(keyspace, typeName), defs);
+            }
+
+            ByteBuffer name = fromHex(readNextIdentifier());
+            skipBlank();
+            if (str.charAt(idx) != ':')
+                throwSyntaxError("expecting ':' token");
+            ++idx;
+            skipBlank();
+            try
+            {
+                AbstractType type = parse();
+                defs.add(Pair.create(name, type));
+            }
+            catch (SyntaxException e)
+            {
+                SyntaxException ex = new SyntaxException(String.format("Exception while parsing '%s' around char %d", str, idx));
+                ex.initCause(e);
+                throw ex;
+            }
+        }
+        throw new SyntaxException(String.format("Syntax error parsing '%s' at char %d: unexpected end of string", str, idx));
+    }
+
     private static AbstractType<?> getAbstractType(String compareWith) throws ConfigurationException
     {
         String className = compareWith.contains(".") ? compareWith : "org.apache.cassandra.db.marshal." + compareWith;
@@ -514,4 +562,19 @@
         sb.append(')');
         return sb.toString();
     }
+
+    public static String stringifyUserTypeParameters(String keyspace, ByteBuffer typeName, List<ByteBuffer> columnNames, List<AbstractType<?>> columnTypes)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append('(').append(keyspace).append(",").append(ByteBufferUtil.bytesToHex(typeName));
+
+        for (int i = 0; i < columnNames.size(); i++)
+        {
+            sb.append(',');
+            sb.append(ByteBufferUtil.bytesToHex(columnNames.get(i))).append(":");
+            sb.append(columnTypes.get(i).toString());
+        }
+        sb.append(')');
+        return sb.toString();
+    }
 }
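
getUserTypeParameters and stringifyUserTypeParameters are inverses over a simple textual form, '(keyspace,<hex type name>,<hex field name>:<field type>,...)', with the type and field names hex-encoded so arbitrary bytes survive round-tripping. A hypothetical example of what a UserType's toString produces under this scheme (the keyspace, type and field names are invented):

    org.apache.cassandra.db.marshal.UserType(ks1,61646472657373,737472656574:org.apache.cassandra.db.marshal.UTF8Type,7a6970:org.apache.cassandra.db.marshal.Int32Type)

Here 61646472657373 is the hex encoding of "address", 737472656574 of "street", and 7a6970 of "zip".
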
diff --git a/src/java/org/apache/cassandra/db/marshal/UTF8Type.java b/src/java/org/apache/cassandra/db/marshal/UTF8Type.java
index e763316..6d58db2 100644
--- a/src/java/org/apache/cassandra/db/marshal/UTF8Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/UTF8Type.java
@@ -22,6 +22,7 @@
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.UTF8Serializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class UTF8Type extends AbstractType<String>
 {
@@ -31,7 +32,7 @@
 
     public int compare(ByteBuffer o1, ByteBuffer o2)
     {
-        return BytesType.bytesCompare(o1, o2);
+        return ByteBufferUtil.compareUnsigned(o1, o2);
     }
 
     public ByteBuffer fromString(String source)
@@ -47,6 +48,11 @@
         return this == previous || previous == AsciiType.instance;
     }
 
+    public boolean isByteOrderComparable()
+    {
+        return true;
+    }
+
     public CQL3Type asCQL3Type()
     {
         return CQL3Type.Native.TEXT;
diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
index b2caa04..c4ce1d1 100644
--- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
@@ -204,4 +204,5 @@
     {
         return UUIDSerializer.instance;
     }
+
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/UserType.java b/src/java/org/apache/cassandra/db/marshal/UserType.java
new file mode 100644
index 0000000..44c208f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/UserType.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.serializers.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * A user defined type.
+ *
+ * A user type is really just a tuple type on steroids.
+ */
+public class UserType extends TupleType
+{
+    public final String keyspace;
+    public final ByteBuffer name;
+    private final List<ByteBuffer> fieldNames;
+
+    public UserType(String keyspace, ByteBuffer name, List<ByteBuffer> fieldNames, List<AbstractType<?>> fieldTypes)
+    {
+        super(fieldTypes);
+        assert fieldNames.size() == fieldTypes.size();
+        this.keyspace = keyspace;
+        this.name = name;
+        this.fieldNames = fieldNames;
+    }
+
+    public static UserType getInstance(TypeParser parser) throws ConfigurationException, SyntaxException
+    {
+        Pair<Pair<String, ByteBuffer>, List<Pair<ByteBuffer, AbstractType>>> params = parser.getUserTypeParameters();
+        String keyspace = params.left.left;
+        ByteBuffer name = params.left.right;
+        List<ByteBuffer> columnNames = new ArrayList<>(params.right.size());
+        List<AbstractType<?>> columnTypes = new ArrayList<>(params.right.size());
+        for (Pair<ByteBuffer, AbstractType> p : params.right)
+        {
+            columnNames.add(p.left);
+            columnTypes.add(p.right);
+        }
+        return new UserType(keyspace, name, columnNames, columnTypes);
+    }
+
+    public AbstractType<?> fieldType(int i)
+    {
+        return type(i);
+    }
+
+    public List<AbstractType<?>> fieldTypes()
+    {
+        return types;
+    }
+
+    public ByteBuffer fieldName(int i)
+    {
+        return fieldNames.get(i);
+    }
+
+    public List<ByteBuffer> fieldNames()
+    {
+        return fieldNames;
+    }
+
+    public String getNameAsString()
+    {
+        return UTF8Type.instance.compose(name);
+    }
+
+    // Note: the only reason we override this is to provide a nicer error message, but since that's not that much code...
+    @Override
+    public void validate(ByteBuffer bytes) throws MarshalException
+    {
+        ByteBuffer input = bytes.duplicate();
+        for (int i = 0; i < size(); i++)
+        {
+            // we allow the input to have fewer fields than declared so as to support field addition.
+            if (!input.hasRemaining())
+                return;
+
+            if (input.remaining() < 4)
+                throw new MarshalException(String.format("Not enough bytes to read size of %dth field %s", i, fieldName(i)));
+
+            int size = input.getInt();
+
+            // size < 0 means null value
+            if (size < 0)
+                continue;
+
+            if (input.remaining() < size)
+                throw new MarshalException(String.format("Not enough bytes to read %dth field %s", i, fieldName(i)));
+
+            ByteBuffer field = ByteBufferUtil.readBytes(input, size);
+            types.get(i).validate(field);
+        }
+
+        // We're allowed to get fewer fields than declared, but not more
+        if (input.hasRemaining())
+            throw new MarshalException("Invalid remaining data after end of UDT value");
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(keyspace, name, fieldNames, types);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if(!(o instanceof UserType))
+            return false;
+
+        UserType that = (UserType)o;
+        return keyspace.equals(that.keyspace) && name.equals(that.name) && fieldNames.equals(that.fieldNames) && types.equals(that.types);
+    }
+
+    @Override
+    public CQL3Type asCQL3Type()
+    {
+        return CQL3Type.UserDefined.create(this);
+    }
+
+    @Override
+    public String toString()
+    {
+        return getClass().getName() + TypeParser.stringifyUserTypeParameters(keyspace, name, fieldNames, types);
+    }
+}
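
A brief, hedged sketch of building and inspecting a UserType through the constructor and accessors added above; the keyspace, type and field names are invented, and ByteBufferUtil.bytes is assumed to be the usual String-to-ByteBuffer helper:

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.List;

    import org.apache.cassandra.db.marshal.AbstractType;
    import org.apache.cassandra.db.marshal.Int32Type;
    import org.apache.cassandra.db.marshal.UTF8Type;
    import org.apache.cassandra.db.marshal.UserType;
    import org.apache.cassandra.utils.ByteBufferUtil;

    public class UserTypeSketch
    {
        public static void main(String[] args)
        {
            List<ByteBuffer> names = Arrays.asList(ByteBufferUtil.bytes("street"),
                                                   ByteBufferUtil.bytes("zip"));
            List<AbstractType<?>> types = Arrays.<AbstractType<?>>asList(UTF8Type.instance, Int32Type.instance);

            // "ks1" and "address" are hypothetical identifiers, not taken from the patch.
            UserType address = new UserType("ks1", ByteBufferUtil.bytes("address"), names, types);

            System.out.println(address.getNameAsString()); // address
            System.out.println(address.fieldType(1));       // prints the Int32Type class name
        }
    }
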
diff --git a/src/java/org/apache/cassandra/dht/AbstractBounds.java b/src/java/org/apache/cassandra/dht/AbstractBounds.java
index 849a841..b69f5ee 100644
--- a/src/java/org/apache/cassandra/dht/AbstractBounds.java
+++ b/src/java/org/apache/cassandra/dht/AbstractBounds.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.dht;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.*;
@@ -28,6 +27,7 @@
 import org.apache.cassandra.db.RowPosition;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.Pair;
 
 public abstract class AbstractBounds<T extends RingPosition> implements Serializable
@@ -99,7 +99,7 @@
     {
         if (value instanceof DecoratedKey)
         {
-            return keyValidator.getString(((DecoratedKey)value).key);
+            return keyValidator.getString(((DecoratedKey)value).getKey());
         }
         else
         {
@@ -126,7 +126,7 @@
 
     public static class AbstractBoundsSerializer implements IVersionedSerializer<AbstractBounds<?>>
     {
-        public void serialize(AbstractBounds<?> range, DataOutput out, int version) throws IOException
+        public void serialize(AbstractBounds<?> range, DataOutputPlus out, int version) throws IOException
         {
             /*
              * The first int tells us if it's a range or bounds (depending on the value) _and_ if it's tokens or keys (depending on the
diff --git a/src/java/org/apache/cassandra/dht/AbstractByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/AbstractByteOrderedPartitioner.java
index 5858a4a..94be94d 100644
--- a/src/java/org/apache/cassandra/dht/AbstractByteOrderedPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/AbstractByteOrderedPartitioner.java
@@ -22,6 +22,7 @@
 import java.util.*;
 
 import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.commons.lang3.ArrayUtils;
 
@@ -42,7 +43,7 @@
 
     public DecoratedKey decorateKey(ByteBuffer key)
     {
-        return new DecoratedKey(getToken(key), key);
+        return new BufferDecoratedKey(getToken(key), key);
     }
 
     public BytesToken midpoint(Token ltoken, Token rtoken)
@@ -198,7 +199,7 @@
                 for (Range<Token> r : sortedRanges)
                 {
                     // Looping over every KS:CF:Range, get the splits size and add it to the count
-                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1, cfmd).size());
+                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1).size());
                 }
             }
         }
diff --git a/src/java/org/apache/cassandra/dht/BootStrapper.java b/src/java/org/apache/cassandra/dht/BootStrapper.java
index b35d222..cbbd100 100644
--- a/src/java/org/apache/cassandra/dht/BootStrapper.java
+++ b/src/java/org/apache/cassandra/dht/BootStrapper.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.dht;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.*;
@@ -34,6 +33,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.service.StorageService;
@@ -63,7 +63,7 @@
         if (logger.isDebugEnabled())
             logger.debug("Beginning bootstrap process");
 
-        RangeStreamer streamer = new RangeStreamer(tokenMetadata, address, "Bootstrap");
+        RangeStreamer streamer = new RangeStreamer(tokenMetadata, tokens, address, "Bootstrap");
         streamer.addSourceFilter(new RangeStreamer.FailureDetectorSourceFilter(FailureDetector.instance));
 
         for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
@@ -136,7 +136,7 @@
     {
         public static final StringSerializer instance = new StringSerializer();
 
-        public void serialize(String s, DataOutput out, int version) throws IOException
+        public void serialize(String s, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(s);
         }
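
Serializer signatures across the codebase move from java.io.DataOutput to the project's DataOutputPlus, as in StringSerializer above. A hedged sketch of what an IVersionedSerializer implementation looks like after the change, assuming the interface keeps its serialize/deserialize/serializedSize trio; the Ping message type is invented for illustration:

    import java.io.DataInput;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    import org.apache.cassandra.io.IVersionedSerializer;
    import org.apache.cassandra.io.util.DataOutputPlus;

    // Hypothetical message type, used only to show the reshaped serializer signature.
    class Ping
    {
        final String payload;
        Ping(String payload) { this.payload = payload; }

        static final IVersionedSerializer<Ping> serializer = new IVersionedSerializer<Ping>()
        {
            // Output side now takes DataOutputPlus instead of DataOutput.
            public void serialize(Ping ping, DataOutputPlus out, int version) throws IOException
            {
                out.writeUTF(ping.payload);
            }

            public Ping deserialize(DataInput in, int version) throws IOException
            {
                return new Ping(in.readUTF());
            }

            public long serializedSize(Ping ping, int version)
            {
                // Rough estimate: writeUTF's 2-byte length prefix plus the encoded bytes.
                return 2 + ping.payload.getBytes(StandardCharsets.UTF_8).length;
            }
        };
    }
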
diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
index ccf7089..f46026f 100644
--- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
@@ -17,14 +17,24 @@
  */
 package org.apache.cassandra.dht;
 
+import org.apache.cassandra.utils.ObjectSizes;
+
 import java.nio.ByteBuffer;
 
 public class ByteOrderedPartitioner extends AbstractByteOrderedPartitioner
 {
+    private static final long EMPTY_SIZE = ObjectSizes.measure(MINIMUM);
+
     public BytesToken getToken(ByteBuffer key)
     {
         if (key.remaining() == 0)
             return MINIMUM;
         return new BytesToken(key);
     }
+
+    @Override
+    public long getHeapSizeOf(BytesToken token)
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOfArray(token.token);
+    }
 }
diff --git a/src/java/org/apache/cassandra/dht/IPartitioner.java b/src/java/org/apache/cassandra/dht/IPartitioner.java
index 46165b8..0ef242f 100644
--- a/src/java/org/apache/cassandra/dht/IPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/IPartitioner.java
@@ -56,6 +56,13 @@
     public T getToken(ByteBuffer key);
 
     /**
+     *
+     * @param token the token whose heap size is measured
+     * @return the on-heap memory used by the provided token
+     */
+    public long getHeapSizeOf(T token);
+
+    /**
      * @return a randomly generated token
      */
     public T getRandomToken();
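
Each partitioner now answers getHeapSizeOf(token) so callers can account for token memory. The pattern in this patch is to measure an empty token once with ObjectSizes and add any per-token payload; a schematic, self-contained sketch of the fixed-size case, where the hard-coded size stands in for what ObjectSizes would report:

    // Schematic of the measure-once pattern: pay for measurement a single time per
    // class, then answer getHeapSizeOf in constant time for fixed-width tokens.
    public class HeapSizeSketch
    {
        static final class FixedToken
        {
            final long value;
            FixedToken(long value) { this.value = value; }
        }

        // Stand-in for a one-time ObjectSizes measurement: header + one long field.
        private static final long EMPTY_SIZE = 24;

        static long getHeapSizeOf(FixedToken token)
        {
            return EMPTY_SIZE; // fixed-width tokens cost the same regardless of value
        }

        public static void main(String[] args)
        {
            System.out.println(getHeapSizeOf(new FixedToken(42L)));
        }
    }

Variable-size tokens (ByteOrderedPartitioner above, OrderPreservingPartitioner below) instead add the measured size of their byte array or string on top of the empty-token baseline.
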
diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
index b24eede..c32e792 100644
--- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
@@ -22,12 +22,16 @@
 import java.util.List;
 import java.util.Map;
 
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
 
 public class LocalPartitioner extends AbstractPartitioner<LocalToken>
 {
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new LocalToken(null, null));
+
     private final AbstractType<?> comparator;
 
     public LocalPartitioner(AbstractType<?> comparator)
@@ -37,7 +41,7 @@
 
     public DecoratedKey decorateKey(ByteBuffer key)
     {
-        return new DecoratedKey(getToken(key), key);
+        return new BufferDecoratedKey(getToken(key), key);
     }
 
     public Token midpoint(Token left, Token right)
@@ -55,6 +59,11 @@
         return new LocalToken(comparator, key);
     }
 
+    public long getHeapSizeOf(LocalToken token)
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(token.token);
+    }
+
     public LocalToken getRandomToken()
     {
         throw new UnsupportedOperationException();
diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
index 3a045d7..2bb0423 100644
--- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
+++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
@@ -24,7 +24,9 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ThreadLocalRandom;
 
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.LongType;
@@ -32,6 +34,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MurmurHash;
+import org.apache.cassandra.utils.ObjectSizes;
 
 /**
  * This class generates a BigIntegerToken using a Murmur3 hash.
@@ -41,9 +44,11 @@
     public static final LongToken MINIMUM = new LongToken(Long.MIN_VALUE);
     public static final long MAXIMUM = Long.MAX_VALUE;
 
+    private static final int HEAP_SIZE = (int) ObjectSizes.measureDeep(MINIMUM);
+
     public DecoratedKey decorateKey(ByteBuffer key)
     {
-        return new DecoratedKey(getToken(key), key);
+        return new BufferDecoratedKey(getToken(key), key);
     }
 
     public Token midpoint(Token lToken, Token rToken)
@@ -94,9 +99,14 @@
         return new LongToken(normalize(hash[0]));
     }
 
+    public long getHeapSizeOf(LongToken token)
+    {
+        return HEAP_SIZE;
+    }
+
     public LongToken getRandomToken()
     {
-        return new LongToken(normalize(FBUtilities.threadLocalRandom().nextLong()));
+        return new LongToken(normalize(ThreadLocalRandom.current().nextLong()));
     }
 
     private long normalize(long v)
diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
index 4c5a7d4..ad1ffaa 100644
--- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
@@ -23,6 +23,7 @@
 import java.util.*;
 
 import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.UTF8Type;
@@ -31,6 +32,7 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.Pair;
 
 public class OrderPreservingPartitioner extends AbstractPartitioner<StringToken>
@@ -39,9 +41,11 @@
 
     public static final BigInteger CHAR_MASK = new BigInteger("65535");
 
+    private static final long EMPTY_SIZE = ObjectSizes.measure(MINIMUM);
+
     public DecoratedKey decorateKey(ByteBuffer key)
     {
-        return new DecoratedKey(getToken(key), key);
+        return new BufferDecoratedKey(getToken(key), key);
     }
 
     public StringToken midpoint(Token ltoken, Token rtoken)
@@ -169,6 +173,11 @@
         return new StringToken(skey);
     }
 
+    public long getHeapSizeOf(StringToken token)
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOf(token.token);
+    }
+
     public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
     {
         // allTokens will contain the count and be returned, sorted_ranges is shorthand for token<->token math.
@@ -191,7 +200,7 @@
                 for (Range<Token> r : sortedRanges)
                 {
                     // Looping over every KS:CF:Range, get the splits size and add it to the count
-                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, cfmd.getIndexInterval(), cfmd).size());
+                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, cfmd.getMinIndexInterval()).size());
                 }
             }
         }
diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
index c9ddccf..3ccd086 100644
--- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
@@ -22,6 +22,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -29,6 +30,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.GuidGenerator;
+import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -40,9 +42,11 @@
     public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1");
     public static final BigInteger MAXIMUM = new BigInteger("2").pow(127);
 
+    private static final int EMPTY_SIZE = (int) ObjectSizes.measureDeep(new BigIntegerToken(FBUtilities.hashToBigInteger(ByteBuffer.allocate(1))));
+
     public DecoratedKey decorateKey(ByteBuffer key)
     {
-        return new DecoratedKey(getToken(key), key);
+        return new BufferDecoratedKey(getToken(key), key);
     }
 
     public Token midpoint(Token ltoken, Token rtoken)
@@ -123,6 +127,11 @@
         return new BigIntegerToken(FBUtilities.hashToBigInteger(key));
     }
 
+    public long getHeapSizeOf(BigIntegerToken token)
+    {
+        return EMPTY_SIZE;
+    }
+
     public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
     {
         Map<Token, Float> ownerships = new HashMap<Token, Float>();
diff --git a/src/java/org/apache/cassandra/dht/RangeStreamer.java b/src/java/org/apache/cassandra/dht/RangeStreamer.java
index 1e6d9b8..d84a951 100644
--- a/src/java/org/apache/cassandra/dht/RangeStreamer.java
+++ b/src/java/org/apache/cassandra/dht/RangeStreamer.java
@@ -23,6 +23,8 @@
 import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+import org.apache.cassandra.gms.EndpointState;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -30,6 +32,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.IFailureDetector;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.IEndpointSnitch;
@@ -44,7 +47,8 @@
 public class RangeStreamer
 {
     private static final Logger logger = LoggerFactory.getLogger(RangeStreamer.class);
-
+    public static final boolean useStrictConsistency = Boolean.valueOf(System.getProperty("cassandra.consistent.rangemovement","true"));
+    private final Collection<Token> tokens;
     private final TokenMetadata metadata;
     private final InetAddress address;
     private final String description;
@@ -99,9 +103,19 @@
         }
     }
 
+    public RangeStreamer(TokenMetadata metadata, Collection<Token> tokens, InetAddress address, String description)
+    {
+        this.metadata = metadata;
+        this.tokens = tokens;
+        this.address = address;
+        this.description = description;
+        this.streamPlan = new StreamPlan(description);
+    }
+
     public RangeStreamer(TokenMetadata metadata, InetAddress address, String description)
     {
         this.metadata = metadata;
+        this.tokens = null;
         this.address = address;
         this.description = description;
         this.streamPlan = new StreamPlan(description);
@@ -114,11 +128,12 @@
 
     public void addRanges(String keyspaceName, Collection<Range<Token>> ranges)
     {
-        Multimap<Range<Token>, InetAddress> rangesForKeyspace = getAllRangesWithSourcesFor(keyspaceName, ranges);
+        Multimap<Range<Token>, InetAddress> rangesForKeyspace = useStrictSourcesForRanges(keyspaceName)
+                ? getAllRangesWithStrictSourcesFor(keyspaceName, ranges) : getAllRangesWithSourcesFor(keyspaceName, ranges);
 
         if (logger.isDebugEnabled())
         {
-            for (Map.Entry<Range<Token>, InetAddress> entry: rangesForKeyspace.entries())
+            for (Map.Entry<Range<Token>, InetAddress> entry : rangesForKeyspace.entries())
                 logger.debug(String.format("%s: range %s exists on %s", description, entry.getKey(), entry.getValue()));
         }
 
@@ -133,6 +148,15 @@
         }
     }
 
+    private boolean useStrictSourcesForRanges(String keyspaceName)
+    {
+        AbstractReplicationStrategy strat = Keyspace.open(keyspaceName).getReplicationStrategy();
+        return !DatabaseDescriptor.isReplacing()
+                && useStrictConsistency
+                && tokens != null
+                && metadata.getAllEndpoints().size() != strat.getReplicationFactor();
+    }
+
     /**
      * Get a map of all ranges and their respective sources that are candidates for streaming the given ranges
      * to us. For each range, the list of sources is sorted by proximity relative to the given destAddress.
@@ -163,6 +187,66 @@
     }
 
     /**
+     * Get a map of all ranges and the source that will be cleaned up once this bootstrapped node is added for the given ranges.
+     * For each range, the list should only contain a single source. This allows us to migrate data without
+     * violating consistency.
+     */
+    private Multimap<Range<Token>, InetAddress> getAllRangesWithStrictSourcesFor(String table, Collection<Range<Token>> desiredRanges)
+    {
+
+        assert tokens != null;
+        AbstractReplicationStrategy strat = Keyspace.open(table).getReplicationStrategy();
+
+        //Active ranges
+        TokenMetadata metadataClone = metadata.cloneOnlyTokenMap();
+        Multimap<Range<Token>,InetAddress> addressRanges = strat.getRangeAddresses(metadataClone);
+
+        //Pending ranges
+        metadataClone.updateNormalTokens(tokens, address);
+        Multimap<Range<Token>,InetAddress> pendingRangeAddresses = strat.getRangeAddresses(metadataClone);
+
+        //Collects the source that will have its range moved to the new node
+        Multimap<Range<Token>, InetAddress> rangeSources = ArrayListMultimap.create();
+
+        for (Range<Token> desiredRange : desiredRanges)
+        {
+            for (Map.Entry<Range<Token>, Collection<InetAddress>> preEntry : addressRanges.asMap().entrySet())
+            {
+                if (preEntry.getKey().contains(desiredRange))
+                {
+                    Set<InetAddress> oldEndpoints = Sets.newHashSet(preEntry.getValue());
+                    Set<InetAddress> newEndpoints = Sets.newHashSet(pendingRangeAddresses.get(desiredRange));
+
+                    //Due to CASSANDRA-5953 we can have a higher RF than we have endpoints.
+                    //So we need to be careful to only be strict when endpoints == RF
+                    if (oldEndpoints.size() == strat.getReplicationFactor())
+                    {
+                        oldEndpoints.removeAll(newEndpoints);
+                        assert oldEndpoints.size() == 1 : "Expected 1 endpoint but found " + oldEndpoints.size();
+                    }
+
+                    rangeSources.put(desiredRange, oldEndpoints.iterator().next());
+                }
+            }
+
+            //Validate
+            Collection<InetAddress> addressList = rangeSources.get(desiredRange);
+            if (addressList == null || addressList.isEmpty())
+                throw new IllegalStateException("No sources found for " + desiredRange);
+
+            if (addressList.size() > 1)
+                throw new IllegalStateException("Multiple endpoints found for " + desiredRange);
+
+            InetAddress sourceIp = addressList.iterator().next();
+            EndpointState sourceState = Gossiper.instance.getEndpointStateForEndpoint(sourceIp);
+            if (Gossiper.instance.isEnabled() && (sourceState == null || !sourceState.isAlive()))
+                throw new RuntimeException("A node required to move the data consistently is down ("+sourceIp+").  If you wish to move the data from a potentially inconsistent replica, restart the node with -Dcassandra.consistent.rangemovement=false");
+        }
+
+        return rangeSources;
+    }
+
+    /**
      * @param rangesWithSources The ranges we want to fetch (key) and their potential sources (value)
      * @param sourceFilters A (possibly empty) collection of source filters to apply. In addition to any filters given
      *                      here, we always exclude ourselves.
@@ -224,7 +308,7 @@
             Collection<Range<Token>> ranges = entry.getValue().getValue();
             /* Send messages to respective folks to stream data over to me */
             if (logger.isDebugEnabled())
-                logger.debug("" + description + "ing from " + source + " ranges " + StringUtils.join(ranges, ", "));
+                logger.debug("{}ing from {} ranges {}", description, source, StringUtils.join(ranges, ", "));
             streamPlan.requestRanges(source, keyspace, ranges);
         }
 
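
Bootstrap now defaults to strict range movement: each desired range is streamed from the single replica that will cede it, and a dead source fails the bootstrap unless the operator opts out. The opt-out is an ordinary system property, set per node at startup (e.g. -Dcassandra.consistent.rangemovement=false); a minimal sketch of how the flag is read:

    public class RangeMovementFlag
    {
        public static void main(String[] args)
        {
            // Same parsing as the patch: any value other than (case-insensitive) "true"
            // disables strict consistency; the default is "true".
            boolean strict = Boolean.valueOf(System.getProperty("cassandra.consistent.rangemovement", "true"));
            System.out.println(strict);
        }
    }

When the flag is false, addRanges falls back to the old proximity-sorted source selection instead of the strict single-source path.
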
diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index 771f833..09b73b7 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.dht;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.nio.ByteBuffer;
@@ -27,6 +26,7 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.RowPosition;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -83,7 +83,7 @@
 
     public static class TokenSerializer implements ISerializer<Token>
     {
-        public void serialize(Token token, DataOutput out) throws IOException
+        public void serialize(Token token, DataOutputPlus out) throws IOException
         {
             IPartitioner p = StorageService.getPartitioner();
             ByteBuffer b = p.getTokenFactory().toByteArray(token);
@@ -173,7 +173,7 @@
             return (R)maxKeyBound();
     }
 
-    public static class KeyBound extends RowPosition
+    public static class KeyBound implements RowPosition
     {
         private final Token token;
         public final boolean isMinimumBound;
@@ -209,6 +209,11 @@
             return getToken().isMinimum(partitioner);
         }
 
+        public boolean isMinimum()
+        {
+            return isMinimum(StorageService.getPartitioner());
+        }
+
         public RowPosition.Kind kind()
         {
             return isMinimumBound ? RowPosition.Kind.MIN_BOUND : RowPosition.Kind.MAX_BOUND;
diff --git a/src/java/org/apache/cassandra/gms/EchoMessage.java b/src/java/org/apache/cassandra/gms/EchoMessage.java
index 46b572e..444278f 100644
--- a/src/java/org/apache/cassandra/gms/EchoMessage.java
+++ b/src/java/org/apache/cassandra/gms/EchoMessage.java
@@ -22,10 +22,10 @@
 
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class EchoMessage
 {
@@ -33,7 +33,7 @@
 
     public static class EchoMessageSerializer implements IVersionedSerializer<EchoMessage>
     {
-        public void serialize(EchoMessage t, DataOutput out, int version) throws IOException
+        public void serialize(EchoMessage t, DataOutputPlus out, int version) throws IOException
         {
         }
 
diff --git a/src/java/org/apache/cassandra/gms/EndpointState.java b/src/java/org/apache/cassandra/gms/EndpointState.java
index 3df9155..1029374 100644
--- a/src/java/org/apache/cassandra/gms/EndpointState.java
+++ b/src/java/org/apache/cassandra/gms/EndpointState.java
@@ -25,6 +25,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 import org.cliffc.high_scale_lib.NonBlockingHashMap;
 
@@ -121,7 +122,7 @@
 
 class EndpointStateSerializer implements IVersionedSerializer<EndpointState>
 {
-    public void serialize(EndpointState epState, DataOutput out, int version) throws IOException
+    public void serialize(EndpointState epState, DataOutputPlus out, int version) throws IOException
     {
         /* serialize the HeartBeatState */
         HeartBeatState hbState = epState.getHeartBeatState();
diff --git a/src/java/org/apache/cassandra/gms/FailureDetector.java b/src/java/org/apache/cassandra/gms/FailureDetector.java
index 60729d3..7490619 100644
--- a/src/java/org/apache/cassandra/gms/FailureDetector.java
+++ b/src/java/org/apache/cassandra/gms/FailureDetector.java
@@ -197,7 +197,7 @@
         // it's worth being defensive here so minor bugs don't cause disproportionate
         // badness.  (See CASSANDRA-1463 for an example).
         if (epState == null)
-            logger.error("unknown endpoint " + ep);
+            logger.error("unknown endpoint {}", ep);
         return epState != null && epState.isAlive();
     }
 
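
Several gms log statements are converted from string concatenation to SLF4J parameterized messages, so the message is only assembled when the statement is actually emitted. A minimal before/after sketch of the pattern (the logger here is arbitrary):

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class LoggingSketch
    {
        private static final Logger logger = LoggerFactory.getLogger(LoggingSketch.class);

        public static void main(String[] args)
        {
            String endpoint = "10.0.0.1";

            // Before: the concatenation happens even when DEBUG is disabled.
            logger.debug("evicting " + endpoint + " from gossip");

            // After: the message is only formatted if the statement is actually logged.
            logger.debug("evicting {} from gossip", endpoint);
        }
    }
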
diff --git a/src/java/org/apache/cassandra/gms/GossipDigest.java b/src/java/org/apache/cassandra/gms/GossipDigest.java
index 0191dad..471602e 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigest.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigest.java
@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
 /**
@@ -79,7 +80,7 @@
 
 class GossipDigestSerializer implements IVersionedSerializer<GossipDigest>
 {
-    public void serialize(GossipDigest gDigest, DataOutput out, int version) throws IOException
+    public void serialize(GossipDigest gDigest, DataOutputPlus out, int version) throws IOException
     {
         CompactEndpointSerializationHelper.serialize(gDigest.endpoint, out);
         out.writeInt(gDigest.generation);
diff --git a/src/java/org/apache/cassandra/gms/GossipDigestAck.java b/src/java/org/apache/cassandra/gms/GossipDigestAck.java
index c1445dd..e3be9aa 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestAck.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestAck.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.gms;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.HashMap;
@@ -27,6 +26,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
 /**
@@ -59,7 +59,7 @@
 
 class GossipDigestAckSerializer implements IVersionedSerializer<GossipDigestAck>
 {
-    public void serialize(GossipDigestAck gDigestAckMessage, DataOutput out, int version) throws IOException
+    public void serialize(GossipDigestAck gDigestAckMessage, DataOutputPlus out, int version) throws IOException
     {
         GossipDigestSerializationHelper.serialize(gDigestAckMessage.gDigestList, out, version);
         out.writeInt(gDigestAckMessage.epStateMap.size());
diff --git a/src/java/org/apache/cassandra/gms/GossipDigestAck2.java b/src/java/org/apache/cassandra/gms/GossipDigestAck2.java
index c7c81d4..4a6a06e 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestAck2.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestAck2.java
@@ -24,6 +24,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
 /**
@@ -49,7 +50,7 @@
 
 class GossipDigestAck2Serializer implements IVersionedSerializer<GossipDigestAck2>
 {
-    public void serialize(GossipDigestAck2 ack2, DataOutput out, int version) throws IOException
+    public void serialize(GossipDigestAck2 ack2, DataOutputPlus out, int version) throws IOException
     {
         out.writeInt(ack2.epStateMap.size());
         for (Map.Entry<InetAddress, EndpointState> entry : ack2.epStateMap.entrySet())
diff --git a/src/java/org/apache/cassandra/gms/GossipDigestSyn.java b/src/java/org/apache/cassandra/gms/GossipDigestSyn.java
index 7c53604..0ad67bd 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestSyn.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestSyn.java
@@ -23,6 +23,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
  * This is the first message that gets sent out as a start of the Gossip protocol in a
@@ -51,7 +52,7 @@
 
 class GossipDigestSerializationHelper
 {
-    static void serialize(List<GossipDigest> gDigestList, DataOutput out, int version) throws IOException
+    static void serialize(List<GossipDigest> gDigestList, DataOutputPlus out, int version) throws IOException
     {
         out.writeInt(gDigestList.size());
         for (GossipDigest gDigest : gDigestList)
@@ -78,7 +79,7 @@
 
 class GossipDigestSynSerializer implements IVersionedSerializer<GossipDigestSyn>
 {
-    public void serialize(GossipDigestSyn gDigestSynMessage, DataOutput out, int version) throws IOException
+    public void serialize(GossipDigestSyn gDigestSynMessage, DataOutputPlus out, int version) throws IOException
     {
         out.writeUTF(gDigestSynMessage.clusterId);
         out.writeUTF(gDigestSynMessage.partioner);
diff --git a/src/java/org/apache/cassandra/gms/GossipDigestSynVerbHandler.java b/src/java/org/apache/cassandra/gms/GossipDigestSynVerbHandler.java
index df74808..6b175de 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestSynVerbHandler.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestSynVerbHandler.java
@@ -49,13 +49,13 @@
         /* If the message is from a different cluster throw it away. */
         if (!gDigestMessage.clusterId.equals(DatabaseDescriptor.getClusterName()))
         {
-            logger.warn("ClusterName mismatch from " + from + " " + gDigestMessage.clusterId + "!=" + DatabaseDescriptor.getClusterName());
+            logger.warn("ClusterName mismatch from {} {}!={}", from, gDigestMessage.clusterId, DatabaseDescriptor.getClusterName());
             return;
         }
 
         if (gDigestMessage.partioner != null && !gDigestMessage.partioner.equals(DatabaseDescriptor.getPartitionerName()))
         {
-            logger.warn("Partitioner mismatch from " + from + " " + gDigestMessage.partioner + "!=" + DatabaseDescriptor.getPartitionerName());
+            logger.warn("Partitioner mismatch from {} {}!={}", from, gDigestMessage.partioner, DatabaseDescriptor.getPartitionerName());
             return;
         }
 
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index eb0cf39..5f0e576 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -47,6 +47,7 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 import com.google.common.collect.ImmutableList;
 
@@ -81,6 +82,9 @@
     public static final Gossiper instance = new Gossiper();
 
     public static final long aVeryLongTime = 259200 * 1000; // 3 days
+
+    /** Maximum difference in generation and version values we are willing to accept about a peer */
+    private static final long MAX_GENERATION_DIFFERENCE = 86400 * 365;
     private long FatClientTimeout;
     private final Random random = new Random();
     private final Comparator<InetAddress> inetcomparator = new Comparator<InetAddress>()
@@ -343,7 +347,7 @@
         expireTimeEndpointMap.remove(endpoint);
         quarantineEndpoint(endpoint);
         if (logger.isDebugEnabled())
-            logger.debug("evicting " + endpoint + " from gossip");
+            logger.debug("evicting {} from gossip", endpoint);
     }
 
     /**
@@ -370,7 +374,7 @@
         quarantineEndpoint(endpoint);
         MessagingService.instance().destroyConnectionPool(endpoint);
         if (logger.isDebugEnabled())
-            logger.debug("removing endpoint " + endpoint);
+            logger.debug("removing endpoint {}", endpoint);
     }
 
     /**
@@ -447,14 +451,14 @@
         // remember this node's generation
         int generation = epState.getHeartBeatState().getGeneration();
         logger.info("Removing host: {}", hostId);
-        logger.info("Sleeping for " + StorageService.RING_DELAY + "ms to ensure " + endpoint + " does not change");
+        logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
         Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
         // make sure it did not change
         epState = endpointStateMap.get(endpoint);
         if (epState.getHeartBeatState().getGeneration() != generation)
             throw new RuntimeException("Endpoint " + endpoint + " generation changed while trying to remove it");
         // update the other node's generation to mimic it as if it had changed it itself
-        logger.info("Advertising removal for " + endpoint);
+        logger.info("Advertising removal for {}", endpoint);
         epState.updateTimestamp(); // make sure we don't evict it too soon
         epState.getHeartBeatState().forceNewerGenerationUnsafe();
         epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.removingNonlocal(hostId));
@@ -476,7 +480,7 @@
         epState.getHeartBeatState().forceNewerGenerationUnsafe();
         long expireTime = computeExpireTime();
         epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.removedNonlocal(hostId, expireTime));
-        logger.info("Completing removal of " + endpoint);
+        logger.info("Completing removal of {}", endpoint);
         addExpireTimeForEndpoint(endpoint, expireTime);
         endpointStateMap.put(endpoint, epState);
         // ensure at least one gossip round occurs before returning
@@ -511,12 +515,13 @@
             }
             catch (Throwable th)
             {
+                JVMStabilityInspector.inspectThrowable(th);
                 // TODO this is broken
                 logger.warn("Unable to calculate tokens for {}.  Will use a random one", address);
                 tokens = Collections.singletonList(StorageService.getPartitioner().getRandomToken());
             }
             int generation = epState.getHeartBeatState().getGeneration();
-            logger.info("Sleeping for " + StorageService.RING_DELAY + "ms to ensure " + endpoint + " does not change");
+            logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
             Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
             // make sure it did not change
             EndpointState newState = endpointStateMap.get(endpoint);
@@ -666,7 +671,7 @@
                     && !justRemovedEndpoints.containsKey(endpoint)
                     && TimeUnit.NANOSECONDS.toMillis(nowNano - epState.getUpdateTimestamp()) > FatClientTimeout)
                 {
-                    logger.info("FatClient " + endpoint + " has been silent for " + FatClientTimeout + "ms, removing from gossip");
+                    logger.info("FatClient {} has been silent for {}ms, removing from gossip", endpoint, FatClientTimeout);
                     removeEndpoint(endpoint); // will put it in justRemovedEndpoints to respect quarantine delay
                     evictFromMembership(endpoint); // can get rid of the state immediately
                 }
@@ -678,7 +683,7 @@
                 {
                     if (logger.isDebugEnabled())
                     {
-                        logger.debug("time is expiring for endpoint : " + endpoint + " (" + expireTime + ")");
+                        logger.debug("time is expiring for endpoint : {} ({})", endpoint, expireTime);
                     }
                     evictFromMembership(endpoint);
                 }
@@ -692,7 +697,7 @@
                 if ((now - entry.getValue()) > QUARANTINE_DELAY)
                 {
                     if (logger.isDebugEnabled())
-                        logger.debug(QUARANTINE_DELAY + " elapsed, " + entry.getKey() + " gossip quarantine over");
+                        logger.debug("{} elapsed, {} gossip quarantine over", QUARANTINE_DELAY, entry.getKey());
                     justRemovedEndpoints.remove(entry.getKey());
                 }
             }
@@ -932,7 +937,7 @@
             markAlive(ep, epState);
         else
         {
-            logger.debug("Not marking " + ep + " alive due to dead state");
+            logger.debug("Not marking {} alive due to dead state", ep);
             markDead(ep, epState);
         }
         for (IEndpointStateChangeSubscriber subscriber : subscribers)
@@ -982,7 +987,12 @@
                 if (logger.isTraceEnabled())
                     logger.trace(ep + "local generation " + localGeneration + ", remote generation " + remoteGeneration);
 
-                if (remoteGeneration > localGeneration)
+                if (remoteGeneration > localGeneration + MAX_GENERATION_DIFFERENCE)
+                {
+                    // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
+                    logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}", ep, localGeneration, remoteGeneration);
+                }
+                else if (remoteGeneration > localGeneration)
                 {
                     if (logger.isTraceEnabled())
                         logger.trace("Updating heartbeat state generation to " + remoteGeneration + " from " + localGeneration + " for " + ep);
@@ -1362,7 +1372,7 @@
     {
         if (logger.isDebugEnabled())
         {
-            logger.debug("adding expire time for endpoint : " + endpoint + " (" + expireTime + ")");
+            logger.debug("adding expire time for endpoint : {} ({})", endpoint, expireTime);
         }
         expireTimeEndpointMap.put(endpoint, expireTime);
     }
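The MAX_GENERATION_DIFFERENCE guard added above makes the gossiper refuse to apply a remote generation that is implausibly far ahead of the locally known one, logging a warning instead of updating state. A standalone sketch of that decision logic, with hypothetical class and method names:

public class GenerationGuardSketch
{
    /** Maximum difference in generation values we are willing to accept about a peer (roughly one year of seconds). */
    private static final long MAX_GENERATION_DIFFERENCE = 86400 * 365;

    static String decide(int localGeneration, int remoteGeneration)
    {
        if (remoteGeneration > localGeneration + MAX_GENERATION_DIFFERENCE)
            return "reject";        // treat as corrupted gossip about the peer; keep the local state
        else if (remoteGeneration > localGeneration)
            return "apply-remote";  // a genuinely newer generation wins
        else
            return "keep-local";
    }

    public static void main(String[] args)
    {
        int local = 1_400_000_000;                             // a plausible epoch-seconds generation
        System.out.println(decide(local, local + 10));         // apply-remote
        System.out.println(decide(local, Integer.MAX_VALUE));  // reject
    }
}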
diff --git a/src/java/org/apache/cassandra/gms/HeartBeatState.java b/src/java/org/apache/cassandra/gms/HeartBeatState.java
index c3b423c..b33ef92 100644
--- a/src/java/org/apache/cassandra/gms/HeartBeatState.java
+++ b/src/java/org/apache/cassandra/gms/HeartBeatState.java
@@ -21,6 +21,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
  * HeartBeat State associated with any given endpoint.
@@ -71,7 +72,7 @@
 
 class HeartBeatStateSerializer implements IVersionedSerializer<HeartBeatState>
 {
-    public void serialize(HeartBeatState hbState, DataOutput out, int version) throws IOException
+    public void serialize(HeartBeatState hbState, DataOutputPlus out, int version) throws IOException
     {
         out.writeInt(hbState.getGeneration());
         out.writeInt(hbState.getHeartBeatVersion());
diff --git a/src/java/org/apache/cassandra/gms/IFailureNotification.java b/src/java/org/apache/cassandra/gms/IFailureNotification.java
deleted file mode 100644
index 7e1c955..0000000
--- a/src/java/org/apache/cassandra/gms/IFailureNotification.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.gms;
-
-import java.net.InetAddress;
-
-public interface IFailureNotification
-{
-    public void convict(InetAddress ep);
-
-    public void revive(InetAddress ep);
-}
diff --git a/src/java/org/apache/cassandra/gms/TokenSerializer.java b/src/java/org/apache/cassandra/gms/TokenSerializer.java
index 3789b50..bc5bf4b 100644
--- a/src/java/org/apache/cassandra/gms/TokenSerializer.java
+++ b/src/java/org/apache/cassandra/gms/TokenSerializer.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.gms;
 
-import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 565a8cb..36c2f00 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -245,7 +246,7 @@
 
     private static class VersionedValueSerializer implements IVersionedSerializer<VersionedValue>
     {
-        public void serialize(VersionedValue value, DataOutput out, int version) throws IOException
+        public void serialize(VersionedValue value, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(outValue(value, version));
             out.writeInt(value.version);
diff --git a/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java b/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java
index 73bc25c..e8de0f2 100644
--- a/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java
@@ -127,7 +127,7 @@
         keyspace = ConfigHelper.getInputKeyspace(conf);
         cfName = ConfigHelper.getInputColumnFamily(conf);
         partitioner = ConfigHelper.getInputPartitioner(conf);
-        logger.debug("partitioner is " + partitioner);
+        logger.debug("partitioner is {}", partitioner);
 
 
         // cannonical ranges, split into pieces, fetching the splits in parallel
@@ -263,7 +263,7 @@
                                     subSplit.getRow_count(),
                                     endpoints);
 
-                    logger.debug("adding " + split);
+                    logger.debug("adding {}", split);
                     splits.add(split);
                 }
             }
@@ -303,7 +303,7 @@
             }
             catch (IOException e)
             {
-                logger.debug("failed connect to endpoint " + host, e);
+                logger.debug("failed connect to endpoint {}", host, e);
             }
             catch (InvalidRequestException e)
             {
diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
index 6dd90f6..686d486 100644
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
@@ -21,7 +21,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.Cell;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
@@ -44,15 +44,15 @@
  *
  * The default split size is 64k rows.
  */
-public class ColumnFamilyInputFormat extends AbstractColumnFamilyInputFormat<ByteBuffer, SortedMap<ByteBuffer, Column>>
+public class ColumnFamilyInputFormat extends AbstractColumnFamilyInputFormat<ByteBuffer, SortedMap<ByteBuffer, Cell>>
 {
     
-    public RecordReader<ByteBuffer, SortedMap<ByteBuffer, Column>> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException
+    public RecordReader<ByteBuffer, SortedMap<ByteBuffer, Cell>> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException
     {
         return new ColumnFamilyRecordReader();
     }
 
-    public org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, Column>> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException
+    public org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, Cell>> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException
     {
         TaskAttemptContext tac = HadoopCompat.newMapContext(
                 jobConf,
diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
index 0bc1c49..0b52904 100644
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
+++ b/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
@@ -24,11 +24,13 @@
 import java.util.*;
 
 import com.google.common.collect.*;
+import org.apache.cassandra.db.BufferCell;
+import org.apache.cassandra.db.Cell;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.TypeParser;
@@ -44,8 +46,8 @@
 import org.apache.thrift.TException;
 import org.apache.thrift.transport.TTransport;
 
-public class ColumnFamilyRecordReader extends RecordReader<ByteBuffer, SortedMap<ByteBuffer, Column>>
-    implements org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, Column>>
+public class ColumnFamilyRecordReader extends RecordReader<ByteBuffer, SortedMap<ByteBuffer, Cell>>
+    implements org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, Cell>>
 {
     private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyRecordReader.class);
 
@@ -53,7 +55,7 @@
 
     private ColumnFamilySplit split;
     private RowIterator iter;
-    private Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> currentRow;
+    private Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> currentRow;
     private SlicePredicate predicate;
     private boolean isEmptyPredicate;
     private int totalRowCount; // total number of rows to fetch
@@ -92,7 +94,7 @@
         return currentRow.left;
     }
 
-    public SortedMap<ByteBuffer, Column> getCurrentValue()
+    public SortedMap<ByteBuffer, Cell> getCurrentValue()
     {
         return currentRow.right;
     }
@@ -174,7 +176,7 @@
     {
         if (!iter.hasNext())
         {
-            logger.debug("Finished scanning " + iter.rowsRead() + " rows (estimate was: " + totalRowCount + ")");
+            logger.debug("Finished scanning {} rows (estimate was: {})", iter.rowsRead(), totalRowCount);
             return false;
         }
 
@@ -210,7 +212,7 @@
         return split.getLocations()[0];
     }
 
-    private abstract class RowIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>>
+    private abstract class RowIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>>>
     {
         protected List<KeySlice> rows;
         protected int totalRead = 0;
@@ -279,50 +281,50 @@
             return totalRead;
         }
 
-        protected List<Column> unthriftify(ColumnOrSuperColumn cosc)
+        protected List<Cell> unthriftify(ColumnOrSuperColumn cosc)
         {
             if (cosc.counter_column != null)
-                return Collections.<Column>singletonList(unthriftifyCounter(cosc.counter_column));
+                return Collections.<Cell>singletonList(unthriftifyCounter(cosc.counter_column));
             if (cosc.counter_super_column != null)
                 return unthriftifySuperCounter(cosc.counter_super_column);
             if (cosc.super_column != null)
                 return unthriftifySuper(cosc.super_column);
             assert cosc.column != null;
-            return Collections.<Column>singletonList(unthriftifySimple(cosc.column));
+            return Collections.<Cell>singletonList(unthriftifySimple(cosc.column));
         }
 
-        private List<Column> unthriftifySuper(SuperColumn super_column)
+        private List<Cell> unthriftifySuper(SuperColumn super_column)
         {
-            List<Column> columns = new ArrayList<Column>(super_column.columns.size());
+            List<Cell> cells = new ArrayList<Cell>(super_column.columns.size());
             for (org.apache.cassandra.thrift.Column column : super_column.columns)
             {
-                Column c = unthriftifySimple(column);
-                columns.add(c.withUpdatedName(CompositeType.build(super_column.name, c.name())));
+                Cell c = unthriftifySimple(column);
+                cells.add(c.withUpdatedName(CellNames.simpleDense(CompositeType.build(super_column.name, c.name().toByteBuffer()))));
             }
-            return columns;
+            return cells;
         }
 
-        protected Column unthriftifySimple(org.apache.cassandra.thrift.Column column)
+        protected Cell unthriftifySimple(org.apache.cassandra.thrift.Column column)
         {
-            return new Column(column.name, column.value, column.timestamp);
+            return new BufferCell(CellNames.simpleDense(column.name), column.value, column.timestamp);
         }
 
-        private Column unthriftifyCounter(CounterColumn column)
+        private Cell unthriftifyCounter(CounterColumn column)
         {
             //CounterColumns read the counterID from the System keyspace, so need the StorageService running and access
-            //to cassandra.yaml. To avoid a Hadoop needing access to yaml return a regular Column.
-            return new Column(column.name, ByteBufferUtil.bytes(column.value), 0);
+            //to cassandra.yaml. To avoid Hadoop needing access to yaml, return a regular Cell.
+            return new BufferCell(CellNames.simpleDense(column.name), ByteBufferUtil.bytes(column.value), 0);
         }
 
-        private List<Column> unthriftifySuperCounter(CounterSuperColumn super_column)
+        private List<Cell> unthriftifySuperCounter(CounterSuperColumn super_column)
         {
-            List<Column> columns = new ArrayList<Column>(super_column.columns.size());
+            List<Cell> cells = new ArrayList<Cell>(super_column.columns.size());
             for (CounterColumn column : super_column.columns)
             {
-                Column c = unthriftifyCounter(column);
-                columns.add(c.withUpdatedName(CompositeType.build(super_column.name, c.name())));
+                Cell c = unthriftifyCounter(column);
+                cells.add(c.withUpdatedName(CellNames.simpleDense(CompositeType.build(super_column.name, c.name().toByteBuffer()))));
             }
-            return columns;
+            return cells;
         }
     }
 
@@ -401,7 +403,7 @@
             }
         }
 
-        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
+        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> computeNext()
         {
             maybeInit();
             if (rows == null)
@@ -409,12 +411,13 @@
 
             totalRead++;
             KeySlice ks = rows.get(i++);
-            SortedMap<ByteBuffer, Column> map = new TreeMap<ByteBuffer, Column>(comparator);
+            AbstractType<?> comp = isSuper ? CompositeType.getInstance(comparator, subComparator) : comparator;
+            SortedMap<ByteBuffer, Cell> map = new TreeMap<ByteBuffer, Cell>(comp);
             for (ColumnOrSuperColumn cosc : ks.columns)
             {
-                List<Column> columns = unthriftify(cosc);
-                for (Column column : columns)
-                    map.put(column.name(), column);
+                List<Cell> cells = unthriftify(cosc);
+                for (Cell cell : cells)
+                    map.put(cell.name().toByteBuffer(), cell);
             }
             return Pair.create(ks.key, map);
         }
@@ -422,7 +425,7 @@
 
     private class WideRowIterator extends RowIterator
     {
-        private PeekingIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>> wideColumns;
+        private PeekingIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>>> wideColumns;
         private ByteBuffer lastColumn = ByteBufferUtil.EMPTY_BYTE_BUFFER;
         private ByteBuffer lastCountedKey = ByteBufferUtil.EMPTY_BYTE_BUFFER;
 
@@ -471,14 +474,14 @@
             }
         }
 
-        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
+        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> computeNext()
         {
             maybeInit();
             if (rows == null)
                 return endOfData();
 
-            Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> next = wideColumns.next();
-            lastColumn = next.right.values().iterator().next().name().duplicate();
+            Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> next = wideColumns.next();
+            lastColumn = next.right.keySet().iterator().next().duplicate();
 
             maybeIncreaseRowCounter(next);
             return next;
@@ -489,7 +492,7 @@
          * Increases the row counter only if we really moved to the next row.
          * @param next just fetched row slice
          */
-        private void maybeIncreaseRowCounter(Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> next)
+        private void maybeIncreaseRowCounter(Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> next)
         {
             ByteBuffer currentKey = next.left;
             if (!currentKey.equals(lastCountedKey))
@@ -499,7 +502,7 @@
             }
         }
 
-        private class WideColumnIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>>
+        private class WideColumnIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>>>
         {
             private final Iterator<KeySlice> rows;
             private Iterator<ColumnOrSuperColumn> columns;
@@ -520,27 +523,28 @@
                 columns = currentRow.columns.iterator();
             }
 
-            protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
+            protected Pair<ByteBuffer, SortedMap<ByteBuffer, Cell>> computeNext()
             {
+                AbstractType<?> comp = isSuper ? CompositeType.getInstance(comparator, subComparator) : comparator;
                 while (true)
                 {
                     if (columns.hasNext())
                     {
                         ColumnOrSuperColumn cosc = columns.next();
-                        SortedMap<ByteBuffer, Column> map;
-                        List<Column> columns = unthriftify(cosc);
-                        if (columns.size() == 1)
+                        SortedMap<ByteBuffer, Cell> map;
+                        List<Cell> cells = unthriftify(cosc);
+                        if (cells.size() == 1)
                         {
-                            map = ImmutableSortedMap.of(columns.get(0).name(), columns.get(0));
+                            map = ImmutableSortedMap.of(cells.get(0).name().toByteBuffer(), cells.get(0));
                         }
                         else
                         {
                             assert isSuper;
-                            map = new TreeMap<ByteBuffer, Column>(CompositeType.getInstance(comparator, subComparator));
-                            for (Column column : columns)
-                                map.put(column.name(), column);
+                            map = new TreeMap<ByteBuffer, Cell>(comp);
+                            for (Cell cell : cells)
+                                map.put(cell.name().toByteBuffer(), cell);
                         }
-                        return Pair.<ByteBuffer, SortedMap<ByteBuffer, Column>>create(currentRow.key, map);
+                        return Pair.<ByteBuffer, SortedMap<ByteBuffer, Cell>>create(currentRow.key, map);
                     }
 
                     if (!rows.hasNext())
@@ -557,7 +561,7 @@
     // to the old. Thus, expect a small performance hit.
     // And obviously this wouldn't work for wide rows. But since ColumnFamilyInputFormat
     // and ColumnFamilyRecordReader don't support them, it should be fine for now.
-    public boolean next(ByteBuffer key, SortedMap<ByteBuffer, Column> value) throws IOException
+    public boolean next(ByteBuffer key, SortedMap<ByteBuffer, Cell> value) throws IOException
     {
         if (this.nextKeyValue())
         {
@@ -578,9 +582,9 @@
         return ByteBuffer.wrap(new byte[this.keyBufferSize]);
     }
 
-    public SortedMap<ByteBuffer, Column> createValue()
+    public SortedMap<ByteBuffer, Cell> createValue()
     {
-        return new TreeMap<ByteBuffer, Column>();
+        return new TreeMap<ByteBuffer, Cell>();
     }
 
     public long getPos() throws IOException
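The record reader now converts Thrift columns into Cell objects, turning the raw column name into a dense CellName and recovering the map key via name().toByteBuffer(). A minimal sketch of that translation using only calls that appear in the hunks above; it assumes the Cassandra 2.1 and Thrift-generated classes are on the classpath, and the example column values are illustrative:

import org.apache.cassandra.db.BufferCell;
import org.apache.cassandra.db.Cell;
import org.apache.cassandra.db.composites.CellNames;
import org.apache.cassandra.utils.ByteBufferUtil;

public class ThriftToCellSketch
{
    static Cell toCell(org.apache.cassandra.thrift.Column column)
    {
        // Same construction as unthriftifySimple() above: dense cell name, raw value, original timestamp.
        return new BufferCell(CellNames.simpleDense(column.name), column.value, column.timestamp);
    }

    public static void main(String[] args)
    {
        // Thrift-generated classes expose their fields publicly, which is how the reader accesses them.
        org.apache.cassandra.thrift.Column thriftColumn = new org.apache.cassandra.thrift.Column();
        thriftColumn.name = ByteBufferUtil.bytes("age");
        thriftColumn.value = ByteBufferUtil.bytes(42);
        thriftColumn.timestamp = System.currentTimeMillis();

        Cell cell = toCell(thriftColumn);
        // The ByteBuffer form of the name is what the SortedMap<ByteBuffer, Cell> is keyed on.
        System.out.println(ByteBufferUtil.bytesToHex(cell.name().toByteBuffer()));
    }
}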
diff --git a/src/java/org/apache/cassandra/hadoop/ReporterWrapper.java b/src/java/org/apache/cassandra/hadoop/ReporterWrapper.java
index 9c9c61f..00023d8 100644
--- a/src/java/org/apache/cassandra/hadoop/ReporterWrapper.java
+++ b/src/java/org/apache/cassandra/hadoop/ReporterWrapper.java
@@ -7,13 +7,14 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ *   http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
  */
 package org.apache.cassandra.hadoop;
 
diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
index 43e3a12..e60a240 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
@@ -23,6 +23,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.UUID;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -161,7 +162,7 @@
     
     private File getColumnFamilyDirectory() throws IOException
     {
-        File dir = new File(String.format("%s%s%s%s%s", getOutputLocation(), File.separator, keyspace, File.separator, columnFamily));
+        File dir = new File(String.format("%s%s%s%s%s-%s", getOutputLocation(), File.separator, keyspace, File.separator, columnFamily, UUID.randomUUID().toString()));
         
         if (!dir.exists() && !dir.mkdirs())
         {
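The directory change above gives each bulk writer its own output path by appending a random UUID, so concurrent writers targeting the same keyspace and table cannot collide on disk. A minimal sketch of the idea with illustrative paths and names:

import java.io.File;
import java.io.IOException;
import java.util.UUID;

public class UniqueOutputDirSketch
{
    static File columnFamilyDirectory(String outputLocation, String keyspace, String columnFamily) throws IOException
    {
        // Mirrors the format string above: <output>/<keyspace>/<table>-<uuid>
        File dir = new File(String.format("%s%s%s%s%s-%s",
                                          outputLocation, File.separator,
                                          keyspace, File.separator,
                                          columnFamily, UUID.randomUUID()));
        if (!dir.exists() && !dir.mkdirs())
            throw new IOException("Failed to create output directory: " + dir);
        return dir;
    }

    public static void main(String[] args) throws IOException
    {
        // Two writers for the same table now get distinct directories.
        System.out.println(columnFamilyDirectory("/tmp/bulk", "ks1", "cf1"));
        System.out.println(columnFamilyDirectory("/tmp/bulk", "ks1", "cf1"));
    }
}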
diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java
index 36da92d..09bd80c 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java
@@ -19,9 +19,7 @@
 
 import java.io.IOException;
 
-import org.apache.cassandra.hadoop.HadoopCompat;
 import org.apache.cassandra.hadoop.AbstractColumnFamilyInputFormat;
-import org.apache.cassandra.hadoop.ReporterWrapper;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
@@ -55,15 +53,14 @@
     public RecordReader<Long, Row> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter)
             throws IOException
     {
-        TaskAttemptContext tac = HadoopCompat.newMapContext(
-                jobConf,
-                TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)),
-                null,
-                null,
-                null,
-                new ReporterWrapper(reporter),
-                null);
-
+        TaskAttemptContext tac = new TaskAttemptContext(jobConf, TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)))
+        {
+            @Override
+            public void progress()
+            {
+                reporter.progress();
+            }
+        };
 
         CqlRecordReader recordReader = new CqlRecordReader();
         recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit)split, tac);
diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java
index 5845175..0d09ca2 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java
@@ -37,7 +37,7 @@
  * As is the case with the {@link org.apache.cassandra.hadoop.ColumnFamilyInputFormat}, 
  * you need to set the prepared statement in your
  * Hadoop job Configuration. The {@link CqlConfigHelper} class, through its
- * {@link ConfigHelper#setOutputPreparedStatement} method, is provided to make this
+ * {@link CqlConfigHelper#setOutputCql} method, is provided to make this
  * simple.
  * you need to set the Keyspace. The {@link ConfigHelper} class, through its
  * {@link ConfigHelper#setOutputColumnFamily} method, is provided to make this
diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java
index 3eab7c0..eedaa17 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java
@@ -43,6 +43,7 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.hadoop.ColumnFamilySplit;
 import org.apache.cassandra.hadoop.ConfigHelper;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.InputSplit;
@@ -283,7 +284,8 @@
             {
                 for (String column : partitionBoundColumns.keySet())
                 {
-                    if (BytesType.bytesCompare(keyColumns.get(column), previousRowKey.get(column)) != 0)
+                    // this is not correct - but we don't seem to have easy access to better type information here
+                    if (ByteBufferUtil.compareUnsigned(keyColumns.get(column), previousRowKey.get(column)) != 0)
                     {
                         previousRowKey = keyColumns;
                         totalRead++;
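The hunk above compares partition key components with an unsigned byte-wise comparison (Cassandra's ByteBufferUtil.compareUnsigned), while the inline comment notes that proper type information is not available here. For illustration, a self-contained sketch of what an unsigned lexicographic ByteBuffer comparison does, and how it differs from ByteBuffer.compareTo, which compares Java's signed bytes; the helper below is illustrative, not Cassandra's implementation:

import java.nio.ByteBuffer;

public class UnsignedCompareSketch
{
    static int compareUnsigned(ByteBuffer a, ByteBuffer b)
    {
        int minLength = Math.min(a.remaining(), b.remaining());
        for (int i = 0; i < minLength; i++)
        {
            int x = a.get(a.position() + i) & 0xFF;   // mask to recover the unsigned byte value
            int y = b.get(b.position() + i) & 0xFF;
            if (x != y)
                return x < y ? -1 : 1;
        }
        return Integer.compare(a.remaining(), b.remaining());
    }

    public static void main(String[] args)
    {
        ByteBuffer low = ByteBuffer.wrap(new byte[]{ 0x01 });
        ByteBuffer high = ByteBuffer.wrap(new byte[]{ (byte) 0xFF });
        System.out.println(compareUnsigned(low, high));   // -1: 0xFF sorts after 0x01 when unsigned
        System.out.println(low.compareTo(high));          // positive: signed comparison puts 0xFF first
    }
}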
diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java
index b967494..5075be4 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java
@@ -60,21 +60,21 @@
  *
  * @see CqlOutputFormat
  */
-final class CqlRecordWriter extends AbstractColumnFamilyRecordWriter<Map<String, ByteBuffer>, List<ByteBuffer>>
+class CqlRecordWriter extends AbstractColumnFamilyRecordWriter<Map<String, ByteBuffer>, List<ByteBuffer>>
 {
     private static final Logger logger = LoggerFactory.getLogger(CqlRecordWriter.class);
 
     // handles for clients for each range running in the threadpool
-    private final Map<InetAddress, RangeClient> clients;
+    protected final Map<InetAddress, RangeClient> clients;
 
     // host to prepared statement id mappings
-    private ConcurrentHashMap<Cassandra.Client, Integer> preparedStatements = new ConcurrentHashMap<Cassandra.Client, Integer>();
+    protected final ConcurrentHashMap<Cassandra.Client, Integer> preparedStatements = new ConcurrentHashMap<Cassandra.Client, Integer>();
 
-    private final String cql;
+    protected final String cql;
 
-    private AbstractType<?> keyValidator;
-    private String [] partitionKeyColumns;
-    private List<String> clusterColumns;
+    protected AbstractType<?> keyValidator;
+    protected String [] partitionKeyColumns;
+    protected List<String> clusterColumns;
 
     /**
      * Upon construction, obtain the map that this writer will use to collect
@@ -348,7 +348,7 @@
         
         Column rawPartitionKeys = result.rows.get(0).columns.get(1);
         String keyString = ByteBufferUtil.string(ByteBuffer.wrap(rawPartitionKeys.getValue()));
-        logger.debug("partition keys: " + keyString);
+        logger.debug("partition keys: {}", keyString);
 
         List<String> keys = FBUtilities.fromJsonList(keyString);
         partitionKeyColumns = new String[keys.size()];
@@ -362,7 +362,7 @@
         Column rawClusterColumns = result.rows.get(0).columns.get(2);
         String clusterColumnString = ByteBufferUtil.string(ByteBuffer.wrap(rawClusterColumns.getValue()));
 
-        logger.debug("cluster columns: " + clusterColumnString);
+        logger.debug("cluster columns: {}", clusterColumnString);
         clusterColumns = FBUtilities.fromJsonList(clusterColumnString);
     }
 
diff --git a/src/java/org/apache/cassandra/hadoop/pig/AbstractCassandraStorage.java b/src/java/org/apache/cassandra/hadoop/pig/AbstractCassandraStorage.java
index 65e3be1..361baa4 100644
--- a/src/java/org/apache/cassandra/hadoop/pig/AbstractCassandraStorage.java
+++ b/src/java/org/apache/cassandra/hadoop/pig/AbstractCassandraStorage.java
@@ -27,15 +27,15 @@
 
 import com.google.common.collect.Iterables;
 
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.auth.IAuthenticator;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.CFDefinition;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.db.marshal.AbstractCompositeType.CompositeComponent;
+import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.hadoop.*;
 import org.apache.cassandra.thrift.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -84,7 +84,7 @@
 
     public final static String PARTITION_FILTER_SIGNATURE = "cassandra.partition.filter";
 
-    protected static final Logger logger = LoggerFactory.getLogger(AbstractCassandraStorage.class);
+    private static final Logger logger = LoggerFactory.getLogger(AbstractCassandraStorage.class);
 
     protected String username;
     protected String password;
@@ -119,27 +119,26 @@
     }
 
     /** convert a column to a tuple */
-    protected Tuple columnToTuple(Column col, CfInfo cfInfo, AbstractType comparator) throws IOException
+    protected Tuple columnToTuple(Cell col, CfInfo cfInfo, AbstractType comparator) throws IOException
     {
         CfDef cfDef = cfInfo.cfDef;
         Tuple pair = TupleFactory.getInstance().newTuple(2);
 
+        ByteBuffer colName = col.name().toByteBuffer();
+
         // name
         if(comparator instanceof AbstractCompositeType)
-            setTupleValue(pair, 0, composeComposite((AbstractCompositeType)comparator,col.name()));
+            setTupleValue(pair, 0, composeComposite((AbstractCompositeType)comparator,colName));
         else
-            setTupleValue(pair, 0, cassandraToObj(comparator, col.name()));
+            setTupleValue(pair, 0, cassandraToObj(comparator, colName));
 
         // value
         Map<ByteBuffer,AbstractType> validators = getValidatorMap(cfDef);
-        ByteBuffer colName;
         if (cfInfo.cql3Table && !cfInfo.compactCqlTable)
         {
-            ByteBuffer[] names = ((AbstractCompositeType) parseType(cfDef.comparator_type)).split(col.name());
+            ByteBuffer[] names = ((AbstractCompositeType) parseType(cfDef.comparator_type)).split(colName);
             colName = names[names.length-1];
         }
-        else
-            colName = col.name();
         if (validators.get(colName) == null)
         {
             Map<MarshallerType, AbstractType> marshallers = getDefaultMarshallers(cfDef);
@@ -433,8 +432,10 @@
         {
             ByteBuffer buffer = objToBB(sub);
             serialized.add(buffer);
-        }      
-        return CollectionType.pack(serialized, objects.size());
+        }
+        // NOTE: using protocol v1 serialization format for collections so as to not break
+        // compatibility. Not sure if that's the right thing.
+        return CollectionSerializer.pack(serialized, objects.size(), 1);
     }
 
     private ByteBuffer objToMapBB(List<Object> objects)
@@ -449,7 +450,9 @@
                 serialized.add(buffer);
             }
         } 
-        return CollectionType.pack(serialized, objects.size());
+        // NOTE: using protocol v1 serialization format for collections so as to not break
+        // compatibility. Not sure if that's the right thing.
+        return CollectionSerializer.pack(serialized, objects.size(), 1);
     }
 
     private ByteBuffer objToCompositeBB(List<Object> objects)
@@ -479,6 +482,10 @@
     {
     }
 
+    public void cleanupOnSuccess(String location, Job job) throws IOException
+    {
+    }
+
     /** Methods to get the column family schema from Cassandra */
     protected void initSchema(String signature) throws IOException
     {
@@ -674,12 +681,12 @@
                 return columnDefs;
 
             // otherwise for CqlStorage, check metadata for classic thrift tables
-            CFDefinition cfDefinition = getCfDefinition(keyspace, column_family, client);
-            for (CFDefinition.Name column : Iterables.concat(cfDefinition.staticColumns(), cfDefinition.regularColumns()))
+            CFMetaData cfm = getCFMetaData(keyspace, column_family, client);
+            for (ColumnDefinition def : cfm.regularAndStaticColumns())
             {
                 ColumnDef cDef = new ColumnDef();
-                String columnName = column.name.toString();
-                String type = column.type.toString();
+                String columnName = def.name.toString();
+                String type = def.type.toString();
                 logger.debug("name: {}, type: {} ", columnName, type);
                 cDef.name = ByteBufferUtil.bytes(columnName);
                 cDef.validation_class = type;
@@ -687,14 +694,14 @@
             }
             // we may not need to include the value column for compact tables as we 
             // could have already processed it as schema_columnfamilies.value_alias
-            if (columnDefs.size() == 0 && includeCompactValueColumn)
+            if (columnDefs.size() == 0 && includeCompactValueColumn && cfm.compactValueColumn() != null)
             {
-                String value = cfDefinition.compactValue() != null ? cfDefinition.compactValue().toString() : null;
-                if ("value".equals(value))
+                ColumnDefinition def = cfm.compactValueColumn();
+                if ("value".equals(def.name.toString()))
                 {
                     ColumnDef cDef = new ColumnDef();
-                    cDef.name = ByteBufferUtil.bytes(value);
-                    cDef.validation_class = cfDefinition.compactValue().type.toString();
+                    cDef.name = def.name.bytes;
+                    cDef.validation_class = def.type.toString();
                     columnDefs.add(cDef);
                 }
             }
@@ -760,8 +767,8 @@
         return indexes;
     }
 
-    /** get CFDefinition of a column family */
-    protected CFDefinition getCfDefinition(String ks, String cf, Cassandra.Client client)
+    /** get CFMetaData of a column family */
+    protected CFMetaData getCFMetaData(String ks, String cf, Cassandra.Client client)
             throws NotFoundException,
             InvalidRequestException,
             TException,
@@ -772,7 +779,7 @@
         for (CfDef cfDef : ksDef.cf_defs)
         {
             if (cfDef.name.equalsIgnoreCase(cf))
-                return new CFDefinition(CFMetaData.fromThrift(cfDef));
+                return CFMetaData.fromThrift(cfDef);
         }
         return null;
     }
@@ -781,11 +788,18 @@
     {
         if (validator instanceof DecimalType || validator instanceof InetAddressType)
             return validator.getString(value);
-        else
-            return validator.compose(value);
+
+        if (validator instanceof CollectionType)
+        {
+            // For CollectionType, the compose() method assumes the v3 protocol format of collection, which
+            // is not correct here since we query using the CQL-over-thrift interface which use the pre-v3 format
+            return ((CollectionSerializer)validator.getSerializer()).deserializeForNativeProtocol(value, 1);
+        }
+
+        return validator.compose(value);
     }
 
-    protected class CfInfo
+    protected static class CfInfo
     {
         boolean compactCqlTable = false;
         boolean cql3Table = false;
diff --git a/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java b/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java
index 1b51762..14d30d5 100644
--- a/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java
+++ b/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java
@@ -23,10 +23,10 @@
 import java.util.*;
 
 import org.apache.cassandra.hadoop.HadoopCompat;
+import org.apache.cassandra.db.Cell;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.Column;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -66,7 +66,7 @@
     private boolean slice_reverse = false;
     private boolean allow_deletes = false;
 
-    private RecordReader<ByteBuffer, Map<ByteBuffer, Column>> reader;
+    private RecordReader<ByteBuffer, Map<ByteBuffer, Cell>> reader;
     private RecordWriter<ByteBuffer, List<Mutation>> writer;
 
     private boolean widerows = false;
@@ -74,7 +74,7 @@
     
     // wide row hacks
     private ByteBuffer lastKey;
-    private Map<ByteBuffer, Column> lastRow;
+    private Map<ByteBuffer, Cell> lastRow;
     private boolean hasNext = true;
 
     public CassandraStorage()
@@ -126,7 +126,7 @@
                             key = (ByteBuffer)reader.getCurrentKey();
                             tuple = keyToTuple(key, cfDef, parseType(cfDef.getKey_validation_class()));
                         }
-                        for (Map.Entry<ByteBuffer, Column> entry : lastRow.entrySet())
+                        for (Map.Entry<ByteBuffer, Cell> entry : lastRow.entrySet())
                         {
                             bag.add(columnToTuple(entry.getValue(), cfInfo, parseType(cfDef.getComparator_type())));
                         }
@@ -150,7 +150,7 @@
                 {
                     // read too much, hold on to it for next time
                     lastKey = (ByteBuffer)reader.getCurrentKey();
-                    lastRow = (SortedMap<ByteBuffer, Column>)reader.getCurrentValue();
+                    lastRow = (SortedMap<ByteBuffer, Cell>)reader.getCurrentValue();
                     // but return what we have so far
                     tuple.append(bag);
                     return tuple;
@@ -164,13 +164,13 @@
                             tuple = keyToTuple(lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
                         else
                             addKeyToTuple(tuple, lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
-                        for (Map.Entry<ByteBuffer, Column> entry : lastRow.entrySet())
+                        for (Map.Entry<ByteBuffer, Cell> entry : lastRow.entrySet())
                         {
                             bag.add(columnToTuple(entry.getValue(), cfInfo, parseType(cfDef.getComparator_type())));
                         }
                         tuple.append(bag);
                         lastKey = key;
-                        lastRow = (SortedMap<ByteBuffer, Column>)reader.getCurrentValue();
+                        lastRow = (SortedMap<ByteBuffer, Cell>)reader.getCurrentValue();
                         return tuple;
                     }
                     if (tuple == null)
@@ -178,17 +178,17 @@
                     else
                         addKeyToTuple(tuple, lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
                 }
-                SortedMap<ByteBuffer, Column> row = (SortedMap<ByteBuffer, Column>)reader.getCurrentValue();
+                SortedMap<ByteBuffer, Cell> row = (SortedMap<ByteBuffer, Cell>)reader.getCurrentValue();
                 if (lastRow != null) // prepend what was read last time
                 {
-                    for (Map.Entry<ByteBuffer, Column> entry : lastRow.entrySet())
+                    for (Map.Entry<ByteBuffer, Cell> entry : lastRow.entrySet())
                     {
                         bag.add(columnToTuple(entry.getValue(), cfInfo, parseType(cfDef.getComparator_type())));
                     }
                     lastKey = null;
                     lastRow = null;
                 }
-                for (Map.Entry<ByteBuffer, Column> entry : row.entrySet())
+                for (Map.Entry<ByteBuffer, Cell> entry : row.entrySet())
                 {
                     bag.add(columnToTuple(entry.getValue(), cfInfo, parseType(cfDef.getComparator_type())));
                 }
@@ -215,7 +215,7 @@
             CfInfo cfInfo = getCfInfo(loadSignature);
             CfDef cfDef = cfInfo.cfDef;
             ByteBuffer key = reader.getCurrentKey();
-            Map<ByteBuffer, Column> cf = reader.getCurrentValue();
+            Map<ByteBuffer, Cell> cf = reader.getCurrentValue();
             assert key != null && cf != null;
 
             // output tuple, will hold the key, each indexed column in a tuple, then a bag of the rest
@@ -249,7 +249,7 @@
                 added.put(cdef.name, true);
             }
             // now add all the other columns
-            for (Map.Entry<ByteBuffer, Column> entry : cf.entrySet())
+            for (Map.Entry<ByteBuffer, Cell> entry : cf.entrySet())
             {
                 if (!added.containsKey(entry.getKey()))
                     bag.add(columnToTuple(entry.getValue(), cfInfo, parseType(cfDef.getComparator_type())));
diff --git a/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java b/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java
index eea5d4e..3c59a1c 100644
--- a/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java
+++ b/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java
@@ -22,7 +22,9 @@
 import java.util.Iterator;
 import java.util.Map;
 
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.BufferCell;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.hadoop.ConfigHelper;
 import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
@@ -99,9 +101,9 @@
                 ByteBuffer columnValue = row.getBytesUnsafe(ByteBufferUtil.string(cdef.name.duplicate()));
                 if (columnValue != null)
                 {
-                    Column column = new Column(cdef.name, columnValue);
-                    AbstractType<?> validator = getValidatorMap(cfDef).get(column.name());
-                    setTupleValue(tuple, i, cqlColumnToObj(column, cfDef), validator);
+                    Cell cell = new BufferCell(CellNames.simpleDense(cdef.name), columnValue);
+                    AbstractType<?> validator = getValidatorMap(cfDef).get(cdef.name);
+                    setTupleValue(tuple, i, cqlColumnToObj(cell, cfDef), validator);
                 }
                 else
                     tuple.set(i, null);
@@ -142,7 +144,7 @@
             CqlConfigHelper.setInputMaxConnections(conf, nativeMaxConnections);
         if (nativeMinSimultReqs != null)
             CqlConfigHelper.setInputMinSimultReqPerConnections(conf, nativeMinSimultReqs);
-        if (nativeMinSimultReqs != null)
+        if (nativeMaxSimultReqs != null)
             CqlConfigHelper.setInputMaxSimultReqPerConnections(conf, nativeMaxSimultReqs);
         if (nativeConnectionTimeout != null)
             CqlConfigHelper.setInputNativeConnectionTimeout(conf, nativeConnectionTimeout);
diff --git a/src/java/org/apache/cassandra/hadoop/pig/CqlStorage.java b/src/java/org/apache/cassandra/hadoop/pig/CqlStorage.java
index 53f3900..bb3ba36 100644
--- a/src/java/org/apache/cassandra/hadoop/pig/CqlStorage.java
+++ b/src/java/org/apache/cassandra/hadoop/pig/CqlStorage.java
@@ -22,9 +22,11 @@
 import java.nio.charset.CharacterCodingException;
 import java.util.*;
 
-import org.apache.cassandra.hadoop.HadoopCompat;
-import org.apache.cassandra.cql3.CFDefinition;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.BufferCell;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.hadoop.*;
@@ -54,7 +56,6 @@
 public class CqlStorage extends AbstractCassandraStorage
 {
     private static final Logger logger = LoggerFactory.getLogger(CqlStorage.class);
-
     private RecordReader<Map<String, ByteBuffer>, Map<String, ByteBuffer>> reader;
     protected RecordWriter<Map<String, ByteBuffer>, List<ByteBuffer>> writer;
 
@@ -111,9 +112,9 @@
                 ByteBuffer columnValue = columns.get(ByteBufferUtil.string(cdef.name.duplicate()));
                 if (columnValue != null)
                 {
-                    Column column = new Column(cdef.name, columnValue);
-                    AbstractType<?> validator = getValidatorMap(cfDef).get(column.name());
-                    setTupleValue(tuple, i, cqlColumnToObj(column, cfDef), validator);
+                    Cell cell = new BufferCell(CellNames.simpleDense(cdef.name), columnValue);
+                    AbstractType<?> validator = getValidatorMap(cfDef).get(cdef.name);
+                    setTupleValue(tuple, i, cqlColumnToObj(cell, cfDef), validator);
                 }
                 else
                     tuple.set(i, null);
@@ -182,14 +183,15 @@
     }
 
     /** convert a cql column to an object */
-    protected Object cqlColumnToObj(Column col, CfDef cfDef) throws IOException
+    protected Object cqlColumnToObj(Cell col, CfDef cfDef) throws IOException
     {
         // standard
         Map<ByteBuffer,AbstractType> validators = getValidatorMap(cfDef);
-        if (validators.get(col.name()) == null)
+        ByteBuffer cellName = col.name().toByteBuffer();
+        if (validators.get(cellName) == null)
             return cassandraToObj(getDefaultMarshallers(cfDef).get(MarshallerType.DEFAULT_VALIDATOR), col.value());
         else
-            return cassandraToObj(validators.get(col.name()), col.value());
+            return cassandraToObj(validators.get(cellName), col.value());
     }
 
     /** set read configuration settings */
@@ -520,18 +522,18 @@
             // classic thrift tables
             if (keys.size() == 0)
             {
-                CFDefinition cfDefinition = getCfDefinition(keyspace, column_family, client);
-                for (CFDefinition.Name column : cfDefinition.partitionKeys())
+                CFMetaData cfm = getCFMetaData(keyspace, column_family, client);
+                for (ColumnDefinition def : cfm.partitionKeyColumns())
                 {
-                    String key = column.name.toString();
+                    String key = def.name.toString();
                     logger.debug("name: {} ", key);
                     ColumnDef cDef = new ColumnDef();
                     cDef.name = ByteBufferUtil.bytes(key);
                     keys.add(cDef);
                 }
-                for (CFDefinition.Name column : cfDefinition.clusteringColumns())
+                for (ColumnDefinition def : cfm.clusteringColumns())
                 {
-                    String key = column.name.toString();
+                    String key = def.name.toString();
                     logger.debug("name: {} ", key);
                     ColumnDef cDef = new ColumnDef();
                     cDef.name = ByteBufferUtil.bytes(key);
diff --git a/src/java/org/apache/cassandra/io/ISSTableSerializer.java b/src/java/org/apache/cassandra/io/ISSTableSerializer.java
index 5e501ae..20ee352 100644
--- a/src/java/org/apache/cassandra/io/ISSTableSerializer.java
+++ b/src/java/org/apache/cassandra/io/ISSTableSerializer.java
@@ -18,21 +18,22 @@
 package org.apache.cassandra.io;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public interface ISSTableSerializer<T>
 {
     /**
      * Serialize the specified type into the specified DataOutputStream
      * instance in the format suited for SSTables.
+     *
      * @param t type that needs to be serialized
      * @param out DataOutput into which serialization needs to happen.
      * @throws java.io.IOException
      */
-    public void serializeForSSTable(T t, DataOutput out) throws IOException;
+    public void serializeForSSTable(T t, DataOutputPlus out) throws IOException;
 
     /**
      * Deserialize into the specified DataInputStream instance in the format
diff --git a/src/java/org/apache/cassandra/io/ISerializer.java b/src/java/org/apache/cassandra/io/ISerializer.java
index a72d17d..7e1759c 100644
--- a/src/java/org/apache/cassandra/io/ISerializer.java
+++ b/src/java/org/apache/cassandra/io/ISerializer.java
@@ -18,20 +18,22 @@
 package org.apache.cassandra.io;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public interface ISerializer<T>
 {
     /**
      * Serialize the specified type into the specified DataOutput instance.
+     *
+     *
      * @param t type that needs to be serialized
      * @param out DataOutput into which serialization needs to happen.
      * @throws java.io.IOException
      */
-    public void serialize(T t, DataOutput out) throws IOException;
+    public void serialize(T t, DataOutputPlus out) throws IOException;
 
     /**
      * Deserialize from the specified DataInput instance.
diff --git a/src/java/org/apache/cassandra/io/IVersionedSerializer.java b/src/java/org/apache/cassandra/io/IVersionedSerializer.java
index dd890b6..46494e1 100644
--- a/src/java/org/apache/cassandra/io/IVersionedSerializer.java
+++ b/src/java/org/apache/cassandra/io/IVersionedSerializer.java
@@ -18,19 +18,21 @@
 package org.apache.cassandra.io;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
+import org.apache.cassandra.io.util.DataOutputPlus;
+
 public interface IVersionedSerializer<T>
 {
     /**
      * Serialize the specified type into the specified DataOutputStream instance.
+     *
      * @param t type that needs to be serialized
      * @param out DataOutput into which serialization needs to happen.
      * @param version protocol version
      * @throws java.io.IOException
      */
-    public void serialize(T t, DataOutput out, int version) throws IOException;
+    public void serialize(T t, DataOutputPlus out, int version) throws IOException;
 
     /**
      * Deserialize into the specified DataInputStream instance.
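
The serializer interfaces above (ISSTableSerializer, ISerializer, IVersionedSerializer) all move their output parameter from java.io.DataOutput to org.apache.cassandra.io.util.DataOutputPlus. Below is a minimal sketch of the serialize/deserialize/serializedSize contract these interfaces express, written against plain JDK streams for a hypothetical Point type; it assumes (as in this tree) that DataOutputPlus extends DataOutput, so method bodies built from writeLong/writeInt calls are unchanged by the signature swap.

// Illustrative only: a versioned serializer for a hypothetical Point type.
// In the patched interfaces the output parameter is DataOutputPlus instead of
// DataOutput; assuming DataOutputPlus extends DataOutput, bodies like these
// compile unchanged.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

final class Point
{
    final long x, y;
    Point(long x, long y) { this.x = x; this.y = y; }
}

final class PointSerializer
{
    public void serialize(Point p, DataOutput out, int version) throws IOException
    {
        // the version argument lets older peers receive an older wire format
        out.writeLong(p.x);
        out.writeLong(p.y);
    }

    public Point deserialize(DataInput in, int version) throws IOException
    {
        return new Point(in.readLong(), in.readLong());
    }

    public long serializedSize(Point p, int version)
    {
        return 8 + 8; // two longs
    }
}
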
diff --git a/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java b/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java
index 64495b8..4521c19 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java
@@ -19,6 +19,7 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.zip.Adler32;
 import java.util.zip.CRC32;
 import java.util.zip.Checksum;
@@ -122,7 +123,7 @@
             throw new CorruptBlockException(getPath(), chunk, e);
         }
 
-        if (metadata.parameters.getCrcCheckChance() > FBUtilities.threadLocalRandom().nextDouble())
+        if (metadata.parameters.getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
         {
 
             if (metadata.hasPostCompressionAdlerChecksums)
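
The change above replaces the homegrown FBUtilities.threadLocalRandom() with the JDK's ThreadLocalRandom for deciding whether to verify a chunk's checksum on read. A minimal sketch of that probabilistic verification pattern, with hypothetical names (ChunkVerifier, maybeVerify, storedChecksum are illustrative stand-ins, not Cassandra APIs):

// Sketch of the probabilistic verification pattern.
import java.util.concurrent.ThreadLocalRandom;
import java.util.zip.Adler32;

final class ChunkVerifier
{
    private final double crcCheckChance; // 1.0 = always verify, 0.1 = verify ~10% of reads

    ChunkVerifier(double crcCheckChance) { this.crcCheckChance = crcCheckChance; }

    void maybeVerify(byte[] chunkBytes, int length, int storedChecksum)
    {
        // draw a uniform double per read; skip the relatively expensive checksum
        // computation most of the time when the configured chance is low
        if (crcCheckChance > ThreadLocalRandom.current().nextDouble())
        {
            Adler32 adler = new Adler32();
            adler.update(chunkBytes, 0, length);
            if ((int) adler.getValue() != storedChecksum)
                throw new IllegalStateException("chunk checksum mismatch");
        }
    }
}
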
diff --git a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
index eef5b17..e533b1e 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
@@ -21,26 +21,20 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.zip.Adler32;
-import java.util.zip.CRC32;
 import java.util.zip.Checksum;
 
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.cassandra.io.sstable.SSTableMetadata.Collector;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.util.DataIntegrityMetadata;
 import org.apache.cassandra.io.util.FileMark;
 import org.apache.cassandra.io.util.SequentialWriter;
 
 public class CompressedSequentialWriter extends SequentialWriter
 {
-    public static SequentialWriter open(String dataFilePath,
-                                        String indexFilePath,
-                                        boolean skipIOCache,
-                                        CompressionParameters parameters,
-                                        Collector sstableMetadataCollector)
-    {
-        return new CompressedSequentialWriter(new File(dataFilePath), indexFilePath, skipIOCache, parameters, sstableMetadataCollector);
-    }
+    private final DataIntegrityMetadata.ChecksumWriter crcMetadata;
 
     // holds offset in the file where current chunk should be written
     // changed only by flush() method where data buffer gets compressed and stored to the file
@@ -56,29 +50,26 @@
     // holds a number of already written chunks
     private int chunkCount = 0;
 
-    private final Checksum checksum = new Adler32();
-
     private long originalSize = 0, compressedSize = 0;
 
-    private final Collector sstableMetadataCollector;
+    private final MetadataCollector sstableMetadataCollector;
 
     public CompressedSequentialWriter(File file,
-                                      String indexFilePath,
-                                      boolean skipIOCache,
+                                      String offsetsPath,
                                       CompressionParameters parameters,
-                                      Collector sstableMetadataCollector)
+                                      MetadataCollector sstableMetadataCollector)
     {
-        super(file, parameters.chunkLength(), skipIOCache);
+        super(file, parameters.chunkLength());
         this.compressor = parameters.sstableCompressor;
 
         // buffer for compression should be the same size as buffer itself
         compressed = new ICompressor.WrappedArray(new byte[compressor.initialCompressedBufferLength(buffer.length)]);
 
         /* Index File (-CompressionInfo.db component) and it's header */
-        metadataWriter = CompressionMetadata.Writer.open(indexFilePath);
-        metadataWriter.writeHeader(parameters);
+        metadataWriter = CompressionMetadata.Writer.open(parameters, offsetsPath);
 
         this.sstableMetadataCollector = sstableMetadataCollector;
+        crcMetadata = new DataIntegrityMetadata.ChecksumWriter(out);
     }
 
     @Override
@@ -109,8 +100,7 @@
     @Override
     protected void flushData()
     {
-        seekToChunkStart();
-
+        seekToChunkStart(); // why is this necessary? it seems like it should always be at the chunk start in normal operation
 
         int compressedLength;
         try
@@ -126,13 +116,10 @@
         originalSize += validBufferBytes;
         compressedSize += compressedLength;
 
-        // update checksum
-        checksum.update(compressed.buffer, 0, compressedLength);
-
         try
         {
             // write an offset of the newly written chunk to the index file
-            metadataWriter.writeLong(chunkOffset);
+            metadataWriter.addOffset(chunkOffset);
             chunkCount++;
 
             assert compressedLength <= compressed.buffer.length;
@@ -140,20 +127,29 @@
             // write data itself
             out.write(compressed.buffer, 0, compressedLength);
             // write corresponding checksum
-            out.writeInt((int) checksum.getValue());
+            crcMetadata.append(compressed.buffer, 0, compressedLength);
+            lastFlushOffset += compressedLength + 4;
         }
         catch (IOException e)
         {
             throw new FSWriteError(e, getPath());
         }
 
-        // reset checksum object to the blank state for re-use
-        checksum.reset();
-
         // next chunk should be written right after current + length of the checksum (int)
         chunkOffset += compressedLength + 4;
     }
 
+    public CompressionMetadata openEarly()
+    {
+        return metadataWriter.openEarly(originalSize, chunkOffset);
+    }
+
+    public CompressionMetadata openAfterClose()
+    {
+        assert current == originalSize;
+        return metadataWriter.openAfterClose(current, chunkOffset);
+    }
+
     @Override
     public FileMark mark()
     {
@@ -204,6 +200,7 @@
                 throw new CorruptBlockException(getPath(), chunkOffset, chunkSize);
             }
 
+            Checksum checksum = new Adler32();
             checksum.update(compressed.buffer, 0, chunkSize);
 
             if (out.readInt() != (int) checksum.getValue())
@@ -222,8 +219,6 @@
             throw new FSReadError(e, getPath());
         }
 
-        checksum.reset();
-
         // reset buffer
         validBufferBytes = realMark.bufferOffset;
         bufferOffset = current - validBufferBytes;
@@ -260,10 +255,9 @@
 
         super.close();
         sstableMetadataCollector.addCompressionRatio(compressedSize, originalSize);
-        metadataWriter.finalizeHeader(current, chunkCount);
         try
         {
-            metadataWriter.close();
+            metadataWriter.close(current, chunkCount);
         }
         catch (IOException e)
         {
@@ -271,6 +265,12 @@
         }
     }
 
+    @Override
+    public void writeFullChecksum(Descriptor descriptor)
+    {
+        crcMetadata.writeFullChecksum(descriptor);
+    }
+
     /**
      * Class to hold a mark to the position of the file
      */
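
In the rewritten flushData above, each compressed chunk is written followed by a 4-byte checksum (hence chunkOffset advances by compressedLength + 4), with the checksum now routed through DataIntegrityMetadata.ChecksumWriter instead of a reused Adler32 field. A self-contained sketch of that on-disk chunk layout, using only JDK classes and illustrative names:

// On-disk layout sketch: [compressed chunk bytes][4-byte Adler32 of those bytes], repeated.
// ChunkAppender/appendChunk are illustrative, not Cassandra classes.
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.zip.Adler32;

final class ChunkAppender
{
    private long chunkOffset = 0; // offset of the next chunk in the output file

    long appendChunk(DataOutputStream out, byte[] compressed, int length) throws IOException
    {
        long offsetOfThisChunk = chunkOffset;

        out.write(compressed, 0, length);      // chunk payload

        Adler32 adler = new Adler32();          // fresh checksum per chunk
        adler.update(compressed, 0, length);
        out.writeInt((int) adler.getValue());   // 4-byte checksum trailer

        chunkOffset += length + 4;              // next chunk starts after payload + checksum
        return offsetOfThisChunk;               // caller records this in the offsets index
    }
}
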
diff --git a/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java b/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java
index e75a7d7..abb067f 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java
@@ -17,22 +17,40 @@
  */
 package org.apache.cassandra.io.compress;
 
-import java.io.*;
-import java.util.*;
+import java.io.BufferedOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.primitives.Longs;
 
-import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.cache.RefCountedMemory;
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.Memory;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -44,6 +62,7 @@
     public final long compressedFileLength;
     public final boolean hasPostCompressionAdlerChecksums;
     private final Memory chunkOffsets;
+    private final long chunkOffsetsSize;
     public final String indexFilePath;
     public final CompressionParameters parameters;
 
@@ -84,7 +103,7 @@
         {
             String compressorName = stream.readUTF();
             int optionCount = stream.readInt();
-            Map<String, String> options = new HashMap<String, String>();
+            Map<String, String> options = new HashMap<>();
             for (int i = 0; i < optionCount; ++i)
             {
                 String key = stream.readUTF();
@@ -113,6 +132,19 @@
         {
             FileUtils.closeQuietly(stream);
         }
+        this.chunkOffsetsSize = chunkOffsets.size();
+    }
+
+    private CompressionMetadata(String filePath, CompressionParameters parameters, RefCountedMemory offsets, long offsetsSize, long dataLength, long compressedLength, boolean hasPostCompressionAdlerChecksums)
+    {
+        this.indexFilePath = filePath;
+        this.parameters = parameters;
+        this.dataLength = dataLength;
+        this.compressedFileLength = compressedLength;
+        this.hasPostCompressionAdlerChecksums = hasPostCompressionAdlerChecksums;
+        this.chunkOffsets = offsets;
+        offsets.reference();
+        this.chunkOffsetsSize = offsetsSize;
     }
 
     public ICompressor compressor()
@@ -172,7 +204,7 @@
         // position of the chunk
         int idx = 8 * (int) (position / parameters.chunkLength());
 
-        if (idx >= chunkOffsets.size())
+        if (idx >= chunkOffsetsSize)
             throw new CorruptSSTableException(new EOFException(), indexFilePath);
 
         long chunkOffset = chunkOffsets.getLong(idx);
@@ -206,7 +238,7 @@
             {
                 long offset = i * 8;
                 long chunkOffset = chunkOffsets.getLong(offset);
-                long nextChunkOffset = offset + 8 == chunkOffsets.size()
+                long nextChunkOffset = offset + 8 == chunkOffsetsSize
                                      ? compressedFileLength
                                      : chunkOffsets.getLong(offset + 8);
                 offsets.add(new Chunk(chunkOffset, (int) (nextChunkOffset - chunkOffset - 4))); // "4" bytes reserved for checksum
@@ -217,52 +249,60 @@
 
     public void close()
     {
-        chunkOffsets.free();
+        if (chunkOffsets instanceof RefCountedMemory)
+            ((RefCountedMemory) chunkOffsets).unreference();
+        else
+            chunkOffsets.free();
     }
 
-    public static class Writer extends RandomAccessFile
+    public static class Writer
     {
-        // place for uncompressed data length in the index file
-        private long dataLengthOffset = -1;
         // path to the file
+        private final CompressionParameters parameters;
         private final String filePath;
+        private int maxCount = 100;
+        private RefCountedMemory offsets = new RefCountedMemory(maxCount * 8);
+        private int count = 0;
 
-        private Writer(String path) throws FileNotFoundException
+        private Writer(CompressionParameters parameters, String path)
         {
-            super(path, "rw");
+            this.parameters = parameters;
             filePath = path;
         }
 
-        public static Writer open(String path)
+        public static Writer open(CompressionParameters parameters, String path)
         {
-            try
-            {
-                return new Writer(path);
-            }
-            catch (FileNotFoundException e)
-            {
-                throw new RuntimeException(e);
-            }
+            return new Writer(parameters, path);
         }
 
-        public void writeHeader(CompressionParameters parameters)
+        public void addOffset(long offset)
+        {
+            if (count == maxCount)
+            {
+                RefCountedMemory newOffsets = offsets.copy((maxCount *= 2) * 8);
+                offsets.unreference();
+                offsets = newOffsets;
+            }
+            offsets.setLong(8 * count++, offset);
+        }
+
+        private void writeHeader(DataOutput out, long dataLength, int chunks)
         {
             try
             {
-                writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
-                writeInt(parameters.otherOptions.size());
+                out.writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
+                out.writeInt(parameters.otherOptions.size());
                 for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
                 {
-                    writeUTF(entry.getKey());
-                    writeUTF(entry.getValue());
+                    out.writeUTF(entry.getKey());
+                    out.writeUTF(entry.getValue());
                 }
 
                 // store the length of the chunk
-                writeInt(parameters.chunkLength());
+                out.writeInt(parameters.chunkLength());
                 // store position and reserve a place for uncompressed data length and chunks count
-                dataLengthOffset = getFilePointer();
-                writeLong(-1);
-                writeInt(-1);
+                out.writeLong(dataLength);
+                out.writeInt(chunks);
             }
             catch (IOException e)
             {
@@ -270,36 +310,16 @@
             }
         }
 
-        public void finalizeHeader(long dataLength, int chunks)
+        public CompressionMetadata openEarly(long dataLength, long compressedLength)
         {
-            assert dataLengthOffset != -1 : "writeHeader wasn't called";
+            return new CompressionMetadata(filePath, parameters, offsets, count * 8L, dataLength, compressedLength, Descriptor.Version.CURRENT.hasPostCompressionAdlerChecksums);
+        }
 
-            long currentPosition;
-            try
-            {
-                currentPosition = getFilePointer();
-            }
-            catch (IOException e)
-            {
-                throw new FSReadError(e, filePath);
-            }
-
-            try
-            {
-                // seek back to the data length position
-                seek(dataLengthOffset);
-
-                // write uncompressed data length and chunks count
-                writeLong(dataLength);
-                writeInt(chunks);
-
-                // seek forward to the previous position
-                seek(currentPosition);
-            }
-            catch (IOException e)
-            {
-                throw new FSWriteError(e, filePath);
-            }
+        public CompressionMetadata openAfterClose(long dataLength, long compressedLength)
+        {
+            RefCountedMemory newOffsets = offsets.copy(count * 8L);
+            offsets.unreference();
+            return new CompressionMetadata(filePath, parameters, newOffsets, count * 8L, dataLength, compressedLength, Descriptor.Version.CURRENT.hasPostCompressionAdlerChecksums);
         }
 
         /**
@@ -311,33 +331,7 @@
          */
         public long chunkOffsetBy(int chunkIndex)
         {
-            if (dataLengthOffset == -1)
-                throw new IllegalStateException("writeHeader wasn't called");
-
-            try
-            {
-                long position = getFilePointer();
-
-                // seek to the position of the given chunk
-                seek(dataLengthOffset
-                     + 8 // size reserved for uncompressed data length
-                     + 4 // size reserved for chunk count
-                     + (chunkIndex * 8L));
-
-                try
-                {
-                    return readLong();
-                }
-                finally
-                {
-                    // back to the original position
-                    seek(position);
-                }
-            }
-            catch (IOException e)
-            {
-                throw new FSReadError(e, filePath);
-            }
+            return offsets.getLong(chunkIndex * 8L);
         }
 
         /**
@@ -346,25 +340,24 @@
          */
         public void resetAndTruncate(int chunkIndex)
         {
-            try
-            {
-                seek(dataLengthOffset
-                     + 8 // size reserved for uncompressed data length
-                     + 4 // size reserved for chunk count
-                     + (chunkIndex * 8L));
-                getChannel().truncate(getFilePointer());
-            }
-            catch (IOException e)
-            {
-                throw new FSWriteError(e, filePath);
-            }
+            count = chunkIndex;
         }
 
-        public void close() throws IOException
+        public void close(long dataLength, int chunks) throws IOException
         {
-            if (getChannel().isOpen()) // if RAF.closed were public we could just use that, but it's not
-                getChannel().force(true);
-            super.close();
+            DataOutputStream out = null;
+            try
+            {
+                out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filePath)));
+                assert chunks == count;
+                writeHeader(out, dataLength, chunks);
+                for (int i = 0 ; i < count ; i++)
+                    out.writeLong(offsets.getLong(i * 8));
+            }
+            finally
+            {
+                FileUtils.closeQuietly(out);
+            }
         }
     }
 
@@ -408,7 +401,7 @@
 
     static class ChunkSerializer implements IVersionedSerializer<Chunk>
     {
-        public void serialize(Chunk chunk, DataOutput out, int version) throws IOException
+        public void serialize(Chunk chunk, DataOutputPlus out, int version) throws IOException
         {
             out.writeLong(chunk.offset);
             out.writeInt(chunk.length);
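
The Writer above no longer seeks around a RandomAccessFile to patch header placeholders; it accumulates chunk offsets in memory (doubling the backing buffer when full, and truncating by simply resetting the count) and writes the complete -CompressionInfo.db content in one pass at close, when the data length and chunk count are finally known. A minimal sketch of that pattern, with a plain long[] standing in for RefCountedMemory and a reduced header:

// Sketch of the in-memory offsets writer; names and the two-field header are illustrative.
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;

final class OffsetsWriter
{
    private long[] offsets = new long[100];
    private int count = 0;

    void addOffset(long offset)
    {
        if (count == offsets.length)
            offsets = Arrays.copyOf(offsets, offsets.length * 2); // amortized doubling
        offsets[count++] = offset;
    }

    void resetAndTruncate(int chunkIndex)
    {
        count = chunkIndex; // later offsets are simply overwritten
    }

    void close(String path, long dataLength) throws IOException
    {
        try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(path))))
        {
            out.writeLong(dataLength);   // header: uncompressed data length
            out.writeInt(count);         // header: number of chunks
            for (int i = 0; i < count; i++)
                out.writeLong(offsets[i]);
        }
    }
}
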
diff --git a/src/java/org/apache/cassandra/io/compress/CompressionParameters.java b/src/java/org/apache/cassandra/io/compress/CompressionParameters.java
index 2df64b4..d3436c0 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressionParameters.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressionParameters.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.io.compress;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
@@ -37,6 +36,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class CompressionParameters
 {
@@ -322,7 +322,7 @@
 
     static class Serializer implements IVersionedSerializer<CompressionParameters>
     {
-        public void serialize(CompressionParameters parameters, DataOutput out, int version) throws IOException
+        public void serialize(CompressionParameters parameters, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
             out.writeInt(parameters.otherOptions.size());
diff --git a/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java b/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
index 2c6f82a..f8999bf 100644
--- a/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
@@ -29,10 +29,10 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.CounterId;
-import org.apache.cassandra.utils.HeapAllocator;
 import org.apache.cassandra.utils.Pair;
 
 public abstract class AbstractSSTableSimpleWriter implements Closeable
@@ -56,9 +56,10 @@
         return new SSTableWriter(
             makeFilename(directory, metadata.ksName, metadata.cfName),
             0, // We don't care about the bloom filter
+            ActiveRepairService.UNREPAIRED_SSTABLE,
             metadata,
             DatabaseDescriptor.getPartitioner(),
-            SSTableMetadata.createCollector(metadata.comparator));
+            new MetadataCollector(metadata.comparator));
     }
 
     // find available generation and pick up filename from that
@@ -83,7 +84,7 @@
         int maxGen = 0;
         for (Descriptor desc : existing)
             maxGen = Math.max(maxGen, desc.generation);
-        return new Descriptor(directory, keyspace, columnFamily, maxGen + 1, true).filenameFor(Component.DATA);
+        return new Descriptor(directory, keyspace, columnFamily, maxGen + 1, Descriptor.Type.TEMP).filenameFor(Component.DATA);
     }
 
     /**
@@ -111,16 +112,16 @@
         currentSuperColumn = name;
     }
 
-    protected void addColumn(Column column) throws IOException
+    protected void addColumn(Cell cell) throws IOException
     {
         if (columnFamily.metadata().isSuper())
         {
             if (currentSuperColumn == null)
-                throw new IllegalStateException("Trying to add a column to a super column family, but no super column has been started.");
+                throw new IllegalStateException("Trying to add a cell to a super column family, but no super cell has been started.");
 
-            column = column.withUpdatedName(CompositeType.build(currentSuperColumn, column.name()));
+            cell = cell.withUpdatedName(columnFamily.getComparator().makeCellName(currentSuperColumn, cell.name().toByteBuffer()));
         }
-        columnFamily.addColumn(column);
+        columnFamily.addColumn(cell);
     }
 
     /**
@@ -131,7 +132,7 @@
      */
     public void addColumn(ByteBuffer name, ByteBuffer value, long timestamp) throws IOException
     {
-        addColumn(new Column(name, value, timestamp));
+        addColumn(new BufferCell(metadata.comparator.cellFromByteBuffer(name), value, timestamp));
     }
 
     /**
@@ -146,7 +147,7 @@
      */
     public void addExpiringColumn(ByteBuffer name, ByteBuffer value, long timestamp, int ttl, long expirationTimestampMS) throws IOException
     {
-        addColumn(new ExpiringColumn(name, value, timestamp, ttl, (int)(expirationTimestampMS / 1000)));
+        addColumn(new BufferExpiringCell(metadata.comparator.cellFromByteBuffer(name), value, timestamp, ttl, (int)(expirationTimestampMS / 1000)));
     }
 
     /**
@@ -156,9 +157,9 @@
      */
     public void addCounterColumn(ByteBuffer name, long value) throws IOException
     {
-        addColumn(new CounterColumn(name,
-                                    CounterContext.instance().createRemote(counterid, 1L, value, HeapAllocator.instance),
-                                    System.currentTimeMillis()));
+        addColumn(new BufferCounterCell(metadata.comparator.cellFromByteBuffer(name),
+                                        CounterContext.instance().createGlobal(counterid, 1L, value),
+                                        System.currentTimeMillis()));
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
index 49a1259..bf4da24 100644
--- a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
@@ -33,15 +33,14 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.io.compress.CompressionParameters;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.utils.Allocator;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -198,21 +197,22 @@
         if (values.size() != boundNames.size())
             throw new InvalidRequestException(String.format("Invalid number of arguments, expecting %d values but got %d", boundNames.size(), values.size()));
 
-        List<ByteBuffer> keys = insert.buildPartitionKeyNames(values);
-        ColumnNameBuilder clusteringPrefix = insert.createClusteringPrefixBuilder(values);
+        QueryOptions options = QueryOptions.forInternalCalls(null, values);
+        List<ByteBuffer> keys = insert.buildPartitionKeyNames(options);
+        Composite clusteringPrefix = insert.createClusteringPrefix(options);
 
         long now = System.currentTimeMillis() * 1000;
         UpdateParameters params = new UpdateParameters(insert.cfm,
-                                                       values,
-                                                       insert.getTimestamp(now, values),
-                                                       insert.getTimeToLive(values),
-                                                       Collections.<ByteBuffer, ColumnGroupMap>emptyMap());
+                                                       options,
+                                                       insert.getTimestamp(now, options),
+                                                       insert.getTimeToLive(options),
+                                                       Collections.<ByteBuffer, CQL3Row>emptyMap());
 
         try
         {
-            for (ByteBuffer key: keys)
+            for (ByteBuffer key : keys)
             {
-                if (writer.currentKey() == null || !key.equals(writer.currentKey().key))
+                if (writer.currentKey() == null || !key.equals(writer.currentKey().getKey()))
                     writer.newRow(key);
                 insert.addUpdateForKey(writer.currentColumnFamily(), key, clusteringPrefix, params);
             }
@@ -339,13 +339,20 @@
 
                 // We need to register the keyspace/table metadata through Schema, otherwise we won't be able to properly
                 // build the insert statement in using().
-                KSMetaData ksm = KSMetaData.newKeyspace(this.schema.ksName,
-                                                        AbstractReplicationStrategy.getClass("org.apache.cassandra.locator.SimpleStrategy"),
-                                                        ImmutableMap.of("replication_factor", "1"),
-                                                        true,
-                                                        Collections.singleton(this.schema));
+                if (Schema.instance.getKSMetaData(this.schema.ksName) == null)
+                {
+                    KSMetaData ksm = KSMetaData.newKeyspace(this.schema.ksName,
+                                                            AbstractReplicationStrategy.getClass("org.apache.cassandra.locator.SimpleStrategy"),
+                                                            ImmutableMap.of("replication_factor", "1"),
+                                                            true,
+                                                            Collections.singleton(this.schema));
+                    Schema.instance.load(ksm);
+                }
+                else if (Schema.instance.getCFMetaData(this.schema.ksName, this.schema.cfName) == null)
+                {
+                    Schema.instance.load(this.schema);
+                }
 
-                Schema.instance.load(ksm);
                 return this;
             }
             catch (RequestValidationException e)
@@ -500,15 +507,15 @@
         @Override
         protected ColumnFamily createColumnFamily()
         {
-            return new TreeMapBackedSortedColumns(metadata)
+            return new ArrayBackedSortedColumns(metadata, false)
             {
                 @Override
-                public void addColumn(Column column, Allocator allocator)
+                public void addColumn(Cell cell)
                 {
-                    super.addColumn(column, allocator);
+                    super.addColumn(cell);
                     try
                     {
-                        countColumn(column);
+                        countColumn(cell);
                     }
                     catch (IOException e)
                     {
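
The builder change above makes schema registration idempotent: the keyspace is loaded only if Schema does not already know it, and the table only if it is missing, so several writers for the same table can be constructed in one process. A rough sketch of that register-if-absent guard, using stand-in types rather than the real Schema/KSMetaData/CFMetaData:

// Register-if-absent sketch; SchemaRegistry, KeyspaceDef and TableDef are stand-ins.
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

final class SchemaRegistry
{
    static final class TableDef { final String name; TableDef(String name) { this.name = name; } }
    static final class KeyspaceDef
    {
        final Map<String, TableDef> tables = new ConcurrentHashMap<>();
    }

    private final Map<String, KeyspaceDef> keyspaces = new ConcurrentHashMap<>();

    /** Load the keyspace and table only when they are not already registered. */
    void registerIfAbsent(String keyspace, TableDef table)
    {
        KeyspaceDef ks = keyspaces.get(keyspace);
        if (ks == null)
        {
            ks = new KeyspaceDef();
            keyspaces.put(keyspace, ks);
        }
        if (!ks.tables.containsKey(table.name))
            ks.tables.put(table.name, table);
    }
}
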
diff --git a/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java b/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java
index aaed765..f74b86f 100644
--- a/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java
+++ b/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java
@@ -19,112 +19,91 @@
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
 import java.util.List;
 
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.db.marshal.CompositeType;
 
 import static org.apache.cassandra.utils.ByteBufferUtil.minimalBufferFor;
 
 public class ColumnNameHelper
 {
+    private static List<ByteBuffer> maybeGrow(List<ByteBuffer> l, int size)
+    {
+        if (l.size() >= size)
+            return l;
+
+        List<ByteBuffer> nl = new ArrayList<>(size);
+        nl.addAll(l);
+        for (int i = l.size(); i < size; i++)
+            nl.add(null);
+        return nl;
+    }
+
+    private static List<ByteBuffer> getComponents(Composite prefix, int size)
+    {
+        List<ByteBuffer> l = new ArrayList<>(size);
+        for (int i = 0; i < size; i++)
+            l.add(prefix.get(i));
+        return l;
+    }
+
     /**
-     * finds the max column name(s)
+     * finds the max cell name component(s)
      *
-     * if comparator is of CompositeType, candidate will be split into its components, and each
-     * component is compared to the component on the same place in maxSeen, and then returning the list
-     * with the max columns.
-     *
-     * will collect at most the number of types in the comparator.
-     *
-     * if comparator is not CompositeType, maxSeen is assumed to be of size 1 and the item there is
-     * compared to the candidate.
+     * Note that this method *can modify maxSeen*.
      *
      * @param maxSeen the max columns seen so far
      * @param candidate the candidate column(s)
      * @param comparator the comparator to use
      * @return a list with the max column(s)
      */
-    public static List<ByteBuffer> maxComponents(List<ByteBuffer> maxSeen, ByteBuffer candidate, AbstractType<?> comparator)
+    public static List<ByteBuffer> maxComponents(List<ByteBuffer> maxSeen, Composite candidate, CellNameType comparator)
     {
-        if (comparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)comparator;
-            if (maxSeen.isEmpty())
-                return Arrays.asList(ct.split(candidate));
+        // For a cell name, there is no reason to look further than the clustering prefix
+        // (and comparing the collection element would actually crash)
+        int size = Math.min(candidate.size(), comparator.clusteringPrefixSize());
 
-            int typeCount = getTypeCount(ct);
+        if (maxSeen.isEmpty())
+            return getComponents(candidate, size);
 
-            List<ByteBuffer> components = Arrays.asList(ct.split(candidate));
-            List<ByteBuffer> biggest = maxSeen.size() > components.size() ? maxSeen : components;
-            // if typecount is less than both the components and maxseen, we only keep typecount columns.
-            int minSize = Math.min(typeCount, Math.min(components.size(), maxSeen.size()));
-            int maxSize = Math.min(typeCount, biggest.size());
-            List<ByteBuffer> retList = new ArrayList<ByteBuffer>(maxSize);
+        // In most cases maxSeen is big enough to hold the result, so update it in place
+        maxSeen = maybeGrow(maxSeen, size);
 
-            for (int i = 0; i < minSize; i++)
-                retList.add(ColumnNameHelper.max(maxSeen.get(i), components.get(i), ct.types.get(i)));
-            for (int i = minSize; i < maxSize; i++)
-                retList.add(biggest.get(i));
+        for (int i = 0; i < size; i++)
+            maxSeen.set(i, max(maxSeen.get(i), candidate.get(i), comparator.subtype(i)));
 
-            return retList;
-        }
-        else
-        {
-            if (maxSeen.size() == 0)
-                return Collections.singletonList(candidate);
-            return Collections.singletonList(ColumnNameHelper.max(maxSeen.get(0), candidate, comparator));
-        }
+        return maxSeen;
     }
+
     /**
-     * finds the min column name(s)
+     * finds the min cell name component(s)
      *
-     * if comparator is of CompositeType, candidate will be split into its components, and each
-     * component is compared to the component on the same place in minSeen, and then returning the list
-     * with the min columns.
-     *
-     * if comparator is not CompositeType, maxSeen is assumed to be of size 1 and the item there is
-     * compared to the candidate.
+     * Note that this method *can modify minSeen*.
      *
      * @param minSeen the max columns seen so far
      * @param candidate the candidate column(s)
      * @param comparator the comparator to use
      * @return a list with the min column(s)
      */
-    public static List<ByteBuffer> minComponents(List<ByteBuffer> minSeen, ByteBuffer candidate, AbstractType<?> comparator)
+    public static List<ByteBuffer> minComponents(List<ByteBuffer> minSeen, Composite candidate, CellNameType comparator)
     {
-        if (comparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)comparator;
-            if (minSeen.isEmpty())
-                return Arrays.asList(ct.split(candidate));
+        // For a cell name, there is no reason to look further than the clustering prefix
+        // (and comparing the collection element would actually crash)
+        int size = Math.min(candidate.size(), comparator.clusteringPrefixSize());
 
-            int typeCount = getTypeCount(ct);
+        if (minSeen.isEmpty())
+            return getComponents(candidate, size);
 
-            List<ByteBuffer> components = Arrays.asList(ct.split(candidate));
-            List<ByteBuffer> biggest = minSeen.size() > components.size() ? minSeen : components;
-            // if typecount is less than both the components and maxseen, we only collect typecount columns.
-            int minSize = Math.min(typeCount, Math.min(components.size(), minSeen.size()));
-            int maxSize = Math.min(typeCount, biggest.size());
-            List<ByteBuffer> retList = new ArrayList<ByteBuffer>(maxSize);
+        // In most cases minSeen is big enough to hold the result, so update it in place
+        minSeen = maybeGrow(minSeen, size);
 
-            for (int i = 0; i < minSize; i++)
-                retList.add(ColumnNameHelper.min(minSeen.get(i), components.get(i), ct.types.get(i)));
-            for (int i = minSize; i < maxSize; i++)
-                retList.add(biggest.get(i));
+        for (int i = 0; i < size; i++)
+            minSeen.set(i, min(minSeen.get(i), candidate.get(i), comparator.subtype(i)));
 
-            return retList;
-        }
-        else
-        {
-            if (minSeen.size() == 0)
-                return Collections.singletonList(candidate);
-            return Collections.singletonList(ColumnNameHelper.min(minSeen.get(0), candidate, comparator));
-
-        }
+        return minSeen;
     }
 
     /**
@@ -139,6 +118,11 @@
      */
     private static ByteBuffer min(ByteBuffer b1, ByteBuffer b2, AbstractType<?> comparator)
     {
+        if (b1 == null)
+            return b2;
+        if (b2 == null)
+            return b1;
+
         if (comparator.compare(b1, b2) >= 0)
             return b2;
         return b1;
@@ -156,24 +140,25 @@
      */
     private static ByteBuffer max(ByteBuffer b1, ByteBuffer b2, AbstractType<?> comparator)
     {
+        if (b1 == null)
+            return b2;
+        if (b2 == null)
+            return b1;
+
         if (comparator.compare(b1, b2) >= 0)
             return b1;
         return b2;
     }
 
     /**
-     * if columnNameComparator is CompositeType the columns are compared by components using the subcomparator
-     * on the same position.
-     *
-     * if comparator is not CompositeType, the lists are assumed to be of max size 1 and compared using the comparator
-     * directly.
+     * Merge 2 lists of min cell name components.
      *
      * @param minColumnNames lhs
      * @param candidates rhs
-     * @param columnNameComparator comparator to use
+     * @param comparator comparator to use
      * @return a list with smallest column names according to (sub)comparator
      */
-    public static List<ByteBuffer> mergeMin(List<ByteBuffer> minColumnNames, List<ByteBuffer> candidates, AbstractType<?> columnNameComparator)
+    public static List<ByteBuffer> mergeMin(List<ByteBuffer> minColumnNames, List<ByteBuffer> candidates, CellNameType comparator)
     {
         if (minColumnNames.isEmpty())
             return minimalBuffersFor(candidates);
@@ -181,27 +166,18 @@
         if (candidates.isEmpty())
             return minColumnNames;
 
-        if (columnNameComparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)columnNameComparator;
-            List<ByteBuffer> biggest = minColumnNames.size() > candidates.size() ? minColumnNames : candidates;
-            int typeCount = getTypeCount(ct);
-            int minSize = Math.min(typeCount, Math.min(minColumnNames.size(), candidates.size()));
-            int maxSize = Math.min(typeCount, biggest.size());
+        List<ByteBuffer> biggest = minColumnNames.size() > candidates.size() ? minColumnNames : candidates;
+        List<ByteBuffer> smallest = minColumnNames.size() > candidates.size() ? candidates : minColumnNames;
 
-            List<ByteBuffer> retList = new ArrayList<ByteBuffer>(maxSize);
+        // We want to always copy the smallest list, and maybeGrow does it only if it's actually smaller
+        List<ByteBuffer> retList = smallest.size() == biggest.size()
+                                 ? new ArrayList<>(smallest)
+                                 : maybeGrow(smallest, biggest.size());
 
-            for (int i = 0; i < minSize; i++)
-                retList.add(minimalBufferFor(min(minColumnNames.get(i), candidates.get(i), ct.types.get(i))));
-            for (int i = minSize; i < maxSize; i++)
-                retList.add(minimalBufferFor(biggest.get(i)));
+        for (int i = 0; i < biggest.size(); i++)
+            retList.set(i, minimalBufferFor(min(retList.get(i), biggest.get(i), comparator.subtype(i))));
 
-            return retList;
-        }
-        else
-        {
-            return Collections.singletonList(minimalBufferFor(min(minColumnNames.get(0), candidates.get(0), columnNameComparator)));
-        }
+        return retList;
     }
 
     private static List<ByteBuffer> minimalBuffersFor(List<ByteBuffer> candidates)
@@ -213,18 +189,14 @@
     }
 
     /**
-     * if columnNameComparator is CompositeType the columns are compared by components using the subcomparator
-     * on the same position.
-     *
-     * if comparator is not CompositeType, the lists are assumed to be of max size 1 and compared using the comparator
-     * directly.
+     * Merge 2 lists of max cell name components.
      *
      * @param maxColumnNames lhs
      * @param candidates rhs
-     * @param columnNameComparator comparator to use
+     * @param comparator comparator to use
      * @return a list with biggest column names according to (sub)comparator
      */
-    public static List<ByteBuffer> mergeMax(List<ByteBuffer> maxColumnNames, List<ByteBuffer> candidates, AbstractType<?> columnNameComparator)
+    public static List<ByteBuffer> mergeMax(List<ByteBuffer> maxColumnNames, List<ByteBuffer> candidates, CellNameType comparator)
     {
         if (maxColumnNames.isEmpty())
             return minimalBuffersFor(candidates);
@@ -232,31 +204,17 @@
         if (candidates.isEmpty())
             return maxColumnNames;
 
-        if (columnNameComparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)columnNameComparator;
-            List<ByteBuffer> biggest = maxColumnNames.size() > candidates.size() ? maxColumnNames : candidates;
-            int typeCount = getTypeCount(ct);
-            int minSize = Math.min(typeCount, Math.min(maxColumnNames.size(), candidates.size()));
-            int maxSize = Math.min(typeCount, biggest.size());
-            List<ByteBuffer> retList = new ArrayList<ByteBuffer>(maxSize);
+        List<ByteBuffer> biggest = maxColumnNames.size() > candidates.size() ? maxColumnNames : candidates;
+        List<ByteBuffer> smallest = maxColumnNames.size() > candidates.size() ? candidates : maxColumnNames;
 
-            for (int i = 0; i < minSize; i++)
-                retList.add(minimalBufferFor(max(maxColumnNames.get(i), candidates.get(i), ct.types.get(i))));
-            for (int i = minSize; i < maxSize; i++)
-                retList.add(minimalBufferFor(biggest.get(i)));
+        // We want to always copy the smallest list, and maybeGrow does it only if it's actually smaller
+        List<ByteBuffer> retList = smallest.size() == biggest.size()
+                                 ? new ArrayList<>(smallest)
+                                 : maybeGrow(smallest, biggest.size());
 
-            return retList;
-        }
-        else
-        {
-            return Collections.singletonList(minimalBufferFor(max(maxColumnNames.get(0), candidates.get(0), columnNameComparator)));
-        }
+        for (int i = 0; i < biggest.size(); i++)
+            retList.set(i, minimalBufferFor(max(retList.get(i), biggest.get(i), comparator.subtype(i))));
 
-    }
-
-    private static int getTypeCount(CompositeType ct)
-    {
-        return ct.types.get(ct.types.size() - 1) instanceof ColumnToCollectionType ? ct.types.size() - 1 : ct.types.size();
+        return retList;
     }
 }
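
The rewritten helpers above track per-component minima and maxima over clustering prefixes: the tracked list is grown with nulls as needed, null means "no value seen yet for this position", and each position is compared with its own subtype comparator. A self-contained sketch of the componentwise max case, with a single Comparator<ByteBuffer> standing in for comparator.subtype(i):

// Componentwise max tracking, following the maybeGrow / null-means-unseen approach.
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

final class ComponentTracker
{
    static List<ByteBuffer> maybeGrow(List<ByteBuffer> l, int size)
    {
        if (l.size() >= size)
            return l;
        List<ByteBuffer> nl = new ArrayList<>(size);
        nl.addAll(l);
        for (int i = l.size(); i < size; i++)
            nl.add(null); // null marks "no value seen for this position yet"
        return nl;
    }

    static ByteBuffer max(ByteBuffer b1, ByteBuffer b2, Comparator<ByteBuffer> cmp)
    {
        if (b1 == null) return b2;
        if (b2 == null) return b1;
        return cmp.compare(b1, b2) >= 0 ? b1 : b2;
    }

    /** Update maxSeen in place (after growing it if needed) with the candidate's components. */
    static List<ByteBuffer> maxComponents(List<ByteBuffer> maxSeen, List<ByteBuffer> candidate, Comparator<ByteBuffer> cmp)
    {
        int size = candidate.size();
        if (maxSeen.isEmpty())
            return new ArrayList<>(candidate);
        maxSeen = maybeGrow(maxSeen, size);
        for (int i = 0; i < size; i++)
            maxSeen.set(i, max(maxSeen.get(i), candidate.get(i), cmp));
        return maxSeen;
    }
}
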
diff --git a/src/java/org/apache/cassandra/io/sstable/ColumnStats.java b/src/java/org/apache/cassandra/io/sstable/ColumnStats.java
index 446c41c..873aa09 100644
--- a/src/java/org/apache/cassandra/io/sstable/ColumnStats.java
+++ b/src/java/org/apache/cassandra/io/sstable/ColumnStats.java
@@ -42,7 +42,16 @@
     public final List<ByteBuffer> minColumnNames;
     public final List<ByteBuffer> maxColumnNames;
 
-    public ColumnStats(int columnCount, long minTimestamp, long maxTimestamp, int maxLocalDeletionTime, StreamingHistogram tombstoneHistogram, List<ByteBuffer> minColumnNames, List<ByteBuffer> maxColumnNames)
+    public final boolean hasLegacyCounterShards;
+
+    public ColumnStats(int columnCount,
+                       long minTimestamp,
+                       long maxTimestamp,
+                       int maxLocalDeletionTime,
+                       StreamingHistogram tombstoneHistogram,
+                       List<ByteBuffer> minColumnNames,
+                       List<ByteBuffer> maxColumnNames,
+                       boolean hasLegacyCounterShards)
     {
         this.minTimestamp = minTimestamp;
         this.maxTimestamp = maxTimestamp;
@@ -51,6 +60,7 @@
         this.tombstoneHistogram = tombstoneHistogram;
         this.minColumnNames = minColumnNames;
         this.maxColumnNames = maxColumnNames;
+        this.hasLegacyCounterShards = hasLegacyCounterShards;
     }
 
     public static class MinTracker<T extends Comparable<T>>
diff --git a/src/java/org/apache/cassandra/io/sstable/Component.java b/src/java/org/apache/cassandra/io/sstable/Component.java
index 35efca5..3eacd07 100644
--- a/src/java/org/apache/cassandra/io/sstable/Component.java
+++ b/src/java/org/apache/cassandra/io/sstable/Component.java
@@ -43,10 +43,6 @@
         PRIMARY_INDEX("Index.db"),
         // serialized bloom filter for the row keys in the sstable
         FILTER("Filter.db"),
-        // 0-length file that is created when an sstable is ready to be deleted
-        // @deprecated: deletion of compacted file is based on the lineag information stored in the compacted sstabl
-        // metadata. This ensure we can guarantee never using a sstable and some of its parents, even in case of failure.
-        COMPACTED_MARKER("Compacted"),
         // file to hold information about uncompressed data length, chunk offsets etc.
         COMPRESSION_INFO("CompressionInfo.db"),
         // statistical metadata about the content of the sstable
@@ -81,7 +77,6 @@
     public final static Component DATA = new Component(Type.DATA);
     public final static Component PRIMARY_INDEX = new Component(Type.PRIMARY_INDEX);
     public final static Component FILTER = new Component(Type.FILTER);
-    public final static Component COMPACTED_MARKER = new Component(Type.COMPACTED_MARKER);
     public final static Component COMPRESSION_INFO = new Component(Type.COMPRESSION_INFO);
     public final static Component STATS = new Component(Type.STATS);
     public final static Component DIGEST = new Component(Type.DIGEST);
@@ -133,7 +128,6 @@
             case DATA:              component = Component.DATA;                         break;
             case PRIMARY_INDEX:     component = Component.PRIMARY_INDEX;                break;
             case FILTER:            component = Component.FILTER;                       break;
-            case COMPACTED_MARKER:  component = Component.COMPACTED_MARKER;             break;
             case COMPRESSION_INFO:  component = Component.COMPRESSION_INFO;             break;
             case STATS:             component = Component.STATS;                        break;
             case DIGEST:            component = Component.DIGEST;                       break;
diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
index d65da45..4415db4 100644
--- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java
+++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
@@ -22,6 +22,9 @@
 
 import com.google.common.base.Objects;
 
+import org.apache.cassandra.io.sstable.metadata.IMetadataSerializer;
+import org.apache.cassandra.io.sstable.metadata.LegacyMetadataSerializer;
+import org.apache.cassandra.io.sstable.metadata.MetadataSerializer;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.io.sstable.Component.separator;
@@ -44,9 +47,8 @@
     public static class Version
     {
         // This needs to be at the begining for initialization sake
-        public static final String current_version = "jb";
+        public static final String current_version = "ka";
 
-        // ic (1.2.5): omits per-row bloom filter of column names
         // ja (2.0.0): super columns are serialized as composites (note that there is no real format change,
         //               this is mostly a marker to know if we should expect super columns or not. We do need
         //               a major version bump however, because we should not allow streaming of super columns
@@ -57,31 +59,33 @@
         //             tracks max/min column values (according to comparator)
         // jb (2.0.1): switch from crc32 to adler32 for compression checksums
         //             checksum the compressed data
+        // ka (2.1.0): new Statistics.db file format
+        //             index summaries can be downsampled and the sampling level is persisted
+        //             switch uncompressed checksums to adler32
+        //             tracks presence of legacy (local and remote) counter shards
 
         public static final Version CURRENT = new Version(current_version);
 
         private final String version;
 
         public final boolean isLatestVersion;
-        public final boolean hasSuperColumns;
-        public final boolean tracksMaxLocalDeletionTime;
-        public final boolean hasBloomFilterFPChance;
-        public final boolean offHeapSummaries;
-        public final boolean hasRowSizeAndColumnCount;
-        public final boolean tracksMaxMinColumnNames;
         public final boolean hasPostCompressionAdlerChecksums;
+        public final boolean hasSamplingLevel;
+        public final boolean newStatsFile;
+        public final boolean hasAllAdlerChecksums;
+        public final boolean hasRepairedAt;
+        public final boolean tracksLegacyCounterShards;
 
         public Version(String version)
         {
             this.version = version;
-            tracksMaxLocalDeletionTime = version.compareTo("ja") >= 0;
             isLatestVersion = version.compareTo(current_version) == 0;
-            hasSuperColumns = version.compareTo("ja") < 0;
-            hasBloomFilterFPChance = version.compareTo("ja") >= 0;
-            offHeapSummaries = version.compareTo("ja") >= 0;
-            hasRowSizeAndColumnCount = version.compareTo("ja") < 0;
-            tracksMaxMinColumnNames = version.compareTo("ja") >= 0;
             hasPostCompressionAdlerChecksums = version.compareTo("jb") >= 0;
+            hasSamplingLevel = version.compareTo("ka") >= 0;
+            newStatsFile = version.compareTo("ka") >= 0;
+            hasAllAdlerChecksums = version.compareTo("ka") >= 0;
+            hasRepairedAt = version.compareTo("ka") >= 0;
+            tracksLegacyCounterShards = version.compareTo("ka") >= 0;
         }
 
         /**
@@ -96,12 +100,7 @@
 
         public boolean isCompatible()
         {
-            return version.compareTo("ic") >= 0 && version.charAt(0) <= CURRENT.version.charAt(0);
-        }
-
-        public boolean isStreamCompatible()
-        {
-            return isCompatible() && version.charAt(0) >= 'j';
+            return version.compareTo("ja") >= 0 && version.charAt(0) <= CURRENT.version.charAt(0);
         }
 
         @Override
@@ -113,11 +112,7 @@
         @Override
         public boolean equals(Object o)
         {
-            if (o == this)
-                return true;
-            if (!(o instanceof Version))
-                return false;
-            return version.equals(((Version)o).version);
+            return o == this || o instanceof Version && version.equals(((Version) o).version);
         }
 
         @Override
@@ -127,29 +122,41 @@
         }
     }
 
+    public static enum Type
+    {
+        TEMP("tmp", true), TEMPLINK("tmplink", true), FINAL(null, false);
+        public final boolean isTemporary;
+        public final String marker;
+        Type(String marker, boolean isTemporary)
+        {
+            this.isTemporary = isTemporary;
+            this.marker = marker;
+        }
+    }
+
     public final File directory;
     /** version has the following format: <code>[a-z]+</code> */
     public final Version version;
     public final String ksname;
     public final String cfname;
     public final int generation;
-    public final boolean temporary;
+    public final Type type;
     private final int hashCode;
 
     /**
      * A descriptor that assumes CURRENT_VERSION.
      */
-    public Descriptor(File directory, String ksname, String cfname, int generation, boolean temp)
+    public Descriptor(File directory, String ksname, String cfname, int generation, Type temp)
     {
         this(Version.CURRENT, directory, ksname, cfname, generation, temp);
     }
 
-    public Descriptor(String version, File directory, String ksname, String cfname, int generation, boolean temp)
+    public Descriptor(String version, File directory, String ksname, String cfname, int generation, Type temp)
     {
         this(new Version(version), directory, ksname, cfname, generation, temp);
     }
 
-    public Descriptor(Version version, File directory, String ksname, String cfname, int generation, boolean temp)
+    public Descriptor(Version version, File directory, String ksname, String cfname, int generation, Type temp)
     {
         assert version != null && directory != null && ksname != null && cfname != null;
         this.version = version;
@@ -157,13 +164,13 @@
         this.ksname = ksname;
         this.cfname = cfname;
         this.generation = generation;
-        temporary = temp;
+        type = temp;
         hashCode = Objects.hashCode(directory, generation, ksname, cfname, temp);
     }
 
     public Descriptor withGeneration(int newGeneration)
     {
-        return new Descriptor(version, directory, ksname, cfname, newGeneration, temporary);
+        return new Descriptor(version, directory, ksname, cfname, newGeneration, type);
     }
 
     public String filenameFor(Component component)
@@ -175,12 +182,25 @@
     {
         StringBuilder buff = new StringBuilder();
         buff.append(directory).append(File.separatorChar);
+        appendFileName(buff);
+        return buff.toString();
+    }
+
+    private void appendFileName(StringBuilder buff)
+    {
         buff.append(ksname).append(separator);
         buff.append(cfname).append(separator);
-        if (temporary)
-            buff.append(SSTable.TEMPFILE_MARKER).append(separator);
+        if (type.isTemporary)
+            buff.append(type.marker).append(separator);
         buff.append(version).append(separator);
         buff.append(generation);
+    }
+
+    public String relativeFilenameFor(Component component)
+    {
+        final StringBuilder buff = new StringBuilder();
+        appendFileName(buff);
+        buff.append(separator).append(component.name());
         return buff.toString();
     }
 
@@ -236,15 +256,20 @@
 
         // optional temporary marker
         nexttok = st.nextToken();
-        boolean temporary = false;
-        if (nexttok.equals(SSTable.TEMPFILE_MARKER))
+        Type type = Type.FINAL;
+        if (nexttok.equals(Type.TEMP.marker))
         {
-            temporary = true;
+            type = Type.TEMP;
+            nexttok = st.nextToken();
+        }
+        else if (nexttok.equals(Type.TEMPLINK.marker))
+        {
+            type = Type.TEMPLINK;
             nexttok = st.nextToken();
         }
 
         if (!Version.validate(nexttok))
-            throw new UnsupportedOperationException("SSTable " + name + " is too old to open.  Upgrade to 1.2.5 first, and run upgradesstables");
+            throw new UnsupportedOperationException("SSTable " + name + " is too old to open.  Upgrade to 2.0 first, and run upgradesstables");
         Version version = new Version(nexttok);
 
         nexttok = st.nextToken();
@@ -255,16 +280,24 @@
         if (!skipComponent)
             component = st.nextToken();
         directory = directory != null ? directory : new File(".");
-        return Pair.create(new Descriptor(version, directory, ksname, cfname, generation, temporary), component);
+        return Pair.create(new Descriptor(version, directory, ksname, cfname, generation, type), component);
     }
 
     /**
-     * @param temporary temporary flag
+     * @param type the new Type ('temporary' status) for the cloned descriptor
      * @return A clone of this descriptor with the given 'temporary' status.
      */
-    public Descriptor asTemporary(boolean temporary)
+    public Descriptor asType(Type type)
     {
-        return new Descriptor(version, directory, ksname, cfname, generation, temporary);
+        return new Descriptor(version, directory, ksname, cfname, generation, type);
+    }
+
+    public IMetadataSerializer getMetadataSerializer()
+    {
+        if (version.newStatsFile)
+            return new MetadataSerializer();
+        else
+            return new LegacyMetadataSerializer();
     }
 
     /**
@@ -275,17 +308,6 @@
         return version.isCompatible();
     }
 
-    /**
-     * @return true if the current Cassandra version can stream the given sstable version
-     * from another node.  This is stricter than opening it locally [isCompatible] because
-     * streaming needs to rebuild all the non-data components, and it only knows how to write
-     * the latest version.
-     */
-    public boolean isStreamCompatible()
-    {
-        return version.isStreamCompatible();
-    }
-
     @Override
     public String toString()
     {
@@ -300,7 +322,11 @@
         if (!(o instanceof Descriptor))
             return false;
         Descriptor that = (Descriptor)o;
-        return that.directory.equals(this.directory) && that.generation == this.generation && that.ksname.equals(this.ksname) && that.cfname.equals(this.cfname) && that.temporary == this.temporary;
+        return that.directory.equals(this.directory)
+                       && that.generation == this.generation
+                       && that.ksname.equals(this.ksname)
+                       && that.cfname.equals(this.cfname)
+                       && that.type == this.type;
     }
 
     @Override
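For context, a minimal sketch (not part of the patch) of how the new Descriptor.Type enum surfaces in generated filenames; the directory, keyspace/table names, and generation below are made up, and the exact version string depends on Descriptor.Version.CURRENT:

    import java.io.File;
    import org.apache.cassandra.io.sstable.Component;
    import org.apache.cassandra.io.sstable.Descriptor;

    public class DescriptorTypeSketch
    {
        public static void main(String[] args)
        {
            // TEMP and TEMPLINK carry a marker ("tmp" / "tmplink"); FINAL omits the marker entirely
            Descriptor tmp = new Descriptor(new File("/var/lib/cassandra/data/ks/cf"),
                                            "ks", "cf", 42, Descriptor.Type.TEMP);
            // e.g. /var/lib/cassandra/data/ks/cf/ks-cf-tmp-ka-42-Data.db
            System.out.println(tmp.filenameFor(Component.DATA));

            // asType() clones the descriptor with a different temporary status
            Descriptor fin = tmp.asType(Descriptor.Type.FINAL);
            // e.g. /var/lib/cassandra/data/ks/cf/ks-cf-ka-42-Data.db
            System.out.println(fin.filenameFor(Component.DATA));
        }
    }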
diff --git a/src/java/org/apache/cassandra/io/sstable/Downsampling.java b/src/java/org/apache/cassandra/io/sstable/Downsampling.java
new file mode 100644
index 0000000..6842b25
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/Downsampling.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.util.*;
+
+public class Downsampling
+{
+    /**
+     * The base (down)sampling level determines the granularity at which we can down/upsample.
+     *
+ * A higher number allows us to approximate the ideal sampling more closely.  (It could also mean we do a lot of
+     * expensive almost-no-op resamplings from N to N-1, but the thresholds in IndexSummaryManager prevent that.)
+     *
+     * BSL must be a power of two in order to have good sampling patterns. This cannot be changed without rebuilding
+     * all index summaries at full sampling; for now we treat it as a constant.
+     */
+    public static final int BASE_SAMPLING_LEVEL = 128;
+
+    private static final Map<Integer, List<Integer>> samplePatternCache = new HashMap<>();
+
+    private static final Map<Integer, List<Integer>> originalIndexCache = new HashMap<>();
+
+    /**
+     * Gets a list L of starting indices for downsampling rounds: the first round should start with the offset
+     * given by L[0], the second by the offset in L[1], etc.
+     *
+     * @param samplingLevel the sampling level to generate a pattern for
+     *
+     * @return A list of `samplingLevel` unique indices between 0 (inclusive) and `samplingLevel` (exclusive)
+     */
+    public static List<Integer> getSamplingPattern(int samplingLevel)
+    {
+        List<Integer> pattern = samplePatternCache.get(samplingLevel);
+        if (pattern != null)
+            return pattern;
+
+        if (samplingLevel <= 1)
+            return Arrays.asList(0);
+
+        int[] odds = new int[samplingLevel / 2];
+        int[] evens = new int[samplingLevel / 2];
+        for (int i = 1; i < samplingLevel; i += 2)
+            odds[i/2] = i;
+        for (int i = 0; i < samplingLevel; i += 2)
+            evens[i/2] = i;
+
+        // especially for latter rounds, it's important that we spread out the start points, so we'll
+        // make a recursive call to get an ordering for this list of start points
+        List<Integer> ordering = getSamplingPattern(samplingLevel/2);
+        List<Integer> startIndices = new ArrayList<>(samplingLevel);
+
+        for (Integer index : ordering)
+            startIndices.add(odds[index]);
+        for (Integer index : ordering)
+            startIndices.add(evens[index]);
+
+        samplePatternCache.put(samplingLevel, startIndices);
+        return startIndices;
+    }
+
+    /**
+     * Returns a list that can be used to translate current index summary indexes to their original index before
+     * downsampling.  (This repeats every `samplingLevel`, so that's how many entries we return.)
+     *
+     * For example, if [7, 15] is returned, the current index summary entry at index 0 was originally
+     * at index 7, and the current index 1 was originally at index 15.
+     *
+     * @param samplingLevel the current sampling level for the index summary
+     *
+     * @return a list of original indexes for current summary entries
+     */
+    public static List<Integer> getOriginalIndexes(int samplingLevel)
+    {
+        List<Integer> originalIndexes = originalIndexCache.get(samplingLevel);
+        if (originalIndexes != null)
+            return originalIndexes;
+
+        List<Integer> pattern = getSamplingPattern(BASE_SAMPLING_LEVEL).subList(0, BASE_SAMPLING_LEVEL - samplingLevel);
+        originalIndexes = new ArrayList<>(samplingLevel);
+        for (int j = 0; j < BASE_SAMPLING_LEVEL; j++)
+        {
+            if (!pattern.contains(j))
+                originalIndexes.add(j);
+        }
+
+        originalIndexCache.put(samplingLevel, originalIndexes);
+        return originalIndexes;
+    }
+
+    /**
+     * Calculates the effective index interval after the entry at `index` in an IndexSummary.  In other words, this
+     * returns the number of partitions in the primary on-disk index before the next partition that has an entry in
+     * the index summary.  If samplingLevel == BASE_SAMPLING_LEVEL, this will be equal to the index interval.
+     * @param index an index into an IndexSummary
+     * @param samplingLevel the current sampling level for that IndexSummary
+     * @param minIndexInterval the min index interval (effective index interval at full sampling)
+     * @return the number of partitions before the next index summary entry, inclusive on one end
+     */
+    public static int getEffectiveIndexIntervalAfterIndex(int index, int samplingLevel, int minIndexInterval)
+    {
+        assert index >= -1;
+        List<Integer> originalIndexes = getOriginalIndexes(samplingLevel);
+        if (index == -1)
+            return originalIndexes.get(0) * minIndexInterval;
+
+        index %= samplingLevel;
+        if (index == originalIndexes.size() - 1)
+        {
+            // account for partitions after the "last" entry as well as partitions before the "first" entry
+            return ((BASE_SAMPLING_LEVEL - originalIndexes.get(index)) + originalIndexes.get(0)) * minIndexInterval;
+        }
+        else
+        {
+            return (originalIndexes.get(index + 1) - originalIndexes.get(index)) * minIndexInterval;
+        }
+    }
+
+    public static int[] getStartPoints(int currentSamplingLevel, int newSamplingLevel)
+    {
+        List<Integer> allStartPoints = getSamplingPattern(BASE_SAMPLING_LEVEL);
+
+        // calculate starting indexes for sampling rounds
+        int initialRound = BASE_SAMPLING_LEVEL - currentSamplingLevel;
+        int numRounds = Math.abs(currentSamplingLevel - newSamplingLevel);
+        int[] startPoints = new int[numRounds];
+        for (int i = 0; i < numRounds; ++i)
+        {
+            int start = allStartPoints.get(initialRound + i);
+
+            // our "ideal" start points will be affected by the removal of items in earlier rounds, so go through all
+            // earlier rounds, and if we see an index that comes before our ideal start point, decrement the start point
+            int adjustment = 0;
+            for (int j = 0; j < initialRound; ++j)
+            {
+                if (allStartPoints.get(j) < start)
+                    adjustment++;
+            }
+            startPoints[i] = start - adjustment;
+        }
+        return startPoints;
+    }
+}
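As an illustration (not part of the patch), a short sketch of what getSamplingPattern() produces for a small level; the value below was worked out by hand from the recursion above, which spreads odd start points before even ones:

    import java.util.List;
    import org.apache.cassandra.io.sstable.Downsampling;

    public class DownsamplingSketch
    {
        public static void main(String[] args)
        {
            // Start points for successive downsampling rounds at level 8: odd indices are
            // removed in earlier rounds, and each half is recursively spread out.
            List<Integer> pattern = Downsampling.getSamplingPattern(8);
            System.out.println(pattern); // [7, 3, 5, 1, 6, 2, 4, 0]
        }
    }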
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
index d70ff19..b0bbfc4 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
@@ -18,14 +18,16 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.*;
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 
+import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileMark;
 import org.apache.cassandra.io.util.FileUtils;
@@ -67,21 +69,23 @@
     /**
      * Deserialize the index into a structure and return it
      *
-     * @param in - input source
+     * @param in input source
+     * @param type the comparator type for the column family
      *
      * @return ArrayList<IndexInfo> - list of de-serialized indexes
      * @throws IOException if an I/O error occurs.
      */
-    public static List<IndexInfo> deserializeIndex(FileDataInput in) throws IOException
+    public static List<IndexInfo> deserializeIndex(FileDataInput in, CType type) throws IOException
     {
         int columnIndexSize = in.readInt();
         if (columnIndexSize == 0)
             return Collections.<IndexInfo>emptyList();
         ArrayList<IndexInfo> indexList = new ArrayList<IndexInfo>();
         FileMark mark = in.mark();
+        ISerializer<IndexInfo> serializer = type.indexSerializer();
         while (in.bytesPastMark(mark) < columnIndexSize)
         {
-            indexList.add(IndexInfo.deserialize(in));
+            indexList.add(serializer.deserialize(in));
         }
         assert in.bytesPastMark(mark) == columnIndexSize;
 
@@ -105,10 +109,10 @@
      *
      * @return int index
      */
-    public static int indexFor(ByteBuffer name, List<IndexInfo> indexList, AbstractType<?> comparator, boolean reversed, int lastIndex)
+    public static int indexFor(Composite name, List<IndexInfo> indexList, CType comparator, boolean reversed, int lastIndex)
     {
-        if (name.remaining() == 0 && reversed)
-            return indexList.size() - 1;
+        if (name.isEmpty())
+            return lastIndex >= 0 ? lastIndex : reversed ? indexList.size() - 1 : 0;
 
         if (lastIndex >= indexList.size())
             return -1;
@@ -145,19 +149,21 @@
         return startIdx + (index < 0 ? -index - (reversed ? 2 : 1) : index);
     }
 
-    public static Comparator<IndexInfo> getComparator(final AbstractType<?> nameComparator, boolean reversed)
+    public static Comparator<IndexInfo> getComparator(final CType nameComparator, boolean reversed)
     {
-        return reversed ? nameComparator.indexReverseComparator : nameComparator.indexComparator;
+        return reversed ? nameComparator.indexReverseComparator() : nameComparator.indexComparator();
     }
 
     public static class IndexInfo
     {
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexInfo(null, null, 0, 0));
+
         public final long width;
-        public final ByteBuffer lastName;
-        public final ByteBuffer firstName;
+        public final Composite lastName;
+        public final Composite firstName;
         public final long offset;
 
-        public IndexInfo(ByteBuffer firstName, ByteBuffer lastName, long offset, long width)
+        public IndexInfo(Composite firstName, Composite lastName, long offset, long width)
         {
             this.firstName = firstName;
             this.lastName = lastName;
@@ -165,37 +171,43 @@
             this.width = width;
         }
 
-        public void serialize(DataOutput out) throws IOException
+        public static class Serializer implements ISerializer<IndexInfo>
         {
-            ByteBufferUtil.writeWithShortLength(firstName, out);
-            ByteBufferUtil.writeWithShortLength(lastName, out);
-            out.writeLong(offset);
-            out.writeLong(width);
+            private final CType type;
+
+            public Serializer(CType type)
+            {
+                this.type = type;
+            }
+
+            public void serialize(IndexInfo info, DataOutputPlus out) throws IOException
+            {
+                type.serializer().serialize(info.firstName, out);
+                type.serializer().serialize(info.lastName, out);
+                out.writeLong(info.offset);
+                out.writeLong(info.width);
+            }
+
+            public IndexInfo deserialize(DataInput in) throws IOException
+            {
+                return new IndexInfo(type.serializer().deserialize(in),
+                                     type.serializer().deserialize(in),
+                                     in.readLong(),
+                                     in.readLong());
+            }
+
+            public long serializedSize(IndexInfo info, TypeSizes typeSizes)
+            {
+                return type.serializer().serializedSize(info.firstName, typeSizes)
+                     + type.serializer().serializedSize(info.lastName, typeSizes)
+                     + typeSizes.sizeof(info.offset)
+                     + typeSizes.sizeof(info.width);
+            }
         }
 
-        public int serializedSize(TypeSizes typeSizes)
+        public long unsharedHeapSize()
         {
-            int firstNameSize = firstName.remaining();
-            int lastNameSize = lastName.remaining();
-            return typeSizes.sizeof((short) firstNameSize) + firstNameSize +
-                   typeSizes.sizeof((short) lastNameSize) + lastNameSize +
-                   typeSizes.sizeof(offset) + typeSizes.sizeof(width);
-        }
-
-        public static IndexInfo deserialize(DataInput in) throws IOException
-        {
-            return new IndexInfo(ByteBufferUtil.readWithShortLength(in), ByteBufferUtil.readWithShortLength(in), in.readLong(), in.readLong());
-        }
-
-        public long memorySize()
-        {
-            return ObjectSizes.getFieldSize(// firstName
-                                            ObjectSizes.getReferenceSize() +
-                                            // lastName
-                                            ObjectSizes.getReferenceSize() +
-                                            TypeSizes.NATIVE.sizeof(offset) +
-                                            TypeSizes.NATIVE.sizeof(width))
-                   + ObjectSizes.getSize(firstName) + ObjectSizes.getSize(lastName);
+            return EMPTY_SIZE + firstName.unsharedHeapSize() + lastName.unsharedHeapSize();
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummary.java b/src/java/org/apache/cassandra/io/sstable/IndexSummary.java
index be7977e..f53a7e4 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummary.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummary.java
@@ -19,39 +19,73 @@
 
 import java.io.Closeable;
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cache.RefCountedMemory;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowPosition;
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.util.Memory;
-import org.apache.cassandra.io.util.MemoryInputStream;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.MemoryOutputStream;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
+
+/*
+ * Layout of Memory for index summaries:
+ *
+ * There are two sections:
+ *  1. A "header" containing the offset into `bytes` of each entry in the summary data, consisting of
+ *     one four-byte position for each entry in the summary.  This allows us to do simple math in getPositionInSummary()
+ *     to find the position in the Memory to start reading the actual index summary entry.
+ *     (This is necessary because keys can have different lengths.)
+ *  2.  A sequence of (DecoratedKey, position) pairs, where position is the offset into the actual index file.
+ */
 public class IndexSummary implements Closeable
 {
-    public static final IndexSummarySerializer serializer = new IndexSummarySerializer();
-    private final int indexInterval;
-    private final IPartitioner partitioner;
-    private final int summary_size;
-    private final Memory bytes;
+    private static final Logger logger = LoggerFactory.getLogger(IndexSummary.class);
 
-    public IndexSummary(IPartitioner partitioner, Memory memory, int summary_size, int indexInterval)
+    public static final IndexSummarySerializer serializer = new IndexSummarySerializer();
+
+    /**
+     * A lower bound for the average number of partitions in between each index summary entry. A lower value means
+     * that more partitions will have an entry in the index summary when at the full sampling level.
+     */
+    private final int minIndexInterval;
+
+    private final IPartitioner partitioner;
+    private final int summarySize;
+    private final int sizeAtFullSampling;
+    private final RefCountedMemory bytes;
+
+    /**
+     * A value between 1 and BASE_SAMPLING_LEVEL that represents how many of the original
+     * index summary entries ((1 / minIndexInterval) * numKeys) have been retained.
+     *
+     * Thus, this summary contains (samplingLevel / BASE_SAMPLING_LEVEL) * ((1 / minIndexInterval) * numKeys) entries.
+     */
+    private final int samplingLevel;
+
+    public IndexSummary(IPartitioner partitioner, RefCountedMemory memory, int summarySize, int sizeAtFullSampling,
+                        int minIndexInterval, int samplingLevel)
     {
         this.partitioner = partitioner;
-        this.indexInterval = indexInterval;
-        this.summary_size = summary_size;
+        this.minIndexInterval = minIndexInterval;
+        this.summarySize = summarySize;
+        this.sizeAtFullSampling = sizeAtFullSampling;
         this.bytes = memory;
+        this.samplingLevel = samplingLevel;
     }
 
     // binary search is notoriously more difficult to get right than it looks; this is lifted from
     // Harmony's Collections implementation
     public int binarySearch(RowPosition key)
     {
-        int low = 0, mid = summary_size, high = mid - 1, result = -1;
+        int low = 0, mid = summarySize, high = mid - 1, result = -1;
         while (low <= high)
         {
             mid = (low + high) >> 1;
@@ -73,16 +107,21 @@
         return -mid - (result < 0 ? 1 : 2);
     }
 
-    public int getIndex(int index)
+    /**
+     * Gets the position of the actual index summary entry in our Memory attribute, 'bytes'.
+     * @param index The index of the entry or key to get the position for
+     * @return an offset into our Memory attribute where the actual entry resides
+     */
+    public int getPositionInSummary(int index)
     {
-        // multiply by 4.
+        // The first section of bytes holds a four-byte position for each entry in the summary, so just multiply by 4.
         return bytes.getInt(index << 2);
     }
 
     public byte[] getKey(int index)
     {
-        long start = getIndex(index);
-        int keySize = (int) (caclculateEnd(index) - start - 8L);
+        long start = getPositionInSummary(index);
+        int keySize = (int) (calculateEnd(index) - start - 8L);
         byte[] key = new byte[keySize];
         bytes.getBytes(start, key, 0, keySize);
         return key;
@@ -90,48 +129,143 @@
 
     public long getPosition(int index)
     {
-        return bytes.getLong(caclculateEnd(index) - 8);
+        return bytes.getLong(calculateEnd(index) - 8);
     }
 
-    private long caclculateEnd(int index)
+    public byte[] getEntry(int index)
     {
-        return index == (summary_size - 1) ? bytes.size() : getIndex(index + 1);
+        long start = getPositionInSummary(index);
+        long end = calculateEnd(index);
+        byte[] entry = new byte[(int)(end - start)];
+        bytes.getBytes(start, entry, 0, (int)(end - start));
+        return entry;
     }
 
-    public int getIndexInterval()
+    private long calculateEnd(int index)
     {
-        return indexInterval;
+        return index == (summarySize - 1) ? bytes.size() : getPositionInSummary(index + 1);
+    }
+
+    public int getMinIndexInterval()
+    {
+        return minIndexInterval;
+    }
+
+    public double getEffectiveIndexInterval()
+    {
+        return (BASE_SAMPLING_LEVEL / (double) samplingLevel) * minIndexInterval;
+    }
+
+    /**
+     * Returns an estimate of the total number of keys in the SSTable.
+     */
+    public long getEstimatedKeyCount()
+    {
+        return ((long) getMaxNumberOfEntries() + 1) * minIndexInterval;
     }
 
     public int size()
     {
-        return summary_size;
+        return summarySize;
+    }
+
+    public int getSamplingLevel()
+    {
+        return samplingLevel;
+    }
+
+    /**
+     * Returns the number of entries this summary would have if it were at the full sampling level, which is equal
+     * to the number of entries in the primary on-disk index divided by the min index interval.
+     */
+    public int getMaxNumberOfEntries()
+    {
+        return sizeAtFullSampling;
+    }
+
+    /**
+     * Returns the amount of off-heap memory used for this summary.
+     * @return size in bytes
+     */
+    public long getOffHeapSize()
+    {
+        return bytes.size();
+    }
+
+    /**
+     * Returns the number of primary (on-disk) index entries between the index summary entry at `index` and the next
+     * index summary entry (assuming there is one).  Without any downsampling, this will always be equivalent to
+     * the index interval.
+     *
+     * @param index the index of an index summary entry (between zero and the number of entries in the summary)
+     *
+     * @return the number of partitions after `index` until the next partition with a summary entry
+     */
+    public int getEffectiveIndexIntervalAfterIndex(int index)
+    {
+        return Downsampling.getEffectiveIndexIntervalAfterIndex(index, samplingLevel, minIndexInterval);
     }
 
     public static class IndexSummarySerializer
     {
-        public void serialize(IndexSummary t, DataOutputStream out) throws IOException
+        public void serialize(IndexSummary t, DataOutputPlus out, boolean withSamplingLevel) throws IOException
         {
-            out.writeInt(t.indexInterval);
-            out.writeInt(t.summary_size);
+            out.writeInt(t.minIndexInterval);
+            out.writeInt(t.summarySize);
             out.writeLong(t.bytes.size());
-            FBUtilities.copy(new MemoryInputStream(t.bytes), out, t.bytes.size());
+            if (withSamplingLevel)
+            {
+                out.writeInt(t.samplingLevel);
+                out.writeInt(t.sizeAtFullSampling);
+            }
+            out.write(t.bytes);
         }
 
-        public IndexSummary deserialize(DataInputStream in, IPartitioner partitioner) throws IOException
+        public IndexSummary deserialize(DataInputStream in, IPartitioner partitioner, boolean haveSamplingLevel, int expectedMinIndexInterval, int maxIndexInterval) throws IOException
         {
-            int indexInterval = in.readInt();
-            int summary_size = in.readInt();
-            long offheap_size = in.readLong();
-            Memory memory = Memory.allocate(offheap_size);
-            FBUtilities.copy(in, new MemoryOutputStream(memory), offheap_size);
-            return new IndexSummary(partitioner, memory, summary_size, indexInterval);
+            int minIndexInterval = in.readInt();
+            if (minIndexInterval != expectedMinIndexInterval)
+            {
+                throw new IOException(String.format("Cannot read index summary because min_index_interval changed from %d to %d.",
+                                                    minIndexInterval, expectedMinIndexInterval));
+            }
+
+            int summarySize = in.readInt();
+            long offheapSize = in.readLong();
+            int samplingLevel, fullSamplingSummarySize;
+            if (haveSamplingLevel)
+            {
+                samplingLevel = in.readInt();
+                fullSamplingSummarySize = in.readInt();
+            }
+            else
+            {
+                samplingLevel = BASE_SAMPLING_LEVEL;
+                fullSamplingSummarySize = summarySize;
+            }
+
+            int effectiveIndexInterval = (int) Math.ceil((BASE_SAMPLING_LEVEL / (double) samplingLevel) * minIndexInterval);
+            if (effectiveIndexInterval > maxIndexInterval)
+            {
+                throw new IOException(String.format("Rebuilding index summary because the effective index interval (%d) is higher than" +
+                                                    " the current max index interval (%d)", effectiveIndexInterval, maxIndexInterval));
+            }
+
+            RefCountedMemory memory = new RefCountedMemory(offheapSize);
+            FBUtilities.copy(in, new MemoryOutputStream(memory), offheapSize);
+            return new IndexSummary(partitioner, memory, summarySize, fullSamplingSummarySize, minIndexInterval, samplingLevel);
         }
     }
 
     @Override
-    public void close() throws IOException
+    public void close()
     {
-        bytes.free();
+        bytes.unreference();
+    }
+
+    public IndexSummary readOnlyClone()
+    {
+        bytes.reference();
+        return this;
     }
 }
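The two-section layout described in the header comment above can be mimicked with a plain ByteBuffer; this is a rough, self-contained sketch (names and key values are illustrative, and the real class uses off-heap Memory rather than a ByteBuffer):

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;

    public class SummaryLayoutSketch
    {
        public static void main(String[] args)
        {
            byte[][] keys = { "apple".getBytes(StandardCharsets.UTF_8),
                              "pear".getBytes(StandardCharsets.UTF_8) };
            long[] indexPositions = { 0L, 4096L };

            int headerSize = keys.length * 4;           // section 1: one int offset per entry
            int entriesSize = 0;
            for (byte[] k : keys)
                entriesSize += k.length + 8;            // section 2: key bytes + 8-byte index position

            ByteBuffer bytes = ByteBuffer.allocate(headerSize + entriesSize);
            int entryOffset = headerSize;
            for (int i = 0; i < keys.length; i++)
            {
                bytes.putInt(i * 4, entryOffset);       // header slot i -> where entry i starts
                ByteBuffer entry = bytes.duplicate();
                entry.position(entryOffset);
                entry.put(keys[i]);                     // the key bytes...
                entry.putLong(indexPositions[i]);       // ...then the position in the primary index
                entryOffset += keys[i].length + 8;
            }

            // getPositionInSummary(i) is just bytes.getInt(i << 2); the key length is recovered from
            // the next entry's offset (or the end of the buffer for the last entry), as in calculateEnd().
            System.out.println("entry 1 starts at offset " + bytes.getInt(1 << 2)); // 21
        }
    }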
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java
index 1fa2912..8e9cc30 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java
@@ -17,52 +17,106 @@
  */
 package org.apache.cassandra.io.sstable;
 
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Collections;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cache.RefCountedMemory;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.util.Memory;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
+import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
+import static org.apache.cassandra.io.sstable.SSTable.getMinimalKey;
 
 public class IndexSummaryBuilder
 {
     private static final Logger logger = LoggerFactory.getLogger(IndexSummaryBuilder.class);
 
     private final ArrayList<Long> positions;
-    private final ArrayList<byte[]> keys;
-    private final int indexInterval;
+    private final ArrayList<DecoratedKey> keys;
+    private final int minIndexInterval;
+    private final int samplingLevel;
+    private final int[] startPoints;
     private long keysWritten = 0;
+    private long indexIntervalMatches = 0;
     private long offheapSize = 0;
 
-    public IndexSummaryBuilder(long expectedKeys, int indexInterval)
+    public IndexSummaryBuilder(long expectedKeys, int minIndexInterval, int samplingLevel)
     {
-        this.indexInterval = indexInterval;
-        long expectedEntries = expectedKeys / indexInterval;
-        if (expectedEntries > Integer.MAX_VALUE)
+        this.samplingLevel = samplingLevel;
+        this.startPoints = Downsampling.getStartPoints(BASE_SAMPLING_LEVEL, samplingLevel);
+
+        long maxExpectedEntries = expectedKeys / minIndexInterval;
+        if (maxExpectedEntries > Integer.MAX_VALUE)
         {
-            // that's a _lot_ of keys, and a very low interval
-            int effectiveInterval = (int) Math.ceil((double) Integer.MAX_VALUE / expectedKeys);
-            expectedEntries = expectedKeys / effectiveInterval;
-            assert expectedEntries <= Integer.MAX_VALUE : expectedEntries;
-            logger.warn("Index interval of {} is too low for {} expected keys; using interval of {} instead",
-                        indexInterval, expectedKeys, effectiveInterval);
+            // that's a _lot_ of keys, and a very low min index interval
+            int effectiveMinInterval = (int) Math.ceil((double) Integer.MAX_VALUE / expectedKeys);
+            maxExpectedEntries = expectedKeys / effectiveMinInterval;
+            assert maxExpectedEntries <= Integer.MAX_VALUE : maxExpectedEntries;
+            logger.warn("min_index_interval of {} is too low for {} expected keys; using interval of {} instead",
+                        minIndexInterval, expectedKeys, effectiveMinInterval);
+            this.minIndexInterval = effectiveMinInterval;
         }
-        positions = new ArrayList<Long>((int)expectedEntries);
-        keys = new ArrayList<byte[]>((int)expectedEntries);
+        else
+        {
+            this.minIndexInterval = minIndexInterval;
+        }
+
+        // for initializing data structures, adjust our estimates based on the sampling level
+        maxExpectedEntries = (maxExpectedEntries * samplingLevel) / BASE_SAMPLING_LEVEL;
+        positions = new ArrayList<>((int)maxExpectedEntries);
+        keys = new ArrayList<>((int)maxExpectedEntries);
+    }
+
+    // finds the last decorated key (minus the given offset) that can be guaranteed to occur fully in the index file before the provided file position
+    public DecoratedKey getMaxReadableKey(long position, int offset)
+    {
+        int i = Collections.binarySearch(positions, position);
+        if (i < 0)
+        {
+            i = -1 - i;
+            if (i == positions.size())
+                i -= 2;
+            else
+                i -= 1;
+        }
+        else
+            i -= 1;
+        i -= offset;
+        // we don't want to return any key if there's only 1 item in the summary, to make sure the sstable range is non-empty
+        if (i <= 0)
+            return null;
+        return keys.get(i);
     }
 
     public IndexSummaryBuilder maybeAddEntry(DecoratedKey decoratedKey, long indexPosition)
     {
-        if (keysWritten % indexInterval == 0)
+        if (keysWritten % minIndexInterval == 0)
         {
-            byte[] key = ByteBufferUtil.getArray(decoratedKey.key);
-            keys.add(key);
-            offheapSize += key.length;
-            positions.add(indexPosition);
-            offheapSize += TypeSizes.NATIVE.sizeof(indexPosition);
+            // see if we should skip this key based on our sampling level
+            boolean shouldSkip = false;
+            for (int start : startPoints)
+            {
+                if ((indexIntervalMatches - start) % BASE_SAMPLING_LEVEL == 0)
+                {
+                    shouldSkip = true;
+                    break;
+                }
+            }
+
+            if (!shouldSkip)
+            {
+                keys.add(getMinimalKey(decoratedKey));
+                offheapSize += decoratedKey.getKey().remaining();
+                positions.add(indexPosition);
+                offheapSize += TypeSizes.NATIVE.sizeof(indexPosition);
+            }
+
+            indexIntervalMatches++;
         }
         keysWritten++;
 
@@ -71,24 +125,139 @@
 
     public IndexSummary build(IPartitioner partitioner)
     {
-        assert keys != null && keys.size() > 0;
+        return build(partitioner, null);
+    }
+
+    public IndexSummary build(IPartitioner partitioner, DecoratedKey exclusiveUpperBound)
+    {
+        assert keys.size() > 0;
         assert keys.size() == positions.size();
 
-        Memory memory = Memory.allocate(offheapSize + (keys.size() * 4));
+        int length;
+        if (exclusiveUpperBound == null)
+            length = keys.size();
+        else
+            length = Collections.binarySearch(keys, exclusiveUpperBound);
+
+        assert length > 0;
+
+        long offheapSize = this.offheapSize;
+        if (length < keys.size())
+            for (int i = length ; i < keys.size() ; i++)
+                offheapSize -= keys.get(i).getKey().remaining() + TypeSizes.NATIVE.sizeof(positions.get(i));
+
+        // first we write out the position in the *summary* for each key in the summary,
+        // then we write out (key, actual index position) pairs
+        RefCountedMemory memory = new RefCountedMemory(offheapSize + (length * 4));
         int idxPosition = 0;
-        int keyPosition = keys.size() * 4;
-        for (int i = 0; i < keys.size(); i++)
+        int keyPosition = length * 4;
+        for (int i = 0; i < length; i++)
         {
+            // write the position of the actual entry in the index summary (4 bytes)
             memory.setInt(idxPosition, keyPosition);
             idxPosition += TypeSizes.NATIVE.sizeof(keyPosition);
 
-            byte[] temp = keys.get(i);
-            memory.setBytes(keyPosition, temp, 0, temp.length);
-            keyPosition += temp.length;
-            long tempPosition = positions.get(i);
-            memory.setLong(keyPosition, tempPosition);
-            keyPosition += TypeSizes.NATIVE.sizeof(tempPosition);
+            // write the key
+            ByteBuffer keyBytes = keys.get(i).getKey();
+            memory.setBytes(keyPosition, keyBytes);
+            keyPosition += keyBytes.remaining();
+
+            // write the position in the actual index file
+            long actualIndexPosition = positions.get(i);
+            memory.setLong(keyPosition, actualIndexPosition);
+            keyPosition += TypeSizes.NATIVE.sizeof(actualIndexPosition);
         }
-        return new IndexSummary(partitioner, memory, keys.size(), indexInterval);
+        assert keyPosition == offheapSize + (length * 4);
+        int sizeAtFullSampling = (int) Math.ceil(keysWritten / (double) minIndexInterval);
+        return new IndexSummary(partitioner, memory, length, sizeAtFullSampling, minIndexInterval, samplingLevel);
+    }
+
+    public static int entriesAtSamplingLevel(int samplingLevel, int maxSummarySize)
+    {
+        return (int) Math.ceil((samplingLevel * maxSummarySize) / (double) BASE_SAMPLING_LEVEL);
+    }
+
+    public static int calculateSamplingLevel(int currentSamplingLevel, int currentNumEntries, long targetNumEntries, int minIndexInterval, int maxIndexInterval)
+    {
+        // effective index interval == (BASE_SAMPLING_LEVEL / samplingLevel) * minIndexInterval
+        // so we can just solve for minSamplingLevel here:
+        // maxIndexInterval == (BASE_SAMPLING_LEVEL / minSamplingLevel) * minIndexInterval
+        int effectiveMinSamplingLevel = Math.max(1, (int) Math.ceil((BASE_SAMPLING_LEVEL * minIndexInterval) / (double) maxIndexInterval));
+
+        // Algebraic explanation for calculating the new sampling level (solve for newSamplingLevel):
+        // originalNumEntries = (baseSamplingLevel / currentSamplingLevel) * currentNumEntries
+        // newSpaceUsed = (newSamplingLevel / baseSamplingLevel) * originalNumEntries
+        // newSpaceUsed = (newSamplingLevel / baseSamplingLevel) * (baseSamplingLevel / currentSamplingLevel) * currentNumEntries
+        // newSpaceUsed = (newSamplingLevel / currentSamplingLevel) * currentNumEntries
+        // (newSpaceUsed * currentSamplingLevel) / currentNumEntries = newSamplingLevel
+        int newSamplingLevel = (int) (targetNumEntries * currentSamplingLevel) / currentNumEntries;
+        return Math.min(BASE_SAMPLING_LEVEL, Math.max(effectiveMinSamplingLevel, newSamplingLevel));
+    }
+
+    /**
+     * Downsamples an existing index summary to a new sampling level.
+     * @param existing an existing IndexSummary
+     * @param newSamplingLevel the target level for the new IndexSummary.  This must be less than the current sampling
+     *                         level for `existing`.
+     * @param partitioner the partitioner used for the index summary
+     * @return a new IndexSummary
+     */
+    public static IndexSummary downsample(IndexSummary existing, int newSamplingLevel, int minIndexInterval, IPartitioner partitioner)
+    {
+        // To downsample the old index summary, we'll go through (potentially) several rounds of downsampling.
+        // Conceptually, each round starts at position X and then removes every Nth item.  The value of X follows
+        // a particular pattern to evenly space out the items that we remove.  The value of N decreases by one each
+        // round.
+
+        int currentSamplingLevel = existing.getSamplingLevel();
+        assert currentSamplingLevel > newSamplingLevel;
+        assert minIndexInterval == existing.getMinIndexInterval();
+
+        // calculate starting indexes for downsampling rounds
+        int[] startPoints = Downsampling.getStartPoints(currentSamplingLevel, newSamplingLevel);
+
+        // calculate new off-heap size
+        int removedKeyCount = 0;
+        long newOffHeapSize = existing.getOffHeapSize();
+        for (int start : startPoints)
+        {
+            for (int j = start; j < existing.size(); j += currentSamplingLevel)
+            {
+                removedKeyCount++;
+                newOffHeapSize -= existing.getEntry(j).length;
+            }
+        }
+
+        int newKeyCount = existing.size() - removedKeyCount;
+
+        // Subtract (removedKeyCount * 4) from the new size to account for fewer entries in the first section, which
+        // stores the position of the actual entries in the summary.
+        RefCountedMemory memory = new RefCountedMemory(newOffHeapSize - (removedKeyCount * 4));
+
+        // Copy old entries to our new Memory.
+        int idxPosition = 0;
+        int keyPosition = newKeyCount * 4;
+        outer:
+        for (int oldSummaryIndex = 0; oldSummaryIndex < existing.size(); oldSummaryIndex++)
+        {
+            // to determine if we can skip this entry, go through the starting points for our downsampling rounds
+            // and see if the entry's index is covered by that round
+            for (int start : startPoints)
+            {
+                if ((oldSummaryIndex - start) % currentSamplingLevel == 0)
+                    continue outer;
+            }
+
+            // write the position of the actual entry in the index summary (4 bytes)
+            memory.setInt(idxPosition, keyPosition);
+            idxPosition += TypeSizes.NATIVE.sizeof(keyPosition);
+
+            // write the entry itself
+            byte[] entry = existing.getEntry(oldSummaryIndex);
+            memory.setBytes(keyPosition, entry, 0, entry.length);
+            keyPosition += entry.length;
+        }
+        return new IndexSummary(partitioner, memory, newKeyCount, existing.getMaxNumberOfEntries(),
+                                minIndexInterval, newSamplingLevel);
     }
 }
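A quick worked example (not from the patch) of the arithmetic in calculateSamplingLevel() and entriesAtSamplingLevel(); the interval and entry counts below are made up:

    import org.apache.cassandra.io.sstable.IndexSummaryBuilder;

    public class SamplingLevelSketch
    {
        public static void main(String[] args)
        {
            // A summary at full sampling (level 128) with 1000 entries, whose share of the memory
            // pool only buys ~500 entries; min_index_interval = 128, max_index_interval = 2048.
            int newLevel = IndexSummaryBuilder.calculateSamplingLevel(128, 1000, 500, 128, 2048);
            // (500 * 128) / 1000 = 64, which is above the floor of ceil((128 * 128) / 2048) = 8
            System.out.println(newLevel); // 64

            // At level 64, a summary with a full-sampling size of 1000 keeps ceil((64 * 1000) / 128) = 500 entries.
            System.out.println(IndexSummaryBuilder.entriesAtSamplingLevel(64, 1000)); // 500
        }
    }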
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
new file mode 100644
index 0000000..d5b7364
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
@@ -0,0 +1,488 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.IOException;
+import java.lang.management.ManagementFactory;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import javax.management.MBeanServer;
+import javax.management.ObjectName;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataTracker;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.WrappedRunnable;
+
+import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
+
+/**
+ * Manages the fixed-size memory pool for index summaries, periodically resizing them
+ * in order to give more memory to hot sstables and less memory to cold sstables.
+ */
+public class IndexSummaryManager implements IndexSummaryManagerMBean
+{
+    private static final Logger logger = LoggerFactory.getLogger(IndexSummaryManager.class);
+    public static final String MBEAN_NAME = "org.apache.cassandra.db:type=IndexSummaries";
+    public static final IndexSummaryManager instance;
+
+    private int resizeIntervalInMinutes = 0;
+    private long memoryPoolBytes;
+
+    // The target (or ideal) number of index summary entries must differ from the actual number of
+    // entries by this ratio in order to trigger an upsample or downsample of the summary.  Because
+    // upsampling requires reading the primary index in order to rebuild the summary, the threshold
+    // for upsampling is higher.
+    static final double UPSAMPLE_THRESHOLD = 1.5;
+    static final double DOWNSAMPLE_THESHOLD = 0.75;
+
+    private final DebuggableScheduledThreadPoolExecutor executor;
+
+    // our next scheduled resizing run
+    private ScheduledFuture future;
+
+    static
+    {
+        instance = new IndexSummaryManager();
+        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
+
+        try
+        {
+            mbs.registerMBean(instance, new ObjectName(MBEAN_NAME));
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private IndexSummaryManager()
+    {
+        executor = new DebuggableScheduledThreadPoolExecutor(1, "IndexSummaryManager", Thread.MIN_PRIORITY);
+
+        long indexSummarySizeInMB = DatabaseDescriptor.getIndexSummaryCapacityInMB();
+        int interval = DatabaseDescriptor.getIndexSummaryResizeIntervalInMinutes();
+        logger.info("Initializing index summary manager with a memory pool size of {} MB and a resize interval of {} minutes",
+                    indexSummarySizeInMB, interval);
+
+        setMemoryPoolCapacityInMB(DatabaseDescriptor.getIndexSummaryCapacityInMB());
+        setResizeIntervalInMinutes(DatabaseDescriptor.getIndexSummaryResizeIntervalInMinutes());
+    }
+
+    public int getResizeIntervalInMinutes()
+    {
+        return resizeIntervalInMinutes;
+    }
+
+    public void setResizeIntervalInMinutes(int resizeIntervalInMinutes)
+    {
+        int oldInterval = this.resizeIntervalInMinutes;
+        this.resizeIntervalInMinutes = resizeIntervalInMinutes;
+
+        long initialDelay;
+        if (future != null)
+        {
+            initialDelay = oldInterval < 0
+                           ? resizeIntervalInMinutes
+                           : Math.max(0, resizeIntervalInMinutes - (oldInterval - future.getDelay(TimeUnit.MINUTES)));
+            future.cancel(false);
+        }
+        else
+        {
+            initialDelay = resizeIntervalInMinutes;
+        }
+
+        if (this.resizeIntervalInMinutes < 0)
+        {
+            future = null;
+            return;
+        }
+
+        future = executor.scheduleWithFixedDelay(new WrappedRunnable()
+        {
+            protected void runMayThrow() throws Exception
+            {
+                redistributeSummaries();
+            }
+        }, initialDelay, resizeIntervalInMinutes, TimeUnit.MINUTES);
+    }
+
+    // for testing only
+    @VisibleForTesting
+    Long getTimeToNextResize(TimeUnit timeUnit)
+    {
+        if (future == null)
+            return null;
+
+        return future.getDelay(timeUnit);
+    }
+
+    public long getMemoryPoolCapacityInMB()
+    {
+        return memoryPoolBytes / 1024L / 1024L;
+    }
+
+    public Map<String, Integer> getIndexIntervals()
+    {
+        List<SSTableReader> sstables = getAllSSTables();
+        Map<String, Integer> intervals = new HashMap<>(sstables.size());
+        for (SSTableReader sstable : sstables)
+            intervals.put(sstable.getFilename(), (int) Math.round(sstable.getEffectiveIndexInterval()));
+
+        return intervals;
+    }
+
+    public double getAverageIndexInterval()
+    {
+        List<SSTableReader> sstables = getAllSSTables();
+        double total = 0.0;
+        for (SSTableReader sstable : sstables)
+            total += sstable.getEffectiveIndexInterval();
+        return total / sstables.size();
+    }
+
+    public void setMemoryPoolCapacityInMB(long memoryPoolCapacityInMB)
+    {
+        this.memoryPoolBytes = memoryPoolCapacityInMB * 1024L * 1024L;
+    }
+
+    /**
+     * Returns the actual space consumed by index summaries for all sstables.
+     * @return space currently used in MB
+     */
+    public double getMemoryPoolSizeInMB()
+    {
+        long total = 0;
+        for (SSTableReader sstable : getAllSSTables())
+            total += sstable.getIndexSummaryOffHeapSize();
+        return total / 1024.0 / 1024.0;
+    }
+
+    private List<SSTableReader> getAllSSTables()
+    {
+        List<SSTableReader> result = new ArrayList<>();
+        for (Keyspace ks : Keyspace.all())
+        {
+            for (ColumnFamilyStore cfStore: ks.getColumnFamilyStores())
+                result.addAll(cfStore.getSSTables());
+        }
+
+        return result;
+    }
+
+    /**
+     * Returns a Pair of all compacting and non-compacting sstables.  Non-compacting sstables will be marked as
+     * compacting.
+     */
+    private Pair<List<SSTableReader>, Multimap<DataTracker, SSTableReader>> getCompactingAndNonCompactingSSTables()
+    {
+        List<SSTableReader> allCompacting = new ArrayList<>();
+        Multimap<DataTracker, SSTableReader> allNonCompacting = HashMultimap.create();
+        for (Keyspace ks : Keyspace.all())
+        {
+            for (ColumnFamilyStore cfStore: ks.getColumnFamilyStores())
+            {
+                Set<SSTableReader> nonCompacting, allSSTables;
+                do
+                {
+                    allSSTables = cfStore.getDataTracker().getSSTables();
+                    nonCompacting = Sets.newHashSet(cfStore.getDataTracker().getUncompactingSSTables(allSSTables));
+                }
+                while (!(nonCompacting.isEmpty() || cfStore.getDataTracker().markCompacting(nonCompacting)));
+                allNonCompacting.putAll(cfStore.getDataTracker(), nonCompacting);
+                allCompacting.addAll(Sets.difference(allSSTables, nonCompacting));
+            }
+        }
+        return Pair.create(allCompacting, allNonCompacting);
+    }
+
+    public void redistributeSummaries() throws IOException
+    {
+        Pair<List<SSTableReader>, Multimap<DataTracker, SSTableReader>> compactingAndNonCompacting = getCompactingAndNonCompactingSSTables();
+        try
+        {
+            redistributeSummaries(compactingAndNonCompacting.left, Lists.newArrayList(compactingAndNonCompacting.right.values()), this.memoryPoolBytes);
+        }
+        finally
+        {
+            for(DataTracker tracker : compactingAndNonCompacting.right.keySet())
+                tracker.unmarkCompacting(compactingAndNonCompacting.right.get(tracker));
+        }
+    }
+
+    /**
+     * Attempts to fairly distribute a fixed pool of memory for index summaries across a set of SSTables based on
+     * their recent read rates.
+     * @param nonCompacting a list of sstables to share the memory pool across
+     * @param memoryPoolBytes a size (in bytes) that the total index summary space usage should stay close to or
+     *                        under, if possible
+     * @return a list of new SSTableReader instances
+     */
+    @VisibleForTesting
+    public static List<SSTableReader> redistributeSummaries(List<SSTableReader> compacting, List<SSTableReader> nonCompacting, long memoryPoolBytes) throws IOException
+    {
+        long total = 0;
+        for (SSTableReader sstable : Iterables.concat(compacting, nonCompacting))
+            total += sstable.getIndexSummaryOffHeapSize();
+
+        logger.debug("Beginning redistribution of index summaries for {} sstables with memory pool size {} MB; current space used is {} MB",
+                     nonCompacting.size(), memoryPoolBytes / 1024L / 1024L, total / 1024.0 / 1024.0);
+
+        double totalReadsPerSec = 0.0;
+        for (SSTableReader sstable : nonCompacting)
+        {
+            if (sstable.readMeter != null)
+            {
+                totalReadsPerSec += sstable.readMeter.fifteenMinuteRate();
+            }
+        }
+        logger.trace("Total reads/sec across all sstables in index summary resize process: {}", totalReadsPerSec);
+
+        // copy and sort by read rates (ascending)
+        List<SSTableReader> sstablesByHotness = new ArrayList<>(nonCompacting);
+        Collections.sort(sstablesByHotness, new Comparator<SSTableReader>()
+        {
+            public int compare(SSTableReader o1, SSTableReader o2)
+            {
+                if (o1.readMeter == null && o2.readMeter == null)
+                    return 0;
+                else if (o1.readMeter == null)
+                    return -1;
+                else if (o2.readMeter == null)
+                    return 1;
+                else
+                    return Double.compare(o1.readMeter.fifteenMinuteRate(), o2.readMeter.fifteenMinuteRate());
+            }
+        });
+
+        long remainingBytes = memoryPoolBytes;
+        for (SSTableReader sstable : compacting)
+            remainingBytes -= sstable.getIndexSummaryOffHeapSize();
+
+        logger.trace("Index summaries for compacting SSTables are using {} MB of space",
+                     (memoryPoolBytes - remainingBytes) / 1024.0 / 1024.0);
+        List<SSTableReader> newSSTables = adjustSamplingLevels(sstablesByHotness, totalReadsPerSec, remainingBytes);
+
+        total = 0;
+        for (SSTableReader sstable : Iterables.concat(compacting, newSSTables))
+            total += sstable.getIndexSummaryOffHeapSize();
+        logger.debug("Completed resizing of index summaries; current approximate memory used: {} MB",
+                     total / 1024.0 / 1024.0);
+
+        return newSSTables;
+    }
+
+    private static List<SSTableReader> adjustSamplingLevels(List<SSTableReader> sstables,
+                                                            double totalReadsPerSec, long memoryPoolCapacity) throws IOException
+    {
+
+        List<ResampleEntry> toDownsample = new ArrayList<>(sstables.size() / 4);
+        List<ResampleEntry> toUpsample = new ArrayList<>(sstables.size() / 4);
+        List<ResampleEntry> forceResample = new ArrayList<>();
+        List<ResampleEntry> forceUpsample = new ArrayList<>();
+        List<SSTableReader> newSSTables = new ArrayList<>(sstables.size());
+
+        // Going from the coldest to the hottest sstables, try to give each sstable an amount of space proportional
+        // to the number of total reads/sec it handles.
+        long remainingSpace = memoryPoolCapacity;
+        for (SSTableReader sstable : sstables)
+        {
+            int minIndexInterval = sstable.metadata.getMinIndexInterval();
+            int maxIndexInterval = sstable.metadata.getMaxIndexInterval();
+
+            double readsPerSec = sstable.readMeter == null ? 0.0 : sstable.readMeter.fifteenMinuteRate();
+            long idealSpace = Math.round(remainingSpace * (readsPerSec / totalReadsPerSec));
+
+            // figure out how many entries our idealSpace would buy us, and pick a new sampling level based on that
+            int currentNumEntries = sstable.getIndexSummarySize();
+            double avgEntrySize = sstable.getIndexSummaryOffHeapSize() / (double) currentNumEntries;
+            long targetNumEntries = Math.max(1, Math.round(idealSpace / avgEntrySize));
+            int currentSamplingLevel = sstable.getIndexSummarySamplingLevel();
+            int maxSummarySize = sstable.getMaxIndexSummarySize();
+
+            // if the min_index_interval changed, calculate what our current sampling level would be under the new min
+            if (sstable.getMinIndexInterval() != minIndexInterval)
+            {
+                int effectiveSamplingLevel = (int) Math.round(currentSamplingLevel * (minIndexInterval / (double) sstable.getMinIndexInterval()));
+                maxSummarySize = (int) Math.round(maxSummarySize * (sstable.getMinIndexInterval() / (double) minIndexInterval));
+                logger.trace("min_index_interval changed from {} to {}, so the current sampling level for {} is effectively now {} (was {})",
+                             sstable.getMinIndexInterval(), minIndexInterval, sstable, effectiveSamplingLevel, currentSamplingLevel);
+                currentSamplingLevel = effectiveSamplingLevel;
+            }
+
+            int newSamplingLevel = IndexSummaryBuilder.calculateSamplingLevel(currentSamplingLevel, currentNumEntries, targetNumEntries,
+                    minIndexInterval, maxIndexInterval);
+            int numEntriesAtNewSamplingLevel = IndexSummaryBuilder.entriesAtSamplingLevel(newSamplingLevel, maxSummarySize);
+            double effectiveIndexInterval = sstable.getEffectiveIndexInterval();
+
+            logger.trace("{} has {} reads/sec; ideal space for index summary: {} bytes ({} entries); considering moving " +
+                    "from level {} ({} entries, {} bytes) to level {} ({} entries, {} bytes)",
+                    sstable.getFilename(), readsPerSec, idealSpace, targetNumEntries, currentSamplingLevel, currentNumEntries,
+                    currentNumEntries * avgEntrySize, newSamplingLevel, numEntriesAtNewSamplingLevel,
+                    numEntriesAtNewSamplingLevel * avgEntrySize);
+
+            if (effectiveIndexInterval < minIndexInterval)
+            {
+                // The min_index_interval was changed; re-sample to match it.
+                logger.debug("Forcing resample of {} because the current index interval ({}) is below min_index_interval ({})",
+                        sstable, effectiveIndexInterval, minIndexInterval);
+                long spaceUsed = (long) Math.ceil(avgEntrySize * numEntriesAtNewSamplingLevel);
+                forceResample.add(new ResampleEntry(sstable, spaceUsed, newSamplingLevel));
+                remainingSpace -= spaceUsed;
+            }
+            else if (effectiveIndexInterval > maxIndexInterval)
+            {
+                // The max_index_interval was lowered; force an upsample to the effective minimum sampling level
+                logger.debug("Forcing upsample of {} because the current index interval ({}) is above max_index_interval ({})",
+                        sstable, effectiveIndexInterval, maxIndexInterval);
+                newSamplingLevel = Math.max(1, (BASE_SAMPLING_LEVEL * minIndexInterval) / maxIndexInterval);
+                numEntriesAtNewSamplingLevel = IndexSummaryBuilder.entriesAtSamplingLevel(newSamplingLevel, sstable.getMaxIndexSummarySize());
+                long spaceUsed = (long) Math.ceil(avgEntrySize * numEntriesAtNewSamplingLevel);
+                forceUpsample.add(new ResampleEntry(sstable, spaceUsed, newSamplingLevel));
+                remainingSpace -= avgEntrySize * numEntriesAtNewSamplingLevel;
+            }
+            else if (targetNumEntries >= currentNumEntries * UPSAMPLE_THRESHOLD && newSamplingLevel > currentSamplingLevel)
+            {
+                long spaceUsed = (long) Math.ceil(avgEntrySize * numEntriesAtNewSamplingLevel);
+                toUpsample.add(new ResampleEntry(sstable, spaceUsed, newSamplingLevel));
+                remainingSpace -= avgEntrySize * numEntriesAtNewSamplingLevel;
+            }
+            else if (targetNumEntries < currentNumEntries * DOWNSAMPLE_THESHOLD && newSamplingLevel < currentSamplingLevel)
+            {
+                long spaceUsed = (long) Math.ceil(avgEntrySize * numEntriesAtNewSamplingLevel);
+                toDownsample.add(new ResampleEntry(sstable, spaceUsed, newSamplingLevel));
+                remainingSpace -= spaceUsed;
+            }
+            else
+            {
+                // keep the same sampling level
+                logger.trace("SSTable {} is within thresholds of ideal sampling", sstable);
+                remainingSpace -= sstable.getIndexSummaryOffHeapSize();
+                newSSTables.add(sstable);
+            }
+            totalReadsPerSec -= readsPerSec;
+        }
+
+        if (remainingSpace > 0)
+        {
+            Pair<List<SSTableReader>, List<ResampleEntry>> result = distributeRemainingSpace(toDownsample, remainingSpace);
+            toDownsample = result.right;
+            newSSTables.addAll(result.left);
+        }
+
+        // downsample first, then upsample
+        toDownsample.addAll(forceResample);
+        toDownsample.addAll(toUpsample);
+        toDownsample.addAll(forceUpsample);
+        Multimap<DataTracker, SSTableReader> replacedByTracker = HashMultimap.create();
+        Multimap<DataTracker, SSTableReader> replacementsByTracker = HashMultimap.create();
+        for (ResampleEntry entry : toDownsample)
+        {
+            SSTableReader sstable = entry.sstable;
+            logger.debug("Re-sampling index summary for {} from {}/{} to {}/{} of the original number of entries",
+                         sstable, sstable.getIndexSummarySamplingLevel(), Downsampling.BASE_SAMPLING_LEVEL,
+                         entry.newSamplingLevel, Downsampling.BASE_SAMPLING_LEVEL);
+            ColumnFamilyStore cfs = Keyspace.open(sstable.getKeyspaceName()).getColumnFamilyStore(sstable.getColumnFamilyName());
+            SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(cfs, entry.newSamplingLevel);
+            DataTracker tracker = cfs.getDataTracker();
+
+            replacedByTracker.put(tracker, sstable);
+            replacementsByTracker.put(tracker, replacement);
+        }
+
+        for (DataTracker tracker : replacedByTracker.keySet())
+        {
+            tracker.replaceReaders(replacedByTracker.get(tracker), replacementsByTracker.get(tracker));
+            newSSTables.addAll(replacementsByTracker.get(tracker));
+        }
+
+        return newSSTables;
+    }
+
+    @VisibleForTesting
+    static Pair<List<SSTableReader>, List<ResampleEntry>> distributeRemainingSpace(List<ResampleEntry> toDownsample, long remainingSpace)
+    {
+        // sort by the amount of space regained by doing the downsample operation; we want to try to avoid operations
+        // that will make little difference.
+        Collections.sort(toDownsample, new Comparator<ResampleEntry>()
+        {
+            public int compare(ResampleEntry o1, ResampleEntry o2)
+            {
+                return Double.compare(o1.sstable.getIndexSummaryOffHeapSize() - o1.newSpaceUsed,
+                                      o2.sstable.getIndexSummaryOffHeapSize() - o2.newSpaceUsed);
+            }
+        });
+
+        int noDownsampleCutoff = 0;
+        List<SSTableReader> willNotDownsample = new ArrayList<>();
+        while (remainingSpace > 0 && noDownsampleCutoff < toDownsample.size())
+        {
+            ResampleEntry entry = toDownsample.get(noDownsampleCutoff);
+
+            long extraSpaceRequired = entry.sstable.getIndexSummaryOffHeapSize() - entry.newSpaceUsed;
+            // see if we have enough leftover space to keep the current sampling level
+            if (extraSpaceRequired <= remainingSpace)
+            {
+                logger.trace("Using leftover space to keep {} at the current sampling level ({})",
+                             entry.sstable, entry.sstable.getIndexSummarySamplingLevel());
+                willNotDownsample.add(entry.sstable);
+                remainingSpace -= extraSpaceRequired;
+            }
+            else
+            {
+                break;
+            }
+
+            noDownsampleCutoff++;
+        }
+        return Pair.create(willNotDownsample, toDownsample.subList(noDownsampleCutoff, toDownsample.size()));
+    }
+
+    private static class ResampleEntry
+    {
+        public final SSTableReader sstable;
+        public final long newSpaceUsed;
+        public final int newSamplingLevel;
+
+        public ResampleEntry(SSTableReader sstable, long newSpaceUsed, int newSamplingLevel)
+        {
+            this.sstable = sstable;
+            this.newSpaceUsed = newSpaceUsed;
+            this.newSamplingLevel = newSamplingLevel;
+        }
+    }
+}
\ No newline at end of file
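For reference, a minimal standalone sketch (not part of this patch) of the per-sstable arithmetic that
adjustSamplingLevels() applies above: each sstable is offered a slice of the remaining memory pool in
proportion to its share of the remaining read rate, and that slice is converted into a target number of
index summary entries. All concrete figures below are assumed example values.

public class SamplingSpaceSketch
{
    public static void main(String[] args)
    {
        long remainingSpace = 100L * 1024 * 1024;  // 100 MB of the pool still unallocated (assumed)
        double totalReadsPerSec = 800.0;           // summed 15-minute rate of the sstables not yet visited
        double readsPerSec = 200.0;                // this sstable's 15-minute read rate
        long currentNumEntries = 120_000;          // entries currently in its index summary (assumed)
        double avgEntrySize = 40.0;                // average bytes per summary entry (assumed)

        // same formulas as used in adjustSamplingLevels()
        long idealSpace = Math.round(remainingSpace * (readsPerSec / totalReadsPerSec));
        long targetNumEntries = Math.max(1, Math.round(idealSpace / avgEntrySize));

        System.out.printf("ideal space: %d bytes -> target entries: %d (currently %d)%n",
                          idealSpace, targetNumEntries, currentNumEntries);
        // Here a 25% read share buys ~25 MB, i.e. ~655,360 entries, so this sstable is an upsample candidate.
    }
}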
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManagerMBean.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManagerMBean.java
new file mode 100644
index 0000000..3382350
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManagerMBean.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.IOException;
+import java.util.Map;
+
+public interface IndexSummaryManagerMBean
+{
+    public long getMemoryPoolCapacityInMB();
+    public void setMemoryPoolCapacityInMB(long memoryPoolCapacityInMB);
+
+    /**
+     * Returns the current actual off-heap memory usage of the index summaries for all non-compacting sstables.
+     * @return The amount of memory used in MB.
+     */
+    public double getMemoryPoolSizeInMB();
+
+    /**
+     * Returns a map of SSTable filenames to their current effective index interval.
+     */
+    public Map<String, Integer> getIndexIntervals();
+
+    public double getAverageIndexInterval();
+
+    public void redistributeSummaries() throws IOException;
+
+    public int getResizeIntervalInMinutes();
+    public void setResizeIntervalInMinutes(int resizeIntervalInMinutes);
+}
\ No newline at end of file
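The MBean above is intended for access over JMX; as a rough illustration (not part of this patch), a
client could inspect and tune the pool as in the sketch below. The JMX service URL, port and object name
("org.apache.cassandra.db:type=IndexSummaries") are assumptions, not taken from this diff.

import java.util.Map;
import javax.management.JMX;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

import org.apache.cassandra.io.sstable.IndexSummaryManagerMBean;

public class IndexSummaryJmxSketch
{
    public static void main(String[] args) throws Exception
    {
        // default Cassandra JMX port assumed
        JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://127.0.0.1:7199/jmxrmi");
        try (JMXConnector connector = JMXConnectorFactory.connect(url))
        {
            MBeanServerConnection connection = connector.getMBeanServerConnection();
            ObjectName name = new ObjectName("org.apache.cassandra.db:type=IndexSummaries"); // assumed name
            IndexSummaryManagerMBean proxy = JMX.newMBeanProxy(connection, name, IndexSummaryManagerMBean.class);

            System.out.println("capacity MB: " + proxy.getMemoryPoolCapacityInMB());
            System.out.println("used MB:     " + proxy.getMemoryPoolSizeInMB());

            // shrink the pool and trigger an immediate redistribution of the summaries
            proxy.setMemoryPoolCapacityInMB(64);
            proxy.redistributeSummaries();

            for (Map.Entry<String, Integer> entry : proxy.getIndexIntervals().entrySet())
                System.out.println(entry.getKey() + " -> effective index interval " + entry.getValue());
        }
    }
}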
diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
index f4f7ee5..4d1c663 100644
--- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
@@ -35,7 +35,7 @@
 
     public KeyIterator(Descriptor desc)
     {
-        File path = new File(desc.filenameFor(SSTable.COMPONENT_INDEX));
+        File path = new File(desc.filenameFor(Component.PRIMARY_INDEX));
         in = RandomAccessReader.open(path);
     }
 
@@ -46,7 +46,7 @@
             if (in.isEOF())
                 return endOfData();
             DecoratedKey key = StorageService.getPartitioner().decorateKey(ByteBufferUtil.readWithShortLength(in));
-            RowIndexEntry.serializer.skip(in); // skip remainder of the entry
+            RowIndexEntry.Serializer.skip(in); // skip remainder of the entry
             return key;
         }
         catch (IOException e)
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index 69c6521..6eff369 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -24,13 +24,13 @@
 
 import com.google.common.base.Predicates;
 import com.google.common.collect.Collections2;
-import com.google.common.collect.Ordering;
 import com.google.common.collect.Sets;
 import com.google.common.io.Files;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.dht.IPartitioner;
@@ -38,7 +38,7 @@
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.utils.memory.HeapAllocator;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -57,27 +57,8 @@
 {
     static final Logger logger = LoggerFactory.getLogger(SSTable.class);
 
-    // TODO: replace with 'Component' objects
-    public static final String COMPONENT_DATA = Component.Type.DATA.repr;
-    public static final String COMPONENT_INDEX = Component.Type.PRIMARY_INDEX.repr;
-    public static final String COMPONENT_FILTER = Component.Type.FILTER.repr;
-    public static final String COMPONENT_STATS = Component.Type.STATS.repr;
-    public static final String COMPONENT_DIGEST = Component.Type.DIGEST.repr;
-
-    public static final String TEMPFILE_MARKER = "tmp";
-
     public static final int TOMBSTONE_HISTOGRAM_BIN_SIZE = 100;
 
-    public static final Comparator<SSTableReader> maxTimestampComparator = new Comparator<SSTableReader>()
-    {
-        public int compare(SSTableReader o1, SSTableReader o2)
-        {
-            long ts1 = o1.getMaxTimestamp();
-            long ts2 = o2.getMaxTimestamp();
-            return (ts1 > ts2 ? -1 : (ts1 == ts2 ? 0 : 1));
-        }
-    };
-
     public final Descriptor descriptor;
     protected final Set<Component> components;
     public final CFMetaData metadata;
@@ -101,26 +82,13 @@
         assert partitioner != null;
 
         this.descriptor = descriptor;
-        Set<Component> dataComponents = new HashSet<Component>(components);
-        for (Component component : components)
-            assert component.type != Component.Type.COMPACTED_MARKER;
-
+        Set<Component> dataComponents = new HashSet<>(components);
         this.compression = dataComponents.contains(Component.COMPRESSION_INFO);
-        this.components = new CopyOnWriteArraySet<Component>(dataComponents);
+        this.components = new CopyOnWriteArraySet<>(dataComponents);
         this.metadata = metadata;
         this.partitioner = partitioner;
     }
 
-    public static final Comparator<SSTableReader> sstableComparator = new Comparator<SSTableReader>()
-    {
-        public int compare(SSTableReader o1, SSTableReader o2)
-        {
-            return o1.first.compareTo(o2.first);
-        }
-    };
-
-    public static final Ordering<SSTableReader> sstableOrdering = Ordering.from(sstableComparator);
-
     /**
      * We use a ReferenceQueue to manage deleting files that have been compacted
      * and for which no more SSTable references exist.  But this is not guaranteed
@@ -139,15 +107,11 @@
             FileUtils.deleteWithConfirm(desc.filenameFor(Component.DATA));
         for (Component component : components)
         {
-            if (component.equals(Component.DATA) || component.equals(Component.COMPACTED_MARKER) || component.equals(Component.SUMMARY))
+            if (component.equals(Component.DATA) || component.equals(Component.SUMMARY))
                 continue;
 
             FileUtils.deleteWithConfirm(desc.filenameFor(component));
         }
-        // remove the COMPACTED_MARKER component last if it exists
-        // Note: newly created sstable should not have a marker, but we keep this for now to make sure
-        // we don't leave older marker around
-        FileUtils.delete(desc.filenameFor(Component.COMPACTED_MARKER));
         FileUtils.delete(desc.filenameFor(Component.SUMMARY));
 
         logger.debug("Deleted {}", desc);
@@ -160,19 +124,19 @@
      */
     public static DecoratedKey getMinimalKey(DecoratedKey key)
     {
-        return key.key.position() > 0 || key.key.hasRemaining()
-                                       ? new DecoratedKey(key.token, HeapAllocator.instance.clone(key.key))
+        return key.getKey().position() > 0 || key.getKey().hasRemaining() || !key.getKey().hasArray()
+                                       ? new BufferDecoratedKey(key.getToken(), HeapAllocator.instance.clone(key.getKey()))
                                        : key;
     }
 
     public String getFilename()
     {
-        return descriptor.filenameFor(COMPONENT_DATA);
+        return descriptor.filenameFor(Component.DATA);
     }
 
     public String getIndexFilename()
     {
-        return descriptor.filenameFor(COMPONENT_INDEX);
+        return descriptor.filenameFor(Component.PRIMARY_INDEX);
     }
 
     public String getColumnFamilyName()
@@ -253,7 +217,7 @@
         while (ifile.getFilePointer() < BYTES_CAP && keys < SAMPLES_CAP)
         {
             ByteBufferUtil.skipShortLength(ifile);
-            RowIndexEntry.serializer.skip(ifile);
+            RowIndexEntry.Serializer.skip(ifile);
             keys++;
         }
         assert keys > 0 && ifile.getFilePointer() > 0 && ifile.length() > 0 : "Unexpected empty index file: " + ifile;
@@ -262,16 +226,6 @@
         return estimatedRows;
     }
 
-    public static long getTotalBytes(Iterable<SSTableReader> sstables)
-    {
-        long sum = 0;
-        for (SSTableReader sstable : sstables)
-        {
-            sum += sstable.onDiskLength();
-        }
-        return sum;
-    }
-
     public long bytesOnDisk()
     {
         long bytes = 0;
@@ -303,7 +257,7 @@
         {
             Component component = new Component(Component.Type.fromRepresentation(componentName), componentName);
             if (!new File(descriptor.filenameFor(component)).exists())
-                logger.error("Missing component: " + descriptor.filenameFor(component));
+                logger.error("Missing component: {}", descriptor.filenameFor(component));
             else
                 components.add(component);
         }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java b/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java
index fb7f036..785e23b 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java
@@ -51,8 +51,16 @@
     public SSTableDeletingTask(SSTableReader referent)
     {
         this.referent = referent;
-        this.desc = referent.descriptor;
-        this.components = referent.components;
+        if (referent.openReason == SSTableReader.OpenReason.EARLY)
+        {
+            this.desc = referent.descriptor.asType(Descriptor.Type.TEMPLINK);
+            this.components = Sets.newHashSet(Component.DATA, Component.PRIMARY_INDEX);
+        }
+        else
+        {
+            this.desc = referent.descriptor;
+            this.components = referent.components;
+        }
         this.size = referent.bytesOnDisk();
     }
 
@@ -78,7 +86,7 @@
         File datafile = new File(desc.filenameFor(Component.DATA));
         if (!datafile.delete())
         {
-            logger.error("Unable to delete " + datafile + " (it will be removed on server restart; we'll also retry after GC)");
+            logger.error("Unable to delete {} (it will be removed on server restart; we'll also retry after GC)", datafile);
             failedTasks.add(this);
             return;
         }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
index 52da9bb..b784a7e 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
@@ -20,33 +20,21 @@
 import java.io.*;
 import java.util.Iterator;
 
-import org.apache.cassandra.serializers.MarshalException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.serializers.MarshalException;
 
 public class SSTableIdentityIterator implements Comparable<SSTableIdentityIterator>, OnDiskAtomIterator
 {
-    private static final Logger logger = LoggerFactory.getLogger(SSTableIdentityIterator.class);
-
     private final DecoratedKey key;
     private final DataInput in;
     public final long dataSize; // we [still] require this so compaction can tell if it's safe to read the row into memory
     public final ColumnSerializer.Flag flag;
 
     private final ColumnFamily columnFamily;
-    private final int columnCount;
-
     private final Iterator<OnDiskAtom> atomIterator;
-    private final Descriptor.Version dataVersion;
-
-    // Used by lazilyCompactedRow, so that we see the same things when deserializing the first and second time
-    private final int expireBefore;
-
     private final boolean validateColumns;
     private final String filename;
 
@@ -56,7 +44,6 @@
      * @param file Reading using this file.
      * @param key Key of this row.
      * @param dataSize length of row data
-     * @throws IOException
      */
     public SSTableIdentityIterator(SSTableReader sstable, RandomAccessReader file, DecoratedKey key, long dataSize)
     {
@@ -92,17 +79,17 @@
         this.filename = filename;
         this.key = key;
         this.dataSize = dataSize;
-        this.expireBefore = (int)(System.currentTimeMillis() / 1000);
         this.flag = flag;
         this.validateColumns = checkData;
-        this.dataVersion = sstable == null ? Descriptor.Version.CURRENT : sstable.descriptor.version;
+
+        Descriptor.Version dataVersion = sstable == null ? Descriptor.Version.CURRENT : sstable.descriptor.version;
+        int expireBefore = (int) (System.currentTimeMillis() / 1000);
+        columnFamily = ArrayBackedSortedColumns.factory.create(metadata);
 
         try
         {
-            columnFamily = EmptyColumns.factory.create(metadata);
             columnFamily.delete(DeletionTime.serializer.deserialize(in));
-            columnCount = dataVersion.hasRowSizeAndColumnCount ? in.readInt() : Integer.MAX_VALUE;
-            atomIterator = columnFamily.metadata().getOnDiskIterator(in, columnCount, flag, expireBefore, dataVersion);
+            atomIterator = columnFamily.metadata().getOnDiskIterator(in, flag, expireBefore, dataVersion);
         }
         catch (IOException e)
         {
@@ -177,31 +164,8 @@
         }
     }
 
-    public ColumnFamily getColumnFamilyWithColumns(ColumnFamily.Factory containerFactory)
-    {
-        ColumnFamily cf = columnFamily.cloneMeShallow(containerFactory, false);
-        // since we already read column count, just pass that value and continue deserialization
-        Iterator<OnDiskAtom> iter = cf.metadata().getOnDiskIterator(in, columnCount, flag, expireBefore, dataVersion);
-        while (iter.hasNext())
-            cf.addAtom(iter.next());
-
-        if (validateColumns)
-        {
-            try
-            {
-                cf.metadata().validateColumns(cf);
-            }
-            catch (MarshalException e)
-            {
-                throw new RuntimeException("Error validating row " + key, e);
-            }
-        }
-        return cf;
-    }
-
     public int compareTo(SSTableIdentityIterator o)
     {
         return key.compareTo(o.key);
     }
-
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
index 85dc0e4..3d7eea7 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
@@ -33,6 +33,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.*;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
@@ -47,6 +48,7 @@
     private final File directory;
     private final String keyspace;
     private final Client client;
+    private final int connectionsPerHost;
     private final OutputHandler outputHandler;
     private final Set<InetAddress> failedHosts = new HashSet<>();
 
@@ -60,10 +62,16 @@
 
     public SSTableLoader(File directory, Client client, OutputHandler outputHandler)
     {
+        this(directory, client, outputHandler, 1);
+    }
+
+    public SSTableLoader(File directory, Client client, OutputHandler outputHandler, int connectionsPerHost)
+    {
         this.directory = directory;
         this.keyspace = directory.getParentFile().getName();
         this.client = client;
         this.outputHandler = outputHandler;
+        this.connectionsPerHost = connectionsPerHost;
     }
 
     protected Collection<SSTableReader> openSSTables(final Map<InetAddress, Collection<Range<Token>>> ranges)
@@ -78,7 +86,7 @@
                     return false;
                 Pair<Descriptor, Component> p = SSTable.tryComponentFromFilename(dir, name);
                 Descriptor desc = p == null ? null : p.left;
-                if (p == null || !p.right.equals(Component.DATA) || desc.temporary)
+                if (p == null || !p.right.equals(Component.DATA) || desc.type.isTemporary)
                     return false;
 
                 if (!new File(desc.filenameFor(Component.PRIMARY_INDEX)).exists())
@@ -122,7 +130,7 @@
                         List<Pair<Long, Long>> sstableSections = sstable.getPositionsForRanges(tokenRanges);
                         long estimatedKeys = sstable.estimatedKeysForRanges(tokenRanges);
 
-                        StreamSession.SSTableStreamingSections details = new StreamSession.SSTableStreamingSections(sstable, sstableSections, estimatedKeys);
+                        StreamSession.SSTableStreamingSections details = new StreamSession.SSTableStreamingSections(sstable, sstableSections, estimatedKeys, ActiveRepairService.UNREPAIRED_SSTABLE);
                         streamingDetails.put(endpoint, details);
                     }
 
@@ -149,7 +157,7 @@
         client.init(keyspace);
         outputHandler.output("Established connection to initial hosts");
 
-        StreamPlan plan = new StreamPlan("Bulk Load").connectionFactory(client.getConnectionFactory());
+        StreamPlan plan = new StreamPlan("Bulk Load", 0, connectionsPerHost).connectionFactory(client.getConnectionFactory());
 
         Map<InetAddress, Collection<Range<Token>>> endpointToRanges = client.getEndpointToRangesMap();
         openSSTables(endpointToRanges);
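The SSTableLoader changes above add a connectionsPerHost knob to the bulk-load stream plan. A minimal
usage sketch (not from this patch) is shown below; the SSTableLoader.Client implementation is assumed to
exist already, and the OutputHandler.SystemOutput flag order is an assumption.

import java.io.File;

import org.apache.cassandra.io.sstable.SSTableLoader;
import org.apache.cassandra.utils.OutputHandler;

public class BulkLoadSketch
{
    // 'client' is a hypothetical, already-configured SSTableLoader.Client implementation
    public static void load(File sstableDir, SSTableLoader.Client client) throws Exception
    {
        // the loader derives the keyspace from the parent directory, so sstableDir is <keyspace>/<table>/
        OutputHandler handler = new OutputHandler.SystemOutput(false, false); // (verbose, debug) flags assumed
        int connectionsPerHost = 4;  // stream over four connections per endpoint instead of one
        SSTableLoader loader = new SSTableLoader(sstableDir, client, handler, connectionsPerHost);
        loader.stream().get();       // block until streaming completes
    }
}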
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java b/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java
deleted file mode 100644
index 140e08b..0000000
--- a/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java
+++ /dev/null
@@ -1,518 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.StreamingHistogram;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.EstimatedHistogram;
-
-/**
- * Metadata for a SSTable.
- * Metadata includes:
- *  - estimated row size histogram
- *  - estimated column count histogram
- *  - replay position
- *  - max column timestamp
- *  - max local deletion time
- *  - bloom filter fp chance
- *  - compression ratio
- *  - partitioner
- *  - generations of sstables from which this sstable was compacted, if any
- *  - tombstone drop time histogram
- *
- * An SSTableMetadata should be instantiated via the Collector, openFromDescriptor()
- * or createDefaultInstance()
- */
-public class SSTableMetadata
-{
-    public static final double NO_BLOOM_FLITER_FP_CHANCE = -1.0;
-    public static final double NO_COMPRESSION_RATIO = -1.0;
-    public static final SSTableMetadataSerializer serializer = new SSTableMetadataSerializer();
-
-    public final EstimatedHistogram estimatedRowSize;
-    public final EstimatedHistogram estimatedColumnCount;
-    public final ReplayPosition replayPosition;
-    public final long minTimestamp;
-    public final long maxTimestamp;
-    public final int maxLocalDeletionTime;
-    public final double bloomFilterFPChance;
-    public final double compressionRatio;
-    public final String partitioner;
-    public final StreamingHistogram estimatedTombstoneDropTime;
-    public final int sstableLevel;
-    public final List<ByteBuffer> maxColumnNames;
-    public final List<ByteBuffer> minColumnNames;
-
-    private SSTableMetadata()
-    {
-        this(defaultRowSizeHistogram(),
-             defaultColumnCountHistogram(),
-             ReplayPosition.NONE,
-             Long.MIN_VALUE,
-             Long.MAX_VALUE,
-             Integer.MAX_VALUE,
-             NO_BLOOM_FLITER_FP_CHANCE,
-             NO_COMPRESSION_RATIO,
-             null,
-             defaultTombstoneDropTimeHistogram(),
-             0,
-             Collections.<ByteBuffer>emptyList(),
-             Collections.<ByteBuffer>emptyList());
-    }
-
-    private SSTableMetadata(EstimatedHistogram rowSizes,
-                            EstimatedHistogram columnCounts,
-                            ReplayPosition replayPosition,
-                            long minTimestamp,
-                            long maxTimestamp,
-                            int maxLocalDeletionTime,
-                            double bloomFilterFPChance,
-                            double compressionRatio,
-                            String partitioner,
-                            StreamingHistogram estimatedTombstoneDropTime,
-                            int sstableLevel,
-                            List<ByteBuffer> minColumnNames,
-                            List<ByteBuffer> maxColumnNames)
-    {
-        this.estimatedRowSize = rowSizes;
-        this.estimatedColumnCount = columnCounts;
-        this.replayPosition = replayPosition;
-        this.minTimestamp = minTimestamp;
-        this.maxTimestamp = maxTimestamp;
-        this.maxLocalDeletionTime = maxLocalDeletionTime;
-        this.bloomFilterFPChance = bloomFilterFPChance;
-        this.compressionRatio = compressionRatio;
-        this.partitioner = partitioner;
-        this.estimatedTombstoneDropTime = estimatedTombstoneDropTime;
-        this.sstableLevel = sstableLevel;
-        this.minColumnNames = minColumnNames;
-        this.maxColumnNames = maxColumnNames;
-    }
-
-    public static Collector createCollector(AbstractType<?> columnNameComparator)
-    {
-        return new Collector(columnNameComparator);
-    }
-
-    public static Collector createCollector(Collection<SSTableReader> sstables, AbstractType<?> columnNameComparator, int level)
-    {
-        Collector collector = new Collector(columnNameComparator);
-
-        collector.replayPosition(ReplayPosition.getReplayPosition(sstables));
-        collector.sstableLevel(level);
-        // Get the max timestamp of the precompacted sstables
-        // and adds generation of live ancestors
-        for (SSTableReader sstable : sstables)
-        {
-            collector.addAncestor(sstable.descriptor.generation);
-            for (Integer i : sstable.getAncestors())
-            {
-                if (new File(sstable.descriptor.withGeneration(i).filenameFor(Component.DATA)).exists())
-                    collector.addAncestor(i);
-            }
-        }
-
-        return collector;
-    }
-
-    /**
-     * Used when updating sstablemetadata files with an sstable level
-     * @param metadata
-     * @param sstableLevel
-     * @return
-     */
-    @Deprecated
-    public static SSTableMetadata copyWithNewSSTableLevel(SSTableMetadata metadata, int sstableLevel)
-    {
-        return new SSTableMetadata(metadata.estimatedRowSize,
-                                   metadata.estimatedColumnCount,
-                                   metadata.replayPosition,
-                                   metadata.minTimestamp,
-                                   metadata.maxTimestamp,
-                                   metadata.maxLocalDeletionTime,
-                                   metadata.bloomFilterFPChance,
-                                   metadata.compressionRatio,
-                                   metadata.partitioner,
-                                   metadata.estimatedTombstoneDropTime,
-                                   sstableLevel,
-                                   metadata.minColumnNames,
-                                   metadata.maxColumnNames);
-
-    }
-
-    static EstimatedHistogram defaultColumnCountHistogram()
-    {
-        // EH of 114 can track a max value of 2395318855, i.e., > 2B columns
-        return new EstimatedHistogram(114);
-    }
-
-    static EstimatedHistogram defaultRowSizeHistogram()
-    {
-        // EH of 150 can track a max value of 1697806495183, i.e., > 1.5PB
-        return new EstimatedHistogram(150);
-    }
-
-    static StreamingHistogram defaultTombstoneDropTimeHistogram()
-    {
-        return new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
-    }
-
-    /**
-     * @param gcBefore
-     * @return estimated droppable tombstone ratio at given gcBefore time.
-     */
-    public double getEstimatedDroppableTombstoneRatio(int gcBefore)
-    {
-        long estimatedColumnCount = this.estimatedColumnCount.mean() * this.estimatedColumnCount.count();
-        if (estimatedColumnCount > 0)
-        {
-            double droppable = getDroppableTombstonesBefore(gcBefore);
-            return droppable / estimatedColumnCount;
-        }
-        return 0.0f;
-    }
-
-    /**
-     * Get the amount of droppable tombstones
-     * @param gcBefore the gc time
-     * @return amount of droppable tombstones
-     */
-    public double getDroppableTombstonesBefore(int gcBefore)
-    {
-        return estimatedTombstoneDropTime.sum(gcBefore);
-    }
-
-    public static class Collector
-    {
-        protected EstimatedHistogram estimatedRowSize = defaultRowSizeHistogram();
-        protected EstimatedHistogram estimatedColumnCount = defaultColumnCountHistogram();
-        protected ReplayPosition replayPosition = ReplayPosition.NONE;
-        protected long minTimestamp = Long.MAX_VALUE;
-        protected long maxTimestamp = Long.MIN_VALUE;
-        protected int maxLocalDeletionTime = Integer.MIN_VALUE;
-        protected double compressionRatio = NO_COMPRESSION_RATIO;
-        protected Set<Integer> ancestors = new HashSet<Integer>();
-        protected StreamingHistogram estimatedTombstoneDropTime = defaultTombstoneDropTimeHistogram();
-        protected int sstableLevel;
-        protected List<ByteBuffer> minColumnNames = Collections.emptyList();
-        protected List<ByteBuffer> maxColumnNames = Collections.emptyList();
-        private final AbstractType<?> columnNameComparator;
-
-        private Collector(AbstractType<?> columnNameComparator)
-        {
-            this.columnNameComparator = columnNameComparator;
-        }
-        public void addRowSize(long rowSize)
-        {
-            estimatedRowSize.add(rowSize);
-        }
-
-        public void addColumnCount(long columnCount)
-        {
-            estimatedColumnCount.add(columnCount);
-        }
-
-        public void mergeTombstoneHistogram(StreamingHistogram histogram)
-        {
-            estimatedTombstoneDropTime.merge(histogram);
-        }
-
-        /**
-         * Ratio is compressed/uncompressed and it is
-         * if you have 1.x then compression isn't helping
-         */
-        public void addCompressionRatio(long compressed, long uncompressed)
-        {
-            compressionRatio = (double) compressed/uncompressed;
-        }
-
-        public void updateMinTimestamp(long potentialMin)
-        {
-            minTimestamp = Math.min(minTimestamp, potentialMin);
-        }
-
-        public void updateMaxTimestamp(long potentialMax)
-        {
-            maxTimestamp = Math.max(maxTimestamp, potentialMax);
-        }
-
-        public void updateMaxLocalDeletionTime(int maxLocalDeletionTime)
-        {
-            this.maxLocalDeletionTime = Math.max(this.maxLocalDeletionTime, maxLocalDeletionTime);
-        }
-
-        public SSTableMetadata finalizeMetadata(String partitioner, double bloomFilterFPChance)
-        {
-            return new SSTableMetadata(estimatedRowSize,
-                                       estimatedColumnCount,
-                                       replayPosition,
-                                       minTimestamp,
-                                       maxTimestamp,
-                                       maxLocalDeletionTime,
-                                       bloomFilterFPChance,
-                                       compressionRatio,
-                                       partitioner,
-                                       estimatedTombstoneDropTime,
-                                       sstableLevel,
-                                       minColumnNames,
-                                       maxColumnNames);
-        }
-
-        public Collector estimatedRowSize(EstimatedHistogram estimatedRowSize)
-        {
-            this.estimatedRowSize = estimatedRowSize;
-            return this;
-        }
-
-        public Collector estimatedColumnCount(EstimatedHistogram estimatedColumnCount)
-        {
-            this.estimatedColumnCount = estimatedColumnCount;
-            return this;
-        }
-
-        public Collector replayPosition(ReplayPosition replayPosition)
-        {
-            this.replayPosition = replayPosition;
-            return this;
-        }
-
-        public Collector addAncestor(int generation)
-        {
-            this.ancestors.add(generation);
-            return this;
-        }
-
-        void update(long size, ColumnStats stats)
-        {
-            updateMinTimestamp(stats.minTimestamp);
-            /*
-             * The max timestamp is not always collected here (more precisely, row.maxTimestamp() may return Long.MIN_VALUE),
-             * to avoid deserializing an EchoedRow.
-             * This is the reason why it is collected first when calling ColumnFamilyStore.createCompactionWriter
-             * However, for old sstables without timestamp, we still want to update the timestamp (and we know
-             * that in this case we will not use EchoedRow, since CompactionControler.needsDeserialize() will be true).
-            */
-            updateMaxTimestamp(stats.maxTimestamp);
-            updateMaxLocalDeletionTime(stats.maxLocalDeletionTime);
-            addRowSize(size);
-            addColumnCount(stats.columnCount);
-            mergeTombstoneHistogram(stats.tombstoneHistogram);
-            updateMinColumnNames(stats.minColumnNames);
-            updateMaxColumnNames(stats.maxColumnNames);
-        }
-
-        public Collector sstableLevel(int sstableLevel)
-        {
-            this.sstableLevel = sstableLevel;
-            return this;
-        }
-
-        public Collector updateMinColumnNames(List<ByteBuffer> minColumnNames)
-        {
-            if (minColumnNames.size() > 0)
-                this.minColumnNames = ColumnNameHelper.mergeMin(this.minColumnNames, minColumnNames, columnNameComparator);
-            return this;
-        }
-
-        public Collector updateMaxColumnNames(List<ByteBuffer> maxColumnNames)
-        {
-            if (maxColumnNames.size() > 0)
-                this.maxColumnNames = ColumnNameHelper.mergeMax(this.maxColumnNames, maxColumnNames, columnNameComparator);
-            return this;
-        }
-    }
-
-    public static class SSTableMetadataSerializer
-    {
-        private static final Logger logger = LoggerFactory.getLogger(SSTableMetadataSerializer.class);
-
-        public void serialize(SSTableMetadata sstableStats, Set<Integer> ancestors, DataOutput out) throws IOException
-        {
-            assert sstableStats.partitioner != null;
-
-            EstimatedHistogram.serializer.serialize(sstableStats.estimatedRowSize, out);
-            EstimatedHistogram.serializer.serialize(sstableStats.estimatedColumnCount, out);
-            ReplayPosition.serializer.serialize(sstableStats.replayPosition, out);
-            out.writeLong(sstableStats.minTimestamp);
-            out.writeLong(sstableStats.maxTimestamp);
-            out.writeInt(sstableStats.maxLocalDeletionTime);
-            out.writeDouble(sstableStats.bloomFilterFPChance);
-            out.writeDouble(sstableStats.compressionRatio);
-            out.writeUTF(sstableStats.partitioner);
-            out.writeInt(ancestors.size());
-            for (Integer g : ancestors)
-                out.writeInt(g);
-            StreamingHistogram.serializer.serialize(sstableStats.estimatedTombstoneDropTime, out);
-            out.writeInt(sstableStats.sstableLevel);
-            serializeMinMaxColumnNames(sstableStats.minColumnNames, sstableStats.maxColumnNames, out);
-        }
-
-        private void serializeMinMaxColumnNames(List<ByteBuffer> minColNames, List<ByteBuffer> maxColNames, DataOutput out) throws IOException
-        {
-            out.writeInt(minColNames.size());
-            for (ByteBuffer columnName : minColNames)
-                ByteBufferUtil.writeWithShortLength(columnName, out);
-            out.writeInt(maxColNames.size());
-            for (ByteBuffer columnName : maxColNames)
-                ByteBufferUtil.writeWithShortLength(columnName, out);
-        }
-        /**
-         * Used to serialize to an old version - needed to be able to update sstable level without a full compaction.
-         *
-         * @deprecated will be removed when it is assumed that the minimum upgrade-from-version is the version that this
-         * patch made it into
-         *
-         * @param sstableStats
-         * @param legacyDesc
-         * @param out
-         * @throws IOException
-         */
-        @Deprecated
-        public void legacySerialize(SSTableMetadata sstableStats, Set<Integer> ancestors, Descriptor legacyDesc, DataOutput out) throws IOException
-        {
-            EstimatedHistogram.serializer.serialize(sstableStats.estimatedRowSize, out);
-            EstimatedHistogram.serializer.serialize(sstableStats.estimatedColumnCount, out);
-            ReplayPosition.serializer.serialize(sstableStats.replayPosition, out);
-            out.writeLong(sstableStats.minTimestamp);
-            out.writeLong(sstableStats.maxTimestamp);
-            if (legacyDesc.version.tracksMaxLocalDeletionTime)
-                out.writeInt(sstableStats.maxLocalDeletionTime);
-            if (legacyDesc.version.hasBloomFilterFPChance)
-                out.writeDouble(sstableStats.bloomFilterFPChance);
-            out.writeDouble(sstableStats.compressionRatio);
-            out.writeUTF(sstableStats.partitioner);
-            out.writeInt(ancestors.size());
-            for (Integer g : ancestors)
-                out.writeInt(g);
-            StreamingHistogram.serializer.serialize(sstableStats.estimatedTombstoneDropTime, out);
-            out.writeInt(sstableStats.sstableLevel);
-            if (legacyDesc.version.tracksMaxMinColumnNames)
-                serializeMinMaxColumnNames(sstableStats.minColumnNames, sstableStats.maxColumnNames, out);
-        }
-
-        /**
-         * deserializes the metadata
-         *
-         * returns a pair containing the part of the metadata meant to be kept-in memory and the part
-         * that should not.
-         *
-         * @param descriptor the descriptor
-         * @return a pair containing data that needs to be in memory and data that is potentially big and is not needed
-         *         in memory
-         * @throws IOException
-         */
-        public Pair<SSTableMetadata, Set<Integer>> deserialize(Descriptor descriptor) throws IOException
-        {
-            return deserialize(descriptor, true);
-        }
-
-        public Pair<SSTableMetadata, Set<Integer>> deserialize(Descriptor descriptor, boolean loadSSTableLevel) throws IOException
-        {
-            logger.debug("Load metadata for {}", descriptor);
-            File statsFile = new File(descriptor.filenameFor(SSTable.COMPONENT_STATS));
-            if (!statsFile.exists())
-            {
-                logger.debug("No sstable stats for {}", descriptor);
-                return Pair.create(new SSTableMetadata(), Collections.<Integer>emptySet());
-            }
-
-            DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(statsFile)));
-            try
-            {
-                return deserialize(in, descriptor, loadSSTableLevel);
-            }
-            finally
-            {
-                FileUtils.closeQuietly(in);
-            }
-        }
-        public Pair<SSTableMetadata, Set<Integer>> deserialize(DataInputStream in, Descriptor desc) throws IOException
-        {
-            return deserialize(in, desc, true);
-        }
-
-        public Pair<SSTableMetadata, Set<Integer>> deserialize(DataInputStream in, Descriptor desc, boolean loadSSTableLevel) throws IOException
-        {
-            EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(in);
-            EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(in);
-            ReplayPosition replayPosition = ReplayPosition.serializer.deserialize(in);
-            long minTimestamp = in.readLong();
-            long maxTimestamp = in.readLong();
-            int maxLocalDeletionTime = desc.version.tracksMaxLocalDeletionTime ? in.readInt() : Integer.MAX_VALUE;
-            double bloomFilterFPChance = desc.version.hasBloomFilterFPChance ? in.readDouble() : NO_BLOOM_FLITER_FP_CHANCE;
-            double compressionRatio = in.readDouble();
-            String partitioner = in.readUTF();
-            int nbAncestors = in.readInt();
-            Set<Integer> ancestors = new HashSet<Integer>(nbAncestors);
-            for (int i = 0; i < nbAncestors; i++)
-                ancestors.add(in.readInt());
-            StreamingHistogram tombstoneHistogram = StreamingHistogram.serializer.deserialize(in);
-            int sstableLevel = 0;
-
-            if (loadSSTableLevel && in.available() > 0)
-                sstableLevel = in.readInt();
-
-            List<ByteBuffer> minColumnNames;
-            List<ByteBuffer> maxColumnNames;
-            if (desc.version.tracksMaxMinColumnNames)
-            {
-                int colCount = in.readInt();
-                minColumnNames = new ArrayList<ByteBuffer>(colCount);
-                for (int i = 0; i < colCount; i++)
-                {
-                    minColumnNames.add(ByteBufferUtil.readWithShortLength(in));
-                }
-                colCount = in.readInt();
-                maxColumnNames = new ArrayList<ByteBuffer>(colCount);
-                for (int i = 0; i < colCount; i++)
-                {
-                    maxColumnNames.add(ByteBufferUtil.readWithShortLength(in));
-                }
-            }
-            else
-            {
-                minColumnNames = Collections.emptyList();
-                maxColumnNames = Collections.emptyList();
-            }
-            return Pair.create(new SSTableMetadata(rowSizes,
-                                       columnCounts,
-                                       replayPosition,
-                                       minTimestamp,
-                                       maxTimestamp,
-                                       maxLocalDeletionTime,
-                                       bloomFilterFPChance,
-                                       compressionRatio,
-                                       partitioner,
-                                       tombstoneHistogram,
-                                       sstableLevel,
-                                       minColumnNames,
-                                       maxColumnNames), ancestors);
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
index 92dee99..872f7df 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
@@ -17,9 +17,24 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.ScheduledFuture;
@@ -30,30 +45,73 @@
 import java.util.concurrent.atomic.AtomicLong;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Ordering;
 import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.RateLimiter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
+import com.clearspring.analytics.stream.cardinality.ICardinality;
+import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.cache.InstrumentingCache;
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DataTracker;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.ICompactionScanner;
 import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
 import org.apache.cassandra.io.compress.CompressedThrottledReader;
 import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.io.sstable.metadata.CompactionMetadata;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.io.util.BufferedSegmentedFile;
+import org.apache.cassandra.io.util.CompressedSegmentedFile;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.ICompressedFile;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SegmentedFile;
+import org.apache.cassandra.io.util.ThrottledReader;
 import org.apache.cassandra.metrics.RestorableMeter;
+import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 import static org.apache.cassandra.db.Directories.SECONDARY_INDEX_NAME_SEPARATOR;
 
@@ -61,15 +119,35 @@
  * SSTableReaders are open()ed by Keyspace.onStart; after that they are created by SSTableWriter.renameAndOpen.
  * Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
  */
-public class SSTableReader extends SSTable implements Closeable
+public class SSTableReader extends SSTable
 {
     private static final Logger logger = LoggerFactory.getLogger(SSTableReader.class);
 
     private static final ScheduledThreadPoolExecutor syncExecutor = new ScheduledThreadPoolExecutor(1);
     private static final RateLimiter meterSyncThrottle = RateLimiter.create(100.0);
 
+    public static final Comparator<SSTableReader> maxTimestampComparator = new Comparator<SSTableReader>()
+    {
+        public int compare(SSTableReader o1, SSTableReader o2)
+        {
+            long ts1 = o1.getMaxTimestamp();
+            long ts2 = o2.getMaxTimestamp();
+            return (ts1 > ts2 ? -1 : (ts1 == ts2 ? 0 : 1));
+        }
+    };
+
+    public static final Comparator<SSTableReader> sstableComparator = new Comparator<SSTableReader>()
+    {
+        public int compare(SSTableReader o1, SSTableReader o2)
+        {
+            return o1.first.compareTo(o2.first);
+        }
+    };
+
+    public static final Ordering<SSTableReader> sstableOrdering = Ordering.from(sstableComparator);
+
     /**
-     * maxDataAge is a timestamp in local server time (e.g. System.currentTimeMilli) which represents an uppper bound
+     * maxDataAge is a timestamp in local server time (e.g. System.currentTimeMillis()) which represents an upper bound
      * to the newest piece of data stored in the sstable. In other words, this sstable does not contain items created
      * later than maxDataAge.
      *
@@ -82,6 +160,15 @@
      */
     public final long maxDataAge;
 
+    public enum OpenReason
+    {
+        NORMAL,
+        EARLY,
+        METADATA_CHANGE
+    }
+
+    public final OpenReason openReason;
+
     // indexfile and datafile: might be null before a call to load()
     private SegmentedFile ifile;
     private SegmentedFile dfile;
@@ -98,29 +185,89 @@
     // but it seems like a good extra layer of protection against reference counting bugs to not delete data based on that alone
     private final AtomicBoolean isCompacted = new AtomicBoolean(false);
     private final AtomicBoolean isSuspect = new AtomicBoolean(false);
-    private final SSTableDeletingTask deletingTask;
+
     // not final since we need to be able to change level on a file.
-    private volatile SSTableMetadata sstableMetadata;
+    private volatile StatsMetadata sstableMetadata;
 
     private final AtomicLong keyCacheHit = new AtomicLong(0);
     private final AtomicLong keyCacheRequest = new AtomicLong(0);
 
+    /**
+     * To support replacing this SSTableReader with another object that represents the same underlying sstable
+     * but with different associated resources, we build a linked-list chain of replacements, synchronised on a
+     * shared lock object so the list can be maintained safely across threads. On close we check whether any of
+     * the closeable resources differ from those of the adjacent links; any resource present in neither adjacent
+     * link (if any) is closed. Once that decision is made we remove ourselves from the linked list, so that
+     * readers behind/ahead of us compare only against resources that are still open.
+     */
+    private Object replaceLock = new Object();
+    private SSTableReader replacedBy;
+    private SSTableReader replaces;
+    private SSTableDeletingTask deletingTask;
+    private Runnable runOnClose;
+
     @VisibleForTesting
     public RestorableMeter readMeter;
     private ScheduledFuture readMeterSyncFuture;
 
-    public static long getApproximateKeyCount(Iterable<SSTableReader> sstables, CFMetaData metadata)
+    /**
+     * Calculate approximate key count.
+     * If a cardinality estimator is available for all of the given sstables, this method uses them to
+     * estimate the key count.
+     * If not, it falls back to the index summaries.
+     *
+     * @param sstables SSTables to calculate the key count for
+     * @return estimated key count
+     */
+    public static long getApproximateKeyCount(Collection<SSTableReader> sstables)
     {
-        long count = 0;
+        long count = -1;
 
-        for (SSTableReader sstable : sstables)
+        // check if cardinality estimator is available for all SSTables
+        boolean cardinalityAvailable = !sstables.isEmpty() && Iterators.all(sstables.iterator(), new Predicate<SSTableReader>()
         {
-            int indexKeyCount = sstable.getKeySampleSize();
-            count = count + (indexKeyCount + 1) * sstable.indexSummary.getIndexInterval();
-            if (logger.isDebugEnabled())
-                logger.debug("index size for bloom filter calc for file  : " + sstable.getFilename() + "   : " + count);
+            public boolean apply(SSTableReader sstable)
+            {
+                return sstable.descriptor.version.newStatsFile;
+            }
+        });
+
+        // if it is, load them to estimate key count
+        if (cardinalityAvailable)
+        {
+            boolean failed = false;
+            ICardinality cardinality = null;
+            for (SSTableReader sstable : sstables)
+            {
+                try
+                {
+                    CompactionMetadata metadata = (CompactionMetadata) sstable.descriptor.getMetadataSerializer().deserialize(sstable.descriptor, MetadataType.COMPACTION);
+                    if (cardinality == null)
+                        cardinality = metadata.cardinalityEstimator;
+                    else
+                        cardinality = cardinality.merge(metadata.cardinalityEstimator);
+                }
+                catch (IOException e)
+                {
+                    logger.warn("Reading cardinality from Statistics.db failed.", e);
+                    failed = true;
+                    break;
+                }
+                catch (CardinalityMergeException e)
+                {
+                    logger.warn("Cardinality merge failed.", e);
+                    failed = true;
+                    break;
+                }
+            }
+            if (cardinality != null && !failed)
+                count = cardinality.cardinality();
         }
 
+        // if something went wrong above or cardinality is not available, calculate using index summary
+        if (count < 0)
+        {
+            count = 0;
+            for (SSTableReader sstable : sstables)
+                count += sstable.estimatedKeys();
+        }
         return count;
     }
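
A note on the estimation strategy in the hunk above: summing per-sstable index-summary estimates counts a key once per sstable it appears in, whereas merging the per-sstable cardinality estimators deduplicates keys across sstables. The following standalone sketch illustrates the difference, with plain JDK sets standing in for the HyperLogLog-style ICardinality components (the sstable contents are made up for illustration):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class KeyCountSketch
    {
        public static void main(String[] args)
        {
            // Hypothetical per-sstable key sets; "k2" and "k3" exist in both sstables.
            List<Set<String>> sstableKeys = Arrays.asList(
                    new HashSet<>(Arrays.asList("k1", "k2", "k3")),
                    new HashSet<>(Arrays.asList("k2", "k3", "k4")));

            // Fallback path: summing per-sstable estimates counts overlapping keys twice.
            long summed = 0;
            for (Set<String> keys : sstableKeys)
                summed += keys.size();

            // Cardinality path: a union (here an exact set, in Cassandra a merged
            // HyperLogLog-style estimator) counts each distinct key once.
            Set<String> merged = new HashSet<>();
            for (Set<String> keys : sstableKeys)
                merged.addAll(keys);

            System.out.println("summed estimate: " + summed);        // 6
            System.out.println("merged estimate: " + merged.size()); // 4
        }
    }
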
 
@@ -150,38 +297,69 @@
         return open(desc, componentsFor(desc), metadata, p);
     }
 
+    public static SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
+    {
+        return open(descriptor, components, metadata, partitioner, true);
+    }
+
     public static SSTableReader openNoValidation(Descriptor descriptor, Set<Component> components, CFMetaData metadata) throws IOException
     {
         return open(descriptor, components, metadata, StorageService.getPartitioner(), false);
     }
 
+    /**
+     * Open an SSTable reader to be used in batch mode (such as by sstableloader).
+     *
+     * @param descriptor
+     * @param components
+     * @param metadata
+     * @param partitioner
+     * @return opened SSTableReader
+     * @throws IOException
+     */
     public static SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
     {
-        SSTableMetadata sstableMetadata = openMetadata(descriptor, components, partitioner);
+        // Minimum components without which we can't do anything
+        assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
+        assert components.contains(Component.PRIMARY_INDEX) : "Primary index component is missing for sstable " + descriptor;
+
+        Map<MetadataType, MetadataComponent> sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor,
+                                                                                                               EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS));
+        ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
+        StatsMetadata statsMetadata = (StatsMetadata) sstableMetadata.get(MetadataType.STATS);
+
+        // Check if the sstable was created using the same partitioner.
+        // Validation metadata can be missing, which indicates an older sstable version or that no stats are
+        // available; in that case, we skip the check.
+        String partitionerName = partitioner.getClass().getCanonicalName();
+        if (validationMetadata != null && !partitionerName.equals(validationMetadata.partitioner))
+        {
+            logger.error(String.format("Cannot open %s; partitioner %s does not match system partitioner %s.  Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, so you will need to edit that to match your old partitioner if upgrading.",
+                                              descriptor, validationMetadata.partitioner, partitionerName));
+            System.exit(1);
+        }
+
+        logger.info("Opening {} ({} bytes)", descriptor, new File(descriptor.filenameFor(Component.DATA)).length());
         SSTableReader sstable = new SSTableReader(descriptor,
                                                   components,
                                                   metadata,
                                                   partitioner,
                                                   System.currentTimeMillis(),
-                                                  sstableMetadata);
+                                                  statsMetadata,
+                                                  OpenReason.NORMAL);
 
         // special implementation of load to use non-pooled SegmentedFile builders
         SegmentedFile.Builder ibuilder = new BufferedSegmentedFile.Builder();
         SegmentedFile.Builder dbuilder = sstable.compression
-                                       ? new CompressedSegmentedFile.Builder()
+                                       ? new CompressedSegmentedFile.Builder(null)
                                        : new BufferedSegmentedFile.Builder();
-        if (!loadSummary(sstable, ibuilder, dbuilder, sstable.metadata))
-            sstable.buildSummary(false, ibuilder, dbuilder, false);
+        if (!sstable.loadSummary(ibuilder, dbuilder))
+            sstable.buildSummary(false, ibuilder, dbuilder, false, Downsampling.BASE_SAMPLING_LEVEL);
         sstable.ifile = ibuilder.complete(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX));
         sstable.dfile = dbuilder.complete(sstable.descriptor.filenameFor(Component.DATA));
-
         sstable.bf = FilterFactory.AlwaysPresent;
-        return sstable;
-    }
 
-    public static SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
-    {
-        return open(descriptor, components, metadata, partitioner, true);
+        return sstable;
     }
 
     private static SSTableReader open(Descriptor descriptor,
@@ -190,66 +368,62 @@
                                       IPartitioner partitioner,
                                       boolean validate) throws IOException
     {
-        long start = System.nanoTime();
-        SSTableMetadata sstableMetadata = openMetadata(descriptor, components, partitioner);
+        // Minimum components without which we can't do anything
+        assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
+        assert components.contains(Component.PRIMARY_INDEX) : "Primary index component is missing for sstable " + descriptor;
 
+        Map<MetadataType, MetadataComponent> sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor,
+                                                                                                               EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS));
+        ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
+        StatsMetadata statsMetadata = (StatsMetadata) sstableMetadata.get(MetadataType.STATS);
+
+        // Check if the sstable was created using the same partitioner.
+        // Validation metadata can be missing, which indicates an older sstable version or that no stats are
+        // available; in that case, we skip the check.
+        String partitionerName = partitioner.getClass().getCanonicalName();
+        if (validationMetadata != null && !partitionerName.equals(validationMetadata.partitioner))
+        {
+            logger.error(String.format("Cannot open %s; partitioner %s does not match system partitioner %s.  Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, so you will need to edit that to match your old partitioner if upgrading.",
+                                              descriptor, validationMetadata.partitioner, partitionerName));
+            System.exit(1);
+        }
+
+        logger.info("Opening {} ({} bytes)", descriptor, new File(descriptor.filenameFor(Component.DATA)).length());
         SSTableReader sstable = new SSTableReader(descriptor,
                                                   components,
                                                   metadata,
                                                   partitioner,
                                                   System.currentTimeMillis(),
-                                                  sstableMetadata);
+                                                  statsMetadata,
+                                                  OpenReason.NORMAL);
 
-        sstable.load();
+        // load index and filter
+        long start = System.nanoTime();
+        sstable.load(validationMetadata);
+        logger.debug("INDEX LOAD TIME for {}: {} ms.", descriptor, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
 
         if (validate)
             sstable.validate();
 
-        logger.debug("INDEX LOAD TIME for {}: {} ms.", descriptor, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
-
         if (sstable.getKeyCache() != null)
             logger.debug("key cache contains {}/{} keys", sstable.getKeyCache().size(), sstable.getKeyCache().getCapacity());
 
         return sstable;
     }
 
-    private static SSTableMetadata openMetadata(Descriptor descriptor, Set<Component> components, IPartitioner partitioner) throws IOException
-    {
-        assert partitioner != null;
-        // Minimum components without which we can't do anything
-        assert components.contains(Component.DATA) : "Data component is missing for sstable" + descriptor;
-        assert components.contains(Component.PRIMARY_INDEX) : "Primary index component is missing for sstable " + descriptor;
-
-        logger.info("Opening {} ({} bytes)", descriptor, new File(descriptor.filenameFor(COMPONENT_DATA)).length());
-
-        SSTableMetadata sstableMetadata = SSTableMetadata.serializer.deserialize(descriptor).left;
-
-        // Check if sstable is created using same partitioner.
-        // Partitioner can be null, which indicates older version of sstable or no stats available.
-        // In that case, we skip the check.
-        String partitionerName = partitioner.getClass().getCanonicalName();
-        if (sstableMetadata.partitioner != null && !partitionerName.equals(sstableMetadata.partitioner))
-        {
-            logger.error(String.format("Cannot open %s; partitioner %s does not match system partitioner %s.  Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, so you will need to edit that to match your old partitioner if upgrading.",
-                                       descriptor, sstableMetadata.partitioner, partitionerName));
-            System.exit(1);
-        }
-        return sstableMetadata;
-    }
-
     public static void logOpenException(Descriptor descriptor, IOException e)
     {
         if (e instanceof FileNotFoundException)
-            logger.error("Missing sstable component in " + descriptor + "; skipped because of " + e.getMessage());
+            logger.error("Missing sstable component in {}; skipped because of {}", descriptor, e.getMessage());
         else
-            logger.error("Corrupt sstable " + descriptor + "; skipped", e);
+            logger.error("Corrupt sstable {}; skipped", descriptor, e);
     }
 
     public static Collection<SSTableReader> openAll(Set<Map.Entry<Descriptor, Set<Component>>> entries,
                                                       final CFMetaData metadata,
                                                       final IPartitioner partitioner)
     {
-        final Collection<SSTableReader> sstables = new LinkedBlockingQueue<SSTableReader>();
+        final Collection<SSTableReader> sstables = new LinkedBlockingQueue<>();
 
         ExecutorService executor = DebuggableThreadPoolExecutor.createWithFixedPoolSize("SSTableBatchOpen", FBUtilities.getAvailableProcessors());
         for (final Map.Entry<Descriptor, Set<Component>> entry : entries)
@@ -265,7 +439,7 @@
                     }
                     catch (IOException ex)
                     {
-                        logger.error("Corrupt sstable " + entry + "; skipped", ex);
+                        logger.error("Corrupt sstable {}; skipped", entry, ex);
                         return;
                     }
                     sstables.add(sstable);
@@ -300,7 +474,8 @@
                                       IndexSummary isummary,
                                       IFilter bf,
                                       long maxDataAge,
-                                      SSTableMetadata sstableMetadata)
+                                      StatsMetadata sstableMetadata,
+                                      OpenReason openReason)
     {
         assert desc != null && partitioner != null && ifile != null && dfile != null && isummary != null && bf != null && sstableMetadata != null;
         return new SSTableReader(desc,
@@ -311,7 +486,8 @@
                                  isummary,
                                  bf,
                                  maxDataAge,
-                                 sstableMetadata);
+                                 sstableMetadata,
+                                 openReason);
     }
 
 
@@ -320,17 +496,20 @@
                           CFMetaData metadata,
                           IPartitioner partitioner,
                           long maxDataAge,
-                          SSTableMetadata sstableMetadata)
+                          StatsMetadata sstableMetadata,
+                          OpenReason openReason)
     {
         super(desc, components, metadata, partitioner);
         this.sstableMetadata = sstableMetadata;
         this.maxDataAge = maxDataAge;
+        this.openReason = openReason;
 
         deletingTask = new SSTableDeletingTask(this);
 
         // Don't track read rates for tables in the system keyspace and don't bother trying to load or persist
-        // the read meter when in client mode
-        if (Keyspace.SYSTEM_KS.equals(desc.ksname) || Config.isClientMode())
+        // the read meter when in client mode.  Also don't track reads for special operations (like early open);
+        // this is to avoid overflowing the executor queue (see CASSANDRA-8066)
+        if (Keyspace.SYSTEM_KS.equals(desc.ksname) || Config.isClientMode() || openReason != OpenReason.NORMAL)
         {
             readMeter = null;
             readMeterSyncFuture = null;
@@ -361,9 +540,10 @@
                           IndexSummary indexSummary,
                           IFilter bloomFilter,
                           long maxDataAge,
-                          SSTableMetadata sstableMetadata)
+                          StatsMetadata sstableMetadata,
+                          OpenReason openReason)
     {
-        this(desc, components, metadata, partitioner, maxDataAge, sstableMetadata);
+        this(desc, components, metadata, partitioner, maxDataAge, sstableMetadata, openReason);
 
         this.ifile = ifile;
         this.dfile = dfile;
@@ -371,22 +551,145 @@
         this.bf = bloomFilter;
     }
 
-    /**
-     * Clean up all opened resources.
-     *
-     * @throws IOException
-     */
-    public void close() throws IOException
+    public static long getTotalBytes(Iterable<SSTableReader> sstables)
+    {
+        long sum = 0;
+        for (SSTableReader sstable : sstables)
+        {
+            sum += sstable.onDiskLength();
+        }
+        return sum;
+    }
+
+    private void tidy(boolean release)
     {
         if (readMeterSyncFuture != null)
             readMeterSyncFuture.cancel(false);
 
-        // Force finalizing mmapping if necessary
-        ifile.cleanup();
-        dfile.cleanup();
-        // close the BF so it can be opened later.
-        bf.close();
-        indexSummary.close();
+        if (references.get() != 0)
+        {
+            throw new IllegalStateException("SSTable is not fully released (" + references.get() + " references)");
+        }
+
+        synchronized (replaceLock)
+        {
+            boolean closeBf = true, closeSummary = true, closeFiles = true, deleteFiles = false;
+
+            if (replacedBy != null)
+            {
+                closeBf = replacedBy.bf != bf;
+                closeSummary = replacedBy.indexSummary != indexSummary;
+                closeFiles = replacedBy.dfile != dfile;
+                // if the replacement sstablereader uses a different path, clean up our paths
+                deleteFiles = !dfile.path.equals(replacedBy.dfile.path);
+            }
+
+            if (replaces != null)
+            {
+                closeBf &= replaces.bf != bf;
+                closeSummary &= replaces.indexSummary != indexSummary;
+                closeFiles &= replaces.dfile != dfile;
+                deleteFiles &= !dfile.path.equals(replaces.dfile.path);
+            }
+
+            boolean deleteAll = false;
+            if (release && isCompacted.get())
+            {
+                assert replacedBy == null;
+                if (replaces != null)
+                {
+                    replaces.replacedBy = null;
+                    replaces.deletingTask = deletingTask;
+                    replaces.markObsolete();
+                }
+                else
+                {
+                    deleteAll = true;
+                }
+            }
+            else
+            {
+                if (replaces != null)
+                    replaces.replacedBy = replacedBy;
+                if (replacedBy != null)
+                    replacedBy.replaces = replaces;
+            }
+
+            scheduleTidy(closeBf, closeSummary, closeFiles, deleteFiles, deleteAll);
+        }
+    }
+
+    private void scheduleTidy(final boolean closeBf, final boolean closeSummary, final boolean closeFiles, final boolean deleteFiles, final boolean deleteAll)
+    {
+        if (references.get() != 0)
+            throw new IllegalStateException("SSTable is not fully released (" + references.get() + " references)");
+
+        final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata.cfId);
+        final OpOrder.Barrier barrier;
+        if (cfs != null)
+        {
+            barrier = cfs.readOrdering.newBarrier();
+            barrier.issue();
+        }
+        else
+            barrier = null;
+
+        StorageService.tasks.execute(new Runnable()
+        {
+            public void run()
+            {
+                if (barrier != null)
+                    barrier.await();
+                if (closeBf)
+                    bf.close();
+                if (closeSummary)
+                    indexSummary.close();
+                if (closeFiles)
+                {
+                    ifile.cleanup();
+                    dfile.cleanup();
+                }
+                if (runOnClose != null)
+                    runOnClose.run();
+                if (deleteAll)
+                {
+                    /**
+                     * Do the OS a favour and suggest (using the fadvise call) that we
+                     * don't want to see pages of this SSTable in memory anymore.
+                     *
+                     * NOTE: We can't use madvise in Java because it requires the address of
+                     * the mapping, so instead we always open a file and run fadvise(fd, 0, 0) on it.
+                     */
+                    dropPageCache();
+                    deletingTask.run();
+                }
+                else if (deleteFiles)
+                {
+                    FileUtils.deleteWithConfirm(new File(dfile.path));
+                    FileUtils.deleteWithConfirm(new File(ifile.path));
+                }
+            }
+        });
+    }
+
+    public boolean equals(Object that)
+    {
+        return that instanceof SSTableReader && ((SSTableReader) that).descriptor.equals(this.descriptor);
+    }
+
+    public int hashCode()
+    {
+        return this.descriptor.hashCode();
+    }
+
+    public String getFilename()
+    {
+        return dfile.path;
+    }
+
+    public String getIndexFilename()
+    {
+        return ifile.path;
     }
 
     public void setTrackedBy(DataTracker tracker)
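
The tidy() path in the hunk above only closes a resource when neither adjacent link in the replacement chain still shares it, and then unlinks itself so that later closes compare against live readers only. A minimal standalone sketch of that rule, with an Object standing in for the shared bloom filter / index summary / data file (the class and field names are illustrative, not SSTableReader's):

    public class ReplacementChainSketch
    {
        static class Node
        {
            Node replaces, replacedBy;      // adjacent links in the replacement chain
            Object sharedFileHandle;        // stands in for bf / indexSummary / dfile

            // Close the handle only if neither adjacent link still shares it,
            // then unlink ourselves so later closes compare against live nodes only.
            void close()
            {
                boolean stillShared = (replacedBy != null && replacedBy.sharedFileHandle == sharedFileHandle)
                                   || (replaces   != null && replaces.sharedFileHandle   == sharedFileHandle);
                if (!stillShared)
                    System.out.println("closing handle " + sharedFileHandle);

                if (replaces != null)
                    replaces.replacedBy = replacedBy;
                if (replacedBy != null)
                    replacedBy.replaces = replaces;
            }
        }

        public static void main(String[] args)
        {
            Object handle = new Object();
            Node old = new Node();
            Node young = new Node();
            old.sharedFileHandle = handle;
            young.sharedFileHandle = handle;   // the replacement reuses the same handle
            old.replacedBy = young;
            young.replaces = old;

            old.close();    // handle is still shared by 'young', so it is not closed
            young.close();  // no remaining sharer, so the handle is closed
        }
    }
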
@@ -398,7 +701,7 @@
         keyCache = CacheService.instance.keyCache;
     }
 
-    private void load() throws IOException
+    private void load(ValidationMetadata validation) throws IOException
     {
         if (metadata.getBloomFilterFpChance() == 1.0)
         {
@@ -406,25 +709,30 @@
             load(false, true);
             bf = FilterFactory.AlwaysPresent;
         }
-        else if (!components.contains(Component.FILTER))
+        else if (!components.contains(Component.FILTER) || validation == null)
         {
             // bf is enabled, but filter component is missing.
             load(true, true);
         }
-        else if (descriptor.version.hasBloomFilterFPChance && sstableMetadata.bloomFilterFPChance != metadata.getBloomFilterFpChance())
+        else if (validation.bloomFilterFPChance != metadata.getBloomFilterFpChance())
         {
             // the bf fp chance recorded in the sstable metadata has changed since the sstable was written.
             load(true, true);
         }
         else
         {
-            // bf is enabled, but fp chance isn't present in metadata (pre-ja) OR matches the currently configured value.
+            // bf is enabled and fp chance matches the currently configured value.
             load(false, true);
             loadBloomFilter();
         }
     }
 
-    void loadBloomFilter() throws IOException
+    /**
+     * Load the bloom filter from the Filter.db file.
+     *
+     * @throws IOException
+     */
+    private void loadBloomFilter() throws IOException
     {
         DataInputStream stream = null;
         try
@@ -450,18 +758,27 @@
                                          ? SegmentedFile.getCompressedBuilder()
                                          : SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
 
-        boolean summaryLoaded = loadSummary(this, ibuilder, dbuilder, metadata);
+        boolean summaryLoaded = loadSummary(ibuilder, dbuilder);
         if (recreateBloomFilter || !summaryLoaded)
-            buildSummary(recreateBloomFilter, ibuilder, dbuilder, summaryLoaded);
+            buildSummary(recreateBloomFilter, ibuilder, dbuilder, summaryLoaded, Downsampling.BASE_SAMPLING_LEVEL);
 
         ifile = ibuilder.complete(descriptor.filenameFor(Component.PRIMARY_INDEX));
         dfile = dbuilder.complete(descriptor.filenameFor(Component.DATA));
         if (saveSummaryIfCreated && (recreateBloomFilter || !summaryLoaded)) // save summary information to disk
-            saveSummary(this, ibuilder, dbuilder);
+            saveSummary(ibuilder, dbuilder);
     }
 
-     private void buildSummary(boolean recreateBloomFilter, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder, boolean summaryLoaded) throws IOException
-     {
+    /**
+     * Build the index summary (and optionally the bloom filter) by reading through the Index.db file.
+     *
+     * @param recreateBloomFilter true if the bloom filter should be recreated
+     * @param ibuilder
+     * @param dbuilder
+     * @param summaryLoaded true if the index summary is already loaded and does not need to be built again
+     * @param samplingLevel the sampling level to use when building the index summary
+     * @throws IOException
+     */
+    private void buildSummary(boolean recreateBloomFilter, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder, boolean summaryLoaded, int samplingLevel) throws IOException
+    {
         // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
         RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
 
@@ -478,20 +795,20 @@
 
             IndexSummaryBuilder summaryBuilder = null;
             if (!summaryLoaded)
-                summaryBuilder = new IndexSummaryBuilder(estimatedKeys, metadata.getIndexInterval());
+                summaryBuilder = new IndexSummaryBuilder(estimatedKeys, metadata.getMinIndexInterval(), samplingLevel);
 
             long indexPosition;
             while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
             {
                 ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex);
-                RowIndexEntry indexEntry = RowIndexEntry.serializer.deserialize(primaryIndex, descriptor.version);
+                RowIndexEntry indexEntry = metadata.comparator.rowIndexEntrySerializer().deserialize(primaryIndex, descriptor.version);
                 DecoratedKey decoratedKey = partitioner.decorateKey(key);
                 if (first == null)
                     first = decoratedKey;
                 last = decoratedKey;
 
                 if (recreateBloomFilter)
-                    bf.add(decoratedKey.key);
+                    bf.add(decoratedKey.getKey());
 
                 // if summary was already read from disk we don't want to re-populate it using primary index
                 if (!summaryLoaded)
@@ -514,35 +831,38 @@
         last = getMinimalKey(last);
     }
 
-    public static boolean loadSummary(SSTableReader reader, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder, CFMetaData metadata)
+    /**
+     * Load the index summary from the Summary.db file if it exists.
+     *
+     * If the loaded index summary has a different index interval from the value currently stored in the schema,
+     * the Summary.db file is deleted and this returns false so that the summary is rebuilt.
+     *
+     * @param ibuilder
+     * @param dbuilder
+     * @return true if the index summary was loaded successfully from the Summary.db file.
+     */
+    public boolean loadSummary(SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder)
     {
-        File summariesFile = new File(reader.descriptor.filenameFor(Component.SUMMARY));
-        if (!reader.descriptor.version.offHeapSummaries || !summariesFile.exists())
+        File summariesFile = new File(descriptor.filenameFor(Component.SUMMARY));
+        if (!summariesFile.exists())
             return false;
 
         DataInputStream iStream = null;
         try
         {
             iStream = new DataInputStream(new FileInputStream(summariesFile));
-            reader.indexSummary = IndexSummary.serializer.deserialize(iStream, reader.partitioner);
-            if (reader.indexSummary.getIndexInterval() != metadata.getIndexInterval())
-            {
-                iStream.close();
-                logger.debug("Cannot read the saved summary for {} because Index Interval changed from {} to {}.",
-                             reader.toString(), reader.indexSummary.getIndexInterval(), metadata.getIndexInterval());
-                FileUtils.deleteWithConfirm(summariesFile);
-                return false;
-            }
-            reader.first = reader.partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
-            reader.last = reader.partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
+            indexSummary = IndexSummary.serializer.deserialize(iStream, partitioner, descriptor.version.hasSamplingLevel, metadata.getMinIndexInterval(), metadata.getMaxIndexInterval());
+            first = partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
+            last = partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
             ibuilder.deserializeBounds(iStream);
             dbuilder.deserializeBounds(iStream);
         }
         catch (IOException e)
         {
-            logger.debug("Cannot deserialize SSTable Summary: ", e);
+            logger.debug("Cannot deserialize SSTable Summary File {}: {}", summariesFile.getPath(), e.getMessage());
             // corrupted; delete it and fall back to creating a new summary
             FileUtils.closeQuietly(iStream);
             FileUtils.deleteWithConfirm(summariesFile);
             return false;
         }
@@ -554,19 +874,30 @@
         return true;
     }
 
-    public static void saveSummary(SSTableReader reader, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder)
+    /**
+     * Save the index summary to the Summary.db file.
+     *
+     * @param ibuilder
+     * @param dbuilder
+     */
+    public void saveSummary(SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder)
     {
-        File summariesFile = new File(reader.descriptor.filenameFor(Component.SUMMARY));
-        if (summariesFile.exists())
-            summariesFile.delete();
+        saveSummary(ibuilder, dbuilder, indexSummary);
+    }
 
-        DataOutputStream oStream = null;
+    private void saveSummary(SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder, IndexSummary summary)
+    {
+        File summariesFile = new File(descriptor.filenameFor(Component.SUMMARY));
+        if (summariesFile.exists())
+            FileUtils.deleteWithConfirm(summariesFile);
+
+        DataOutputStreamAndChannel oStream = null;
         try
         {
-            oStream = new DataOutputStream(new FileOutputStream(summariesFile));
-            IndexSummary.serializer.serialize(reader.indexSummary, oStream);
-            ByteBufferUtil.writeWithLength(reader.first.key, oStream);
-            ByteBufferUtil.writeWithLength(reader.last.key, oStream);
+            oStream = new DataOutputStreamAndChannel(new FileOutputStream(summariesFile));
+            IndexSummary.serializer.serialize(summary, oStream, descriptor.version.hasSamplingLevel);
+            ByteBufferUtil.writeWithLength(first.getKey(), oStream);
+            ByteBufferUtil.writeWithLength(last.getKey(), oStream);
             ibuilder.serializeBounds(oStream);
             dbuilder.serializeBounds(oStream);
         }
@@ -576,7 +907,7 @@
 
             // corrupted hence delete it and let it load it now.
             if (summariesFile.exists())
-                summariesFile.delete();
+                FileUtils.deleteWithConfirm(summariesFile);
         }
         finally
         {
@@ -584,6 +915,170 @@
         }
     }
 
+    public void setReplacedBy(SSTableReader replacement)
+    {
+        synchronized (replaceLock)
+        {
+            assert replacedBy == null;
+            replacedBy = replacement;
+            replacement.replaces = this;
+            replacement.replaceLock = replaceLock;
+        }
+    }
+
+    public SSTableReader cloneWithNewStart(DecoratedKey newStart, final Runnable runOnClose)
+    {
+        synchronized (replaceLock)
+        {
+            assert replacedBy == null;
+
+            if (newStart.compareTo(this.first) > 0)
+            {
+                if (newStart.compareTo(this.last) > 0)
+                {
+                    this.runOnClose = new Runnable()
+                    {
+                        public void run()
+                        {
+                            CLibrary.trySkipCache(dfile.path, 0, 0);
+                            CLibrary.trySkipCache(ifile.path, 0, 0);
+                            runOnClose.run();
+                        }
+                    };
+                }
+                else
+                {
+                    final long dataStart = getPosition(newStart, Operator.GE).position;
+                    final long indexStart = getIndexScanPosition(newStart);
+                    this.runOnClose = new Runnable()
+                    {
+                        public void run()
+                        {
+                            CLibrary.trySkipCache(dfile.path, 0, dataStart);
+                            CLibrary.trySkipCache(ifile.path, 0, indexStart);
+                            runOnClose.run();
+                        }
+                    };
+                }
+            }
+
+            SSTableReader replacement = new SSTableReader(descriptor, components, metadata, partitioner, ifile, dfile, indexSummary.readOnlyClone(), bf, maxDataAge, sstableMetadata,
+                    openReason == OpenReason.EARLY ? openReason : OpenReason.METADATA_CHANGE);
+            replacement.readMeterSyncFuture = this.readMeterSyncFuture;
+            replacement.readMeter = this.readMeter;
+            replacement.first = this.last.compareTo(newStart) > 0 ? newStart : this.last;
+            replacement.last = this.last;
+            setReplacedBy(replacement);
+            return replacement;
+        }
+    }
+
+    /**
+     * Returns a new SSTableReader with the same properties as this SSTableReader except that a new IndexSummary will
+     * be built at the target samplingLevel.  This (original) SSTableReader instance will be marked as replaced, have
+     * its DeletingTask removed, and have its periodic read-meter sync task cancelled.
+     * @param samplingLevel the desired sampling level for the index summary on the new SSTableReader
+     * @return a new SSTableReader
+     * @throws IOException
+     */
+    public SSTableReader cloneWithNewSummarySamplingLevel(ColumnFamilyStore parent, int samplingLevel) throws IOException
+    {
+        synchronized (replaceLock)
+        {
+            assert replacedBy == null;
+
+            int minIndexInterval = metadata.getMinIndexInterval();
+            int maxIndexInterval = metadata.getMaxIndexInterval();
+            double effectiveInterval = indexSummary.getEffectiveIndexInterval();
+
+            IndexSummary newSummary;
+            long oldSize = bytesOnDisk();
+
+            // We have to rebuild the summary from the on-disk primary index in three cases:
+            // 1. The sampling level went up, so we need to read more entries off disk
+            // 2. The min_index_interval changed (in either direction); this changes what entries would be in the summary
+            //    at full sampling (and consequently at any other sampling level)
+            // 3. The max_index_interval was lowered, forcing us to raise the sampling level
+            if (samplingLevel > indexSummary.getSamplingLevel() || indexSummary.getMinIndexInterval() != minIndexInterval || effectiveInterval > maxIndexInterval)
+            {
+                newSummary = buildSummaryAtLevel(samplingLevel);
+            }
+            else if (samplingLevel < indexSummary.getSamplingLevel())
+            {
+                // we can use the existing index summary to make a smaller one
+                newSummary = IndexSummaryBuilder.downsample(indexSummary, samplingLevel, minIndexInterval, partitioner);
+
+                SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
+                SegmentedFile.Builder dbuilder = compression
+                                                 ? SegmentedFile.getCompressedBuilder()
+                                                 : SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
+                saveSummary(ibuilder, dbuilder, newSummary);
+            }
+            else
+            {
+                throw new AssertionError("Attempted to clone SSTableReader with the same index summary sampling level and " +
+                                         "no adjustments to min/max_index_interval");
+            }
+
+            long newSize = bytesOnDisk();
+            StorageMetrics.load.inc(newSize - oldSize);
+            parent.metric.liveDiskSpaceUsed.inc(newSize - oldSize);
+
+            SSTableReader replacement = new SSTableReader(descriptor, components, metadata, partitioner, ifile, dfile, newSummary, bf, maxDataAge, sstableMetadata,
+                    openReason == OpenReason.EARLY ? openReason : OpenReason.METADATA_CHANGE);
+            replacement.readMeterSyncFuture = this.readMeterSyncFuture;
+            replacement.readMeter = this.readMeter;
+            replacement.first = this.first;
+            replacement.last = this.last;
+            setReplacedBy(replacement);
+            return replacement;
+        }
+    }
+
+    private IndexSummary buildSummaryAtLevel(int newSamplingLevel) throws IOException
+    {
+        // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
+        RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
+        try
+        {
+            long indexSize = primaryIndex.length();
+            IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata.getMinIndexInterval(), newSamplingLevel);
+
+            long indexPosition;
+            while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
+            {
+                summaryBuilder.maybeAddEntry(partitioner.decorateKey(ByteBufferUtil.readWithShortLength(primaryIndex)), indexPosition);
+                RowIndexEntry.Serializer.skip(primaryIndex);
+            }
+
+            return summaryBuilder.build(partitioner);
+        }
+        finally
+        {
+            FileUtils.closeQuietly(primaryIndex);
+        }
+    }
+
+    public int getIndexSummarySamplingLevel()
+    {
+        return indexSummary.getSamplingLevel();
+    }
+
+    public long getIndexSummaryOffHeapSize()
+    {
+        return indexSummary.getOffHeapSize();
+    }
+
+    public int getMinIndexInterval()
+    {
+        return indexSummary.getMinIndexInterval();
+    }
+
+    public double getEffectiveIndexInterval()
+    {
+        return indexSummary.getEffectiveIndexInterval();
+    }
+
     public void releaseSummary() throws IOException
     {
         indexSummary.close();
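
The three cases listed in cloneWithNewSummarySamplingLevel above reduce to a single predicate: re-read Index.db when more (or different) summary entries are required, and downsample in memory only when strictly fewer are needed. A hedged restatement as a standalone function (the parameter names are mine, not Cassandra's; the real method additionally treats "nothing changed" as an error rather than a downsample):

    public class SummaryResampleSketch
    {
        /**
         * Returns true if the index summary must be rebuilt by re-reading Index.db,
         * false if the existing summary can simply be downsampled in memory.
         * Mirrors the three cases in the comment above: a higher sampling level,
         * a changed min_index_interval, or an effective interval above max_index_interval.
         */
        static boolean mustRebuild(int newSamplingLevel, int currentSamplingLevel,
                                   int minIndexInterval, int summaryMinIndexInterval,
                                   double effectiveInterval, int maxIndexInterval)
        {
            return newSamplingLevel > currentSamplingLevel
                || summaryMinIndexInterval != minIndexInterval
                || effectiveInterval > maxIndexInterval;
        }

        public static void main(String[] args)
        {
            // Illustrative values: dropping from level 128 to 64 with unchanged intervals
            // only needs an in-memory downsample.
            System.out.println(mustRebuild(64, 128, 128, 128, 128.0, 2048)); // false
            // Raising the level back up requires re-reading the on-disk primary index.
            System.out.println(mustRebuild(128, 64, 128, 128, 256.0, 2048)); // true
        }
    }
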
@@ -596,22 +1091,37 @@
             throw new IllegalStateException(String.format("SSTable first key %s > last key %s", this.first, this.last));
     }
 
-    /** get the position in the index file to start scanning to find the given key (at most indexInterval keys away) */
+    /**
+     * Gets the position in the index file to start scanning to find the given key (at most indexInterval keys away,
+     * modulo downsampling of the index summary).
+     */
     public long getIndexScanPosition(RowPosition key)
     {
-        int index = indexSummary.binarySearch(key);
-        if (index < 0)
+        return getIndexScanPositionFromBinarySearchResult(indexSummary.binarySearch(key), indexSummary);
+    }
+
+    private static long getIndexScanPositionFromBinarySearchResult(int binarySearchResult, IndexSummary referencedIndexSummary)
+    {
+        if (binarySearchResult == -1)
+            return -1;
+        else
+            return referencedIndexSummary.getPosition(getIndexSummaryIndexFromBinarySearchResult(binarySearchResult));
+    }
+
+    private static int getIndexSummaryIndexFromBinarySearchResult(int binarySearchResult)
+    {
+        if (binarySearchResult < 0)
         {
             // binary search gives us the first index _greater_ than the key searched for,
             // i.e., its insertion position
-            int greaterThan = (index + 1) * -1;
+            int greaterThan = (binarySearchResult + 1) * -1;
             if (greaterThan == 0)
                 return -1;
-            return indexSummary.getPosition(greaterThan - 1);
+            return greaterThan - 1;
         }
         else
         {
-            return indexSummary.getPosition(index);
+            return binarySearchResult;
         }
     }
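
The negative-result handling in getIndexSummaryIndexFromBinarySearchResult above is the usual trick for turning an "insertion point" encoding into the index of the greatest entry less than or equal to the key. A worked example against java.util.Arrays.binarySearch, which uses the same -(insertionPoint) - 1 encoding:

    import java.util.Arrays;

    public class BinarySearchSketch
    {
        public static void main(String[] args)
        {
            long[] summaryPositions = { 10, 20, 30, 40 };

            // Searching for 25 is a miss; binarySearch returns -(insertionPoint) - 1 = -3.
            int result = Arrays.binarySearch(summaryPositions, 25);

            // Same conversion as in the hunk above: recover the insertion point,
            // then step back one entry to get the greatest entry <= the key.
            // (If greaterThan were 0 the key would sort before every entry, and the
            // method above returns -1 instead.)
            int greaterThan = (result + 1) * -1;      // 2, index of the first entry > 25
            int precedingIndex = greaterThan - 1;     // 1, i.e. the entry holding 20

            System.out.println(result);          // -3
            System.out.println(precedingIndex);  // 1
        }
    }
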
 
@@ -626,7 +1136,9 @@
 
         CompressionMetadata cmd = ((ICompressedFile) dfile).getMetadata();
 
-        cmd.parameters.setLiveMetadata(Schema.instance.getCFMetaData(descriptor));
+        // We need the parent cf metadata
+        String cfName = metadata.isSecondaryIndex() ? metadata.getParentColumnFamilyName() : metadata.cfName;
+        cmd.parameters.setLiveMetadata(Schema.instance.getCFMetaData(metadata.ksName, cfName));
 
         return cmd;
     }
@@ -650,11 +1162,11 @@
     }
 
     /**
-     * @return An estimate of the number of keys in this SSTable.
+     * @return An estimate of the number of keys in this SSTable based on the index summary.
      */
     public long estimatedKeys()
     {
-        return ((long) indexSummary.size()) * indexSummary.getIndexInterval();
+        return indexSummary.getEstimatedKeyCount();
     }
 
     /**
@@ -667,26 +1179,41 @@
         List<Pair<Integer, Integer>> sampleIndexes = getSampleIndexesForRanges(indexSummary, ranges);
         for (Pair<Integer, Integer> sampleIndexRange : sampleIndexes)
             sampleKeyCount += (sampleIndexRange.right - sampleIndexRange.left + 1);
-        return Math.max(1, sampleKeyCount * indexSummary.getIndexInterval());
+
+        // adjust for the current sampling level: (BSL / SL) * index_interval_at_full_sampling
+        long estimatedKeys = sampleKeyCount * (Downsampling.BASE_SAMPLING_LEVEL * indexSummary.getMinIndexInterval()) / indexSummary.getSamplingLevel();
+        return Math.max(1, estimatedKeys);
     }
 
     /**
-     * @return Approximately 1/INDEX_INTERVALth of the keys in this SSTable.
+     * Returns the number of entries in the IndexSummary.  At full sampling, this is approximately 1/INDEX_INTERVALth of
+     * the keys in this SSTable.
      */
-    public int getKeySampleSize()
+    public int getIndexSummarySize()
     {
         return indexSummary.size();
     }
 
-    public byte[] getKeySample(int position)
+    /**
+     * Returns the approximate number of entries the IndexSummary would contain if it were at full sampling.
+     */
+    public int getMaxIndexSummarySize()
     {
-        return indexSummary.getKey(position);
+        return indexSummary.getMaxNumberOfEntries();
+    }
+
+    /**
+     * Returns the key for the index summary entry at `index`.
+     */
+    public byte[] getIndexSummaryKey(int index)
+    {
+        return indexSummary.getKey(index);
     }
 
     private static List<Pair<Integer,Integer>> getSampleIndexesForRanges(IndexSummary summary, Collection<Range<Token>> ranges)
     {
         // use the index to determine a minimal section for each range
-        List<Pair<Integer,Integer>> positions = new ArrayList<Pair<Integer,Integer>>();
+        List<Pair<Integer,Integer>> positions = new ArrayList<>();
 
         for (Range<Token> range : Range.normalize(ranges))
         {
@@ -720,7 +1247,7 @@
             if (left > right)
                 // empty range
                 continue;
-            positions.add(Pair.create(Integer.valueOf(left), Integer.valueOf(right)));
+            positions.add(Pair.create(left, right));
         }
         return positions;
     }
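
To make the sampling-level adjustment in estimatedKeysForRanges above concrete: at full sampling (Downsampling.BASE_SAMPLING_LEVEL) each retained summary entry covers roughly min_index_interval keys, so at a lower sampling level each entry covers proportionally more. A small arithmetic sketch with illustrative values:

    public class SamplingEstimateSketch
    {
        public static void main(String[] args)
        {
            int baseSamplingLevel = 128;   // Downsampling.BASE_SAMPLING_LEVEL
            int minIndexInterval = 128;    // one summary entry per 128 keys at full sampling
            int samplingLevel = 64;        // summary currently downsampled to half density
            long sampleKeyCount = 10;      // summary entries falling inside the requested ranges

            // (BSL / SL) * min_index_interval keys are covered by each retained entry:
            // here 2 * 128 = 256, so 10 entries estimate roughly 2560 keys.
            long estimatedKeys = sampleKeyCount * (baseSamplingLevel * minIndexInterval) / samplingLevel;

            System.out.println(estimatedKeys); // 2560
        }
    }
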
@@ -780,7 +1307,7 @@
     public List<Pair<Long,Long>> getPositionsForRanges(Collection<Range<Token>> ranges)
     {
         // use the index to determine a minimal section for each range
-        List<Pair<Long,Long>> positions = new ArrayList<Pair<Long,Long>>();
+        List<Pair<Long,Long>> positions = new ArrayList<>();
         for (Range<Token> range : Range.normalize(ranges))
         {
             AbstractBounds<RowPosition> keyRange = range.toRowBounds();
@@ -797,54 +1324,36 @@
             if (left == right)
                 // empty range
                 continue;
-            positions.add(Pair.create(Long.valueOf(left), Long.valueOf(right)));
+            positions.add(Pair.create(left, right));
         }
         return positions;
     }
 
+    public void invalidateCacheKey(DecoratedKey key)
+    {
+        KeyCacheKey cacheKey = new KeyCacheKey(metadata.cfId, descriptor, key.getKey());
+        keyCache.remove(cacheKey);
+    }
+
     public void cacheKey(DecoratedKey key, RowIndexEntry info)
     {
-        CFMetaData.Caching caching = metadata.getCaching();
+        CachingOptions caching = metadata.getCaching();
 
-        if (caching == CFMetaData.Caching.NONE
-            || caching == CFMetaData.Caching.ROWS_ONLY
+        if (!caching.keyCache.isEnabled()
             || keyCache == null
             || keyCache.getCapacity() == 0)
         {
             return;
         }
 
-        KeyCacheKey cacheKey = new KeyCacheKey(descriptor, key.key);
+        KeyCacheKey cacheKey = new KeyCacheKey(metadata.cfId, descriptor, key.getKey());
         logger.trace("Adding cache entry for {} -> {}", cacheKey, info);
         keyCache.put(cacheKey, info);
     }
 
-    public void preheat(Map<DecoratedKey, RowIndexEntry> cachedKeys) throws IOException
-    {
-        RandomAccessFile f = new RandomAccessFile(getFilename(), "r");
-
-        try
-        {
-            int fd = CLibrary.getfd(f.getFD());
-
-            for (Map.Entry<DecoratedKey, RowIndexEntry> entry : cachedKeys.entrySet())
-            {
-                cacheKey(entry.getKey(), entry.getValue());
-
-                // add to the cache but don't do actual preheating if we have it disabled in the config
-                if (DatabaseDescriptor.shouldPreheatPageCache() && fd > 0)
-                    CLibrary.preheatPage(fd, entry.getValue().position);
-            }
-        }
-        finally
-        {
-            FileUtils.closeQuietly(f);
-        }
-    }
-
     public RowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
     {
-        return getCachedPosition(new KeyCacheKey(descriptor, key.key), updateStats);
+        return getCachedPosition(new KeyCacheKey(metadata.cfId, descriptor, key.getKey()), updateStats);
     }
 
     private RowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
@@ -888,7 +1397,7 @@
         if (op == Operator.EQ)
         {
             assert key instanceof DecoratedKey; // EQ only make sense if the key is a valid row key
-            if (!bf.isPresent(((DecoratedKey)key).key))
+            if (!bf.isPresent(((DecoratedKey)key).getKey()))
             {
                 Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
                 return null;
@@ -899,7 +1408,7 @@
         if ((op == Operator.EQ || op == Operator.GE) && (key instanceof DecoratedKey))
         {
             DecoratedKey decoratedKey = (DecoratedKey)key;
-            KeyCacheKey cacheKey = new KeyCacheKey(descriptor, decoratedKey.key);
+            KeyCacheKey cacheKey = new KeyCacheKey(metadata.cfId, descriptor, decoratedKey.getKey());
             RowIndexEntry cachedPosition = getCachedPosition(cacheKey, updateCacheAndStats);
             if (cachedPosition != null)
             {
@@ -908,25 +1417,28 @@
             }
         }
 
-        // next, see if the sampled index says it's impossible for the key to be present
-        long sampledPosition = getIndexScanPosition(key);
-        if (sampledPosition == -1)
+        // check the smallest and greatest keys in the sstable to see if the key can't be present
+        if (first.compareTo(key) > 0 || last.compareTo(key) < 0)
         {
             if (op == Operator.EQ && updateCacheAndStats)
                 bloomFilterTracker.addFalsePositive();
-            // we matched the -1th position: if the operator might match forward, we'll start at the first
-            // position. We however need to return the correct index entry for that first position.
-            if (op.apply(1) >= 0)
+
+            if (op.apply(1) < 0)
             {
-                sampledPosition = 0;
-            }
-            else
-            {
-                Tracing.trace("Partition summary allows skipping sstable {}", descriptor.generation);
+                Tracing.trace("Check against min and max keys allows skipping sstable {}", descriptor.generation);
                 return null;
             }
         }
 
+        int binarySearchResult = indexSummary.binarySearch(key);
+        long sampledPosition = getIndexScanPositionFromBinarySearchResult(binarySearchResult, indexSummary);
+        int sampledIndex = getIndexSummaryIndexFromBinarySearchResult(binarySearchResult);
+
+        // if we matched the -1th position, we'll start at the first position
+        sampledPosition = sampledPosition == -1 ? 0 : sampledPosition;
+
+        int effectiveInterval = indexSummary.getEffectiveIndexIntervalAfterIndex(sampledIndex);
+
         // scan the on-disk index, starting at the nearest sampled position.
         // The check against IndexInterval is to exit the loop in the EQ case when the key looked for is not present
         // (bloom filter false positive). But note that for non-EQ cases, we might need to check the first key of the
@@ -935,12 +1447,12 @@
         // of the next interval).
         int i = 0;
         Iterator<FileDataInput> segments = ifile.iterator(sampledPosition);
-        while (segments.hasNext() && i <= indexSummary.getIndexInterval())
+        while (segments.hasNext() && i <= effectiveInterval)
         {
             FileDataInput in = segments.next();
             try
             {
-                while (!in.isEOF() && i <= indexSummary.getIndexInterval())
+                while (!in.isEOF() && i <= effectiveInterval)
                 {
                     i++;
 
@@ -952,7 +1464,7 @@
                     // Compare raw keys if possible for performance, otherwise compare decorated keys.
                     if (op == Operator.EQ)
                     {
-                        opSatisfied = exactMatch = indexKey.equals(((DecoratedKey) key).key);
+                        opSatisfied = exactMatch = indexKey.equals(((DecoratedKey) key).getKey());
                     }
                     else
                     {
@@ -971,7 +1483,7 @@
                     if (opSatisfied)
                     {
                         // read data position from index entry
-                        RowIndexEntry indexEntry = RowIndexEntry.serializer.deserialize(in, descriptor.version);
+                        RowIndexEntry indexEntry = metadata.comparator.rowIndexEntrySerializer().deserialize(in, descriptor.version);
                         if (exactMatch && updateCacheAndStats)
                         {
                             assert key instanceof DecoratedKey; // key can be == to the index key only if it's a true row key
@@ -996,7 +1508,7 @@
                         return indexEntry;
                     }
 
-                    RowIndexEntry.serializer.skip(in);
+                    RowIndexEntry.Serializer.skip(in);
                 }
             }
             catch (IOException e)
@@ -1038,7 +1550,7 @@
                     if (indexDecoratedKey.compareTo(token) > 0)
                         return indexDecoratedKey;
 
-                    RowIndexEntry.serializer.skip(in);
+                    RowIndexEntry.Serializer.skip(in);
                 }
             }
             catch (IOException e)
@@ -1094,20 +1606,8 @@
      */
     public void releaseReference()
     {
-        if (references.decrementAndGet() == 0 && isCompacted.get())
-        {
-            /**
-             * Make OS a favour and suggest (using fadvice call) that we
-             * don't want to see pages of this SSTable in memory anymore.
-             *
-             * NOTE: We can't use madvice in java because it requires address of
-             * the mapping, so instead we always open a file and run fadvice(fd, 0, 0) on it
-             */
-            dropPageCache();
-
-            FileUtils.closeQuietly(this);
-            deletingTask.schedule();
-        }
+        if (references.decrementAndGet() == 0)
+            tidy(true);
         assert references.get() >= 0 : "Reference counter " +  references.get() + " for " + dfile.path;
     }
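
releaseReference above defers the expensive cleanup until the shared counter reaches zero. A minimal sketch of that reference-counting pattern with an AtomicInteger (the acquire side here is my own illustration, and the println stands in for tidy(true)):

    import java.util.concurrent.atomic.AtomicInteger;

    public class RefCountSketch
    {
        private final AtomicInteger references = new AtomicInteger(1); // 1 for the "owner"

        boolean acquireReference()
        {
            // Only succeed if the resource has not already been fully released.
            while (true)
            {
                int n = references.get();
                if (n <= 0)
                    return false;
                if (references.compareAndSet(n, n + 1))
                    return true;
            }
        }

        void releaseReference()
        {
            if (references.decrementAndGet() == 0)
                System.out.println("last reference released, running cleanup"); // stands in for tidy(true)
            assert references.get() >= 0;
        }

        public static void main(String[] args)
        {
            RefCountSketch r = new RefCountSketch();
            r.acquireReference();  // a reader takes a reference
            r.releaseReference();  // reader done, one reference (the owner's) remains
            r.releaseReference();  // owner releases: count hits zero and cleanup runs
        }
    }
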
 
@@ -1123,8 +1623,12 @@
     public boolean markObsolete()
     {
         if (logger.isDebugEnabled())
-            logger.debug("Marking " + getFilename() + " compacted");
+            logger.debug("Marking {} compacted", getFilename());
 
+        synchronized (replaceLock)
+        {
+            assert replacedBy == null;
+        }
         return !isCompacted.getAndSet(true);
     }
 
@@ -1136,7 +1640,7 @@
     public void markSuspect()
     {
         if (logger.isDebugEnabled())
-            logger.debug("Marking " + getFilename() + " as a suspect for blacklisting.");
+            logger.debug("Marking {} as a suspect for blacklisting.", getFilename());
 
         isSuspect.getAndSet(true);
     }
@@ -1225,6 +1729,25 @@
         }
     }
 
+    public boolean isRepaired()
+    {
+        return sstableMetadata.repairedAt != ActiveRepairService.UNREPAIRED_SSTABLE;
+    }
+
+    public SSTableReader getCurrentReplacement()
+    {
+        synchronized (replaceLock)
+        {
+            SSTableReader cur = this, next = replacedBy;
+            while (next != null)
+            {
+                cur = next;
+                next = next.replacedBy;
+            }
+            return cur;
+        }
+    }
+
     /**
      * TODO: Move someplace reusable
      */
@@ -1325,7 +1848,8 @@
     {
         try
         {
-            return SSTableMetadata.serializer.deserialize(descriptor).right;
+            CompactionMetadata compactionMetadata = (CompactionMetadata) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.COMPACTION);
+            return compactionMetadata.ancestors;
         }
         catch (IOException e)
         {
@@ -1350,10 +1874,10 @@
      */
     public void reloadSSTableMetadata() throws IOException
     {
-        this.sstableMetadata = SSTableMetadata.serializer.deserialize(descriptor).left;
+        this.sstableMetadata = (StatsMetadata) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.STATS);
     }
 
-    public SSTableMetadata getSSTableMetadata()
+    public StatsMetadata getSSTableMetadata()
     {
         return sstableMetadata;
     }
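
A recurring change throughout the SSTableReader diff above is that the old monolithic SSTableMetadata is replaced by typed components (VALIDATION, STATS, COMPACTION) that callers request individually from the metadata serializer. The sketch below shows that lookup style with a plain EnumMap; the types and the deserialize signature are simplified stand-ins, not the real org.apache.cassandra.io.sstable.metadata API:

    import java.util.EnumMap;
    import java.util.EnumSet;
    import java.util.Map;

    public class MetadataComponentsSketch
    {
        enum MetadataType { VALIDATION, STATS, COMPACTION }

        interface MetadataComponent {}
        static class ValidationMetadata implements MetadataComponent { String partitioner = "Murmur3Partitioner"; }
        static class StatsMetadata implements MetadataComponent { long repairedAt = 0; }

        // Stand-in for descriptor.getMetadataSerializer().deserialize(descriptor, EnumSet.of(...)):
        // only the requested component types are materialised.
        static Map<MetadataType, MetadataComponent> deserialize(EnumSet<MetadataType> types)
        {
            Map<MetadataType, MetadataComponent> components = new EnumMap<>(MetadataType.class);
            if (types.contains(MetadataType.VALIDATION))
                components.put(MetadataType.VALIDATION, new ValidationMetadata());
            if (types.contains(MetadataType.STATS))
                components.put(MetadataType.STATS, new StatsMetadata());
            return components;
        }

        public static void main(String[] args)
        {
            Map<MetadataType, MetadataComponent> m = deserialize(EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS));
            ValidationMetadata validation = (ValidationMetadata) m.get(MetadataType.VALIDATION);
            StatsMetadata stats = (StatsMetadata) m.get(MetadataType.STATS);
            System.out.println(validation.partitioner + " / repairedAt=" + stats.repairedAt);
        }
    }
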
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
new file mode 100644
index 0000000..4055b42
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
@@ -0,0 +1,346 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.base.Function;
+import com.google.common.base.Functions;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataTracker;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Wraps one or more writers as output for rewriting one or more readers: every sstable_preemptive_open_interval_in_mb
+ * written, we look in the summary we are collecting for the latest writer for the penultimate key that we know to have
+ * been fully flushed to the index file, and double-check that the key is fully present in the flushed data file.
+ * We then move the starts of each reader forward to that point, replace the readers in the DataTracker, and attach a
+ * runnable for on-close (i.e. when all references expire) that drops the page cache prior to that key position.
+ *
+ * Hard-links are created for each partially written sstable so that readers opened against them continue to work past
+ * the rename of the temporary file, which is deleted once all readers against the hard-link have been closed.
+ * If for any reason the writer is rolled over, we immediately rename and fully expose the completed file in the DataTracker.
+ *
+ * On abort we restore the original lower bounds to the existing readers and delete any temporary files we had in
+ * progress, but we leave any hard-links in place for the readers we opened to clean up when they are finished, just as
+ * we would had we finished successfully. (A usage sketch follows this file's diff.)
+ */
+public class SSTableRewriter
+{
+
+    private static final long preemptiveOpenInterval;
+    static
+    {
+        long interval = DatabaseDescriptor.getSSTablePreempiveOpenIntervalInMB() * (1L << 20);
+        if (interval < 0)
+            interval = Long.MAX_VALUE;
+        preemptiveOpenInterval = interval;
+    }
+
+    private final DataTracker dataTracker;
+    private final ColumnFamilyStore cfs;
+
+    private final long maxAge;
+    private final Set<SSTableReader> rewriting; // the readers we are rewriting (updated as they are replaced)
+    private final Map<Descriptor, DecoratedKey> originalStarts = new HashMap<>(); // the start key for each reader we are rewriting
+    private final Map<Descriptor, Integer> fileDescriptors = new HashMap<>(); // the file descriptors for each reader descriptor we are rewriting
+
+    private SSTableReader currentlyOpenedEarly; // the reader for the most recent (re)opening of the target file
+    private long currentlyOpenedEarlyAt; // the position (in bytes) in the target file at which we last (re)opened
+
+    private final List<SSTableReader> finished = new ArrayList<>(); // the resultant sstables
+    private final OperationType rewriteType; // the type of rewrite/compaction being performed
+    private final boolean isOffline; // true for operations that are performed without Cassandra running (prevents updates of DataTracker)
+
+    private SSTableWriter writer;
+    private Map<DecoratedKey, RowIndexEntry> cachedKeys = new HashMap<>();
+
+    public SSTableRewriter(ColumnFamilyStore cfs, Set<SSTableReader> rewriting, long maxAge, OperationType rewriteType, boolean isOffline)
+    {
+        this.rewriting = rewriting;
+        for (SSTableReader sstable : rewriting)
+        {
+            originalStarts.put(sstable.descriptor, sstable.first);
+            fileDescriptors.put(sstable.descriptor, CLibrary.getfd(sstable.getFilename()));
+        }
+        this.dataTracker = cfs.getDataTracker();
+        this.cfs = cfs;
+        this.maxAge = maxAge;
+        this.rewriteType = rewriteType;
+        this.isOffline = isOffline;
+    }
+
+    public SSTableWriter currentWriter()
+    {
+        return writer;
+    }
+
+    public RowIndexEntry append(AbstractCompactedRow row)
+    {
+        // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
+        maybeReopenEarly(row.key);
+        RowIndexEntry index = writer.append(row);
+        if (!isOffline)
+        {
+            if (index == null)
+            {
+                cfs.invalidateCachedRow(row.key);
+            }
+            else
+            {
+                boolean save = false;
+                for (SSTableReader reader : rewriting)
+                {
+                    if (reader.getCachedPosition(row.key, false) != null)
+                    {
+                        save = true;
+                        break;
+                    }
+                }
+                if (save)
+                    cachedKeys.put(row.key, index);
+            }
+        }
+        return index;
+    }
+
+    // attempts to append the row; if the append fails, resets the writer position
+    public RowIndexEntry tryAppend(AbstractCompactedRow row)
+    {
+        mark();
+        try
+        {
+            return append(row);
+        }
+        catch (Throwable t)
+        {
+            resetAndTruncate();
+            throw t;
+        }
+    }
+
+    private void mark()
+    {
+        writer.mark();
+    }
+
+    private void resetAndTruncate()
+    {
+        writer.resetAndTruncate();
+    }
+
+    private void maybeReopenEarly(DecoratedKey key)
+    {
+        if (FBUtilities.isUnix() && writer.getFilePointer() - currentlyOpenedEarlyAt > preemptiveOpenInterval)
+        {
+            if (isOffline)
+            {
+                for (SSTableReader reader : rewriting)
+                {
+                    RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE);
+                    CLibrary.trySkipCache(fileDescriptors.get(reader.descriptor), 0, index == null ? 0 : index.position);
+                }
+            }
+            else
+            {
+                SSTableReader reader = writer.openEarly(maxAge);
+                if (reader != null)
+                {
+                    replaceReader(currentlyOpenedEarly, reader);
+                    currentlyOpenedEarly = reader;
+                    currentlyOpenedEarlyAt = writer.getFilePointer();
+                    moveStarts(reader, Functions.constant(reader.last), false);
+                }
+            }
+        }
+    }
+
+    public void abort()
+    {
+        if (writer == null)
+            return;
+        moveStarts(null, Functions.forMap(originalStarts), true);
+        List<SSTableReader> close = new ArrayList<>(finished);
+        if (currentlyOpenedEarly != null)
+            close.add(currentlyOpenedEarly);
+        // also remove already completed SSTables
+        for (SSTableReader sstable : close)
+            sstable.markObsolete();
+        // releases reference in replaceReaders
+        if (!isOffline)
+        {
+            dataTracker.replaceReaders(close, Collections.<SSTableReader>emptyList());
+            dataTracker.unmarkCompacting(close);
+        }
+        writer.abort(currentlyOpenedEarly == null);
+    }
+
+    /**
+     * Replace the readers we are rewriting with cloneWithNewStart, reclaiming any page cache that is no longer
+     * needed, and transferring any key cache entries over to the new reader, expiring them from the old. If reset
+     * is true, we instead restore the starts the readers had before the rewriting began.
+     *
+     * @param newReader the rewritten reader that replaces them for this region
+     * @param newStarts a function mapping a reader's descriptor to its new start value
+     * @param reset true iff we are restoring earlier starts (increasing the range over which they are valid)
+     */
+    private void moveStarts(SSTableReader newReader, Function<? super Descriptor, DecoratedKey> newStarts, boolean reset)
+    {
+        if (isOffline)
+            return;
+        List<SSTableReader> toReplace = new ArrayList<>();
+        List<SSTableReader> replaceWith = new ArrayList<>();
+        final List<DecoratedKey> invalidateKeys = new ArrayList<>();
+        if (!reset)
+        {
+            invalidateKeys.addAll(cachedKeys.keySet());
+            for (Map.Entry<DecoratedKey, RowIndexEntry> cacheKey : cachedKeys.entrySet())
+                newReader.cacheKey(cacheKey.getKey(), cacheKey.getValue());
+        }
+        cachedKeys = new HashMap<>();
+        for (final SSTableReader sstable : rewriting)
+        {
+            DecoratedKey newStart = newStarts.apply(sstable.descriptor);
+            assert newStart != null;
+            if (sstable.first.compareTo(newStart) < 0 || (reset && newStart != sstable.first))
+            {
+                toReplace.add(sstable);
+                // we call getCurrentReplacement() to support multiple rewriters operating over the same source readers at once.
+                // note: only one such writer should be written to at any moment
+                replaceWith.add(sstable.getCurrentReplacement().cloneWithNewStart(newStart, new Runnable()
+                {
+                    public void run()
+                    {
+                        // this is somewhat racy, in that we could theoretically be closing this old reader
+                        // when an even older reader is still in use, but it's not likely to have any major impact
+                        for (DecoratedKey key : invalidateKeys)
+                            sstable.invalidateCacheKey(key);
+                    }
+                }));
+            }
+        }
+        replaceReaders(toReplace, replaceWith);
+        rewriting.removeAll(toReplace);
+        rewriting.addAll(replaceWith);
+    }
+
+    private void replaceReader(SSTableReader toReplace, SSTableReader replaceWith)
+    {
+        if (isOffline)
+            return;
+        Set<SSTableReader> toReplaceSet;
+        if (toReplace != null)
+        {
+            toReplace.setReplacedBy(replaceWith);
+            toReplaceSet = Collections.singleton(toReplace);
+        }
+        else
+        {
+            dataTracker.markCompacting(Collections.singleton(replaceWith));
+            toReplaceSet = Collections.emptySet();
+        }
+        replaceReaders(toReplaceSet, Collections.singleton(replaceWith));
+    }
+
+    private void replaceReaders(Collection<SSTableReader> toReplace, Collection<SSTableReader> replaceWith)
+    {
+        if (isOffline)
+            return;
+        dataTracker.replaceReaders(toReplace, replaceWith);
+    }
+
+    public void switchWriter(SSTableWriter newWriter)
+    {
+        if (writer == null)
+        {
+            writer = newWriter;
+            return;
+        }
+        // the writer is closed and reopened as a final (non-tmp) sstable, because we later want to query it via the descriptor from its SSTableReader
+        SSTableReader reader = writer.closeAndOpenReader(maxAge);
+        finished.add(reader);
+        replaceReader(currentlyOpenedEarly, reader);
+        moveStarts(reader, Functions.constant(reader.last), false);
+        currentlyOpenedEarly = null;
+        currentlyOpenedEarlyAt = 0;
+        writer = newWriter;
+    }
+
+    public void finish()
+    {
+        finish(-1);
+    }
+
+    public void finish(long repairedAt)
+    {
+        finish(true, repairedAt);
+    }
+
+    public void finish(boolean cleanupOldReaders)
+    {
+        finish(cleanupOldReaders, -1);
+    }
+
+    public void finish(boolean cleanupOldReaders, long repairedAt)
+    {
+        if (writer.getFilePointer() > 0)
+        {
+            SSTableReader reader = repairedAt < 0 ?
+                                    writer.closeAndOpenReader(maxAge) :
+                                    writer.closeAndOpenReader(maxAge, repairedAt);
+            finished.add(reader);
+            replaceReader(currentlyOpenedEarly, reader);
+            moveStarts(reader, Functions.constant(reader.last), false);
+        }
+        else
+        {
+            writer.abort();
+            writer = null;
+        }
+
+        if (!isOffline)
+        {
+            dataTracker.unmarkCompacting(finished);
+            if (cleanupOldReaders)
+                dataTracker.markCompactedSSTablesReplaced(rewriting, finished, rewriteType);
+        }
+        else if (cleanupOldReaders)
+        {
+            for (SSTableReader reader : rewriting)
+            {
+                reader.markObsolete();
+                reader.releaseReference();
+            }
+        }
+    }
+
+    public List<SSTableReader> finished()
+    {
+        return finished;
+    }
+}
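For context, a minimal usage sketch (not part of this patch) of how a compaction-style task might drive the new SSTableRewriter; the names cfs, sstables, maxAge, rows and createWriter() are assumed to be supplied by the caller:

    SSTableRewriter rewriter = new SSTableRewriter(cfs, sstables, maxAge, OperationType.COMPACTION, false);
    rewriter.switchWriter(createWriter());      // hypothetical helper returning a fresh SSTableWriter
    try
    {
        while (rows.hasNext())
            rewriter.append(rows.next());       // AbstractCompactedRow instances; may reopen the target early
        rewriter.finish();                      // swaps the rewritten readers into the DataTracker
        List<SSTableReader> result = rewriter.finished();
    }
    catch (Throwable t)
    {
        rewriter.abort();                       // restores the original reader bounds and removes tmp files
        throw t;
    }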
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableScanner.java b/src/java/org/apache/cassandra/io/sstable/SSTableScanner.java
index 7a87879..62ac175 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableScanner.java
@@ -145,7 +145,7 @@
                 }
                 else
                 {
-                    RowIndexEntry.serializer.skip(ifile);
+                    RowIndexEntry.Serializer.skip(ifile);
                 }
             }
         }
@@ -226,7 +226,7 @@
                             return endOfData();
 
                         currentKey = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
-                        currentEntry = RowIndexEntry.serializer.deserialize(ifile, sstable.descriptor.version);
+                        currentEntry = sstable.metadata.comparator.rowIndexEntrySerializer().deserialize(ifile, sstable.descriptor.version);
                     } while (!currentRange.contains(currentKey));
                 }
                 else
@@ -247,7 +247,7 @@
                 {
                     // we need the position of the start of the next key, regardless of whether it falls in the current range
                     nextKey = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
-                    nextEntry = RowIndexEntry.serializer.deserialize(ifile, sstable.descriptor.version);
+                    nextEntry = sstable.metadata.comparator.rowIndexEntrySerializer().deserialize(ifile, sstable.descriptor.version);
                     readEnd = nextEntry.position;
 
                     if (!currentRange.contains(nextKey))
@@ -257,12 +257,10 @@
                     }
                 }
 
-                if (dataRange == null || dataRange.selectsFullRowFor(currentKey.key))
+                if (dataRange == null || dataRange.selectsFullRowFor(currentKey.getKey()))
                 {
                     dfile.seek(currentEntry.position);
                     ByteBufferUtil.readWithShortLength(dfile); // key
-                    if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-                        dfile.readLong();
                     long dataSize = readEnd - dfile.getFilePointer();
                     return new SSTableIdentityIterator(sstable, dfile, currentKey, dataSize);
                 }
@@ -271,7 +269,7 @@
                 {
                     public OnDiskAtomIterator create()
                     {
-                        return dataRange.columnFilter(currentKey.key).getSSTableColumnIterator(sstable, dfile, currentKey, currentEntry);
+                        return dataRange.columnFilter(currentKey.getKey()).getSSTableColumnIterator(sstable, dfile, currentKey, currentEntry);
                     }
                 });
 
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java
index 39ec71d..3cfdc7b 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java
@@ -27,16 +27,16 @@
 import com.google.common.base.Throwables;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
+import org.apache.cassandra.db.Cell;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyType;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.compress.CompressionParameters;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * A SSTable writer that doesn't assume rows are in sorted order.
@@ -79,7 +79,7 @@
                                        int bufferSizeInMB,
                                        CompressionParameters compressParameters)
     {
-        this(directory, new CFMetaData(keyspace, columnFamily, subComparator == null ? ColumnFamilyType.Standard : ColumnFamilyType.Super, comparator, subComparator).compressionParameters(compressParameters), partitioner, bufferSizeInMB);
+        this(directory, CFMetaData.denseCFMetaData(keyspace, columnFamily, comparator, subComparator).compressionParameters(compressParameters), partitioner, bufferSizeInMB);
     }
 
     public SSTableSimpleUnsortedWriter(File directory,
@@ -106,15 +106,15 @@
     }
 
     @Override
-    protected void addColumn(Column column) throws IOException
+    protected void addColumn(Cell cell) throws IOException
     {
-        super.addColumn(column);
-        countColumn(column);
+        super.addColumn(cell);
+        countColumn(cell);
     }
 
-    protected void countColumn(Column column) throws IOException
+    protected void countColumn(Cell cell) throws IOException
     {
-        currentSize += column.serializedSize(TypeSizes.NATIVE);
+        currentSize += cell.serializedSize(metadata.comparator, TypeSizes.NATIVE);
 
         // We don't want to sync in writeRow() only as this might blow up the bufferSize for wide rows.
         if (currentSize > bufferSize)
@@ -134,14 +134,14 @@
             // on disk is:
             //   - the row key: 2 bytes size + key size bytes
             //   - the row level deletion infos: 4 + 8 bytes
-            currentSize += 14 + currentKey.key.remaining();
+            currentSize += 14 + currentKey.getKey().remaining();
         }
         return previous;
     }
 
     protected ColumnFamily createColumnFamily() throws IOException
     {
-        return TreeMapBackedSortedColumns.factory.create(metadata);
+        return ArrayBackedSortedColumns.factory.create(metadata);
     }
 
     public void close() throws IOException
@@ -218,6 +218,7 @@
             }
             catch (Throwable e)
             {
+                JVMStabilityInspector.inspectThrowable(e);
                 if (writer != null)
                     writer.abort();
                 exception = e;
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java
index 9b584f0..87c8e33 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java
@@ -56,8 +56,7 @@
                                AbstractType<?> comparator,
                                AbstractType<?> subComparator)
     {
-        this(directory,
-             new CFMetaData(keyspace, columnFamily, subComparator == null ? ColumnFamilyType.Standard : ColumnFamilyType.Super, comparator, subComparator), partitioner);
+        this(directory, CFMetaData.denseCFMetaData(keyspace, columnFamily, comparator, subComparator), partitioner);
     }
 
     public SSTableSimpleWriter(File directory, CFMetaData metadata, IPartitioner partitioner)
@@ -88,6 +87,6 @@
 
     protected ColumnFamily getColumnFamily()
     {
-        return TreeMapBackedSortedColumns.factory.create(metadata);
+        return ArrayBackedSortedColumns.factory.create(metadata);
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
index 4619ddc..e92803a 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
@@ -17,23 +17,51 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.io.*;
+import java.io.Closeable;
+import java.io.DataInput;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 import com.google.common.collect.Sets;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnIndex;
+import org.apache.cassandra.db.ColumnSerializer;
+import org.apache.cassandra.db.CounterCell;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.compaction.AbstractCompactedRow;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
+import org.apache.cassandra.io.util.FileMark;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.SegmentedFile;
+import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FilterFactory;
@@ -53,15 +81,17 @@
     private final SequentialWriter dataFile;
     private DecoratedKey lastWrittenKey;
     private FileMark dataMark;
-    private final SSTableMetadata.Collector sstableMetadataCollector;
+    private final MetadataCollector sstableMetadataCollector;
+    private final long repairedAt;
 
-    public SSTableWriter(String filename, long keyCount)
+    public SSTableWriter(String filename, long keyCount, long repairedAt)
     {
         this(filename,
              keyCount,
+             repairedAt,
              Schema.instance.getCFMetaData(Descriptor.fromFilename(filename)),
              StorageService.getPartitioner(),
-             SSTableMetadata.createCollector(Schema.instance.getCFMetaData(Descriptor.fromFilename(filename)).comparator));
+             new MetadataCollector(Schema.instance.getCFMetaData(Descriptor.fromFilename(filename)).comparator));
     }
 
     private static Set<Component> components(CFMetaData metadata)
@@ -70,7 +100,8 @@
                                                                          Component.PRIMARY_INDEX,
                                                                          Component.STATS,
                                                                          Component.SUMMARY,
-                                                                         Component.TOC));
+                                                                         Component.TOC,
+                                                                         Component.DIGEST));
 
         if (metadata.getBloomFilterFpChance() < 1.0)
             components.add(Component.FILTER);
@@ -83,7 +114,6 @@
         {
             // it would feel safer to actually add this component later in maybeWriteDigest(),
             // but the components are unmodifiable after construction
-            components.add(Component.DIGEST);
             components.add(Component.CRC);
         }
         return components;
@@ -91,30 +121,30 @@
 
     public SSTableWriter(String filename,
                          long keyCount,
+                         long repairedAt,
                          CFMetaData metadata,
                          IPartitioner<?> partitioner,
-                         SSTableMetadata.Collector sstableMetadataCollector)
+                         MetadataCollector sstableMetadataCollector)
     {
         super(Descriptor.fromFilename(filename),
               components(metadata),
               metadata,
               partitioner);
+        this.repairedAt = repairedAt;
         iwriter = new IndexWriter(keyCount);
 
         if (compression)
         {
-            dbuilder = SegmentedFile.getCompressedBuilder();
-            dataFile = CompressedSequentialWriter.open(getFilename(),
-                                                       descriptor.filenameFor(Component.COMPRESSION_INFO),
-                                                       !metadata.populateIoCacheOnFlush(),
-                                                       metadata.compressionParameters(),
-                                                       sstableMetadataCollector);
+            dataFile = SequentialWriter.open(getFilename(),
+                                             descriptor.filenameFor(Component.COMPRESSION_INFO),
+                                             metadata.compressionParameters(),
+                                             sstableMetadataCollector);
+            dbuilder = SegmentedFile.getCompressedBuilder((CompressedSequentialWriter) dataFile);
         }
         else
         {
+            dataFile = SequentialWriter.open(new File(getFilename()), new File(descriptor.filenameFor(Component.CRC)));
             dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
-            dataFile = SequentialWriter.open(new File(getFilename()), !metadata.populateIoCacheOnFlush());
-            dataFile.setDataIntegrityWriter(DataIntegrityMetadata.checksumWriter(descriptor));
         }
 
         this.sstableMetadataCollector = sstableMetadataCollector;
@@ -145,6 +175,7 @@
 
     private void afterAppend(DecoratedKey decoratedKey, long dataPosition, RowIndexEntry index)
     {
+        sstableMetadataCollector.addKey(decoratedKey.getKey());
         lastWrittenKey = decoratedKey;
         last = lastWrittenKey;
         if (first == null)
@@ -194,11 +225,11 @@
         sstableMetadataCollector.update(dataFile.getFilePointer() - startPosition, cf.getColumnStats());
     }
 
-    public static RowIndexEntry rawAppend(ColumnFamily cf, long startPosition, DecoratedKey key, DataOutput out) throws IOException
+    public static RowIndexEntry rawAppend(ColumnFamily cf, long startPosition, DecoratedKey key, DataOutputPlus out) throws IOException
     {
-        assert cf.getColumnCount() > 0 || cf.isMarkedForDelete();
+        assert cf.hasColumns() || cf.isMarkedForDelete();
 
-        ColumnIndex.Builder builder = new ColumnIndex.Builder(cf, key.key, out);
+        ColumnIndex.Builder builder = new ColumnIndex.Builder(cf, key.getKey(), out);
         ColumnIndex index = builder.build(cf);
 
         out.writeShort(END_OF_ROW);
@@ -219,20 +250,12 @@
         List<ByteBuffer> minColumnNames = Collections.emptyList();
         List<ByteBuffer> maxColumnNames = Collections.emptyList();
         StreamingHistogram tombstones = new StreamingHistogram(TOMBSTONE_HISTOGRAM_BIN_SIZE);
+        boolean hasLegacyCounterShards = false;
+
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(metadata);
-
-        // skip row size for version < ja
-        if (version.hasRowSizeAndColumnCount)
-            FileUtils.skipBytesFully(in, 8);
-
         cf.delete(DeletionTime.serializer.deserialize(in));
 
-        ColumnIndex.Builder columnIndexer = new ColumnIndex.Builder(cf, key.key, dataFile.stream);
-
-        // read column count for version < ja
-        int columnCount = Integer.MAX_VALUE;
-        if (version.hasRowSizeAndColumnCount)
-            columnCount = in.readInt();
+        ColumnIndex.Builder columnIndexer = new ColumnIndex.Builder(cf, key.getKey(), dataFile.stream);
 
         if (cf.deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
         {
@@ -247,14 +270,14 @@
         {
             RangeTombstone rangeTombstone = rangeTombstoneIterator.next();
             tombstones.update(rangeTombstone.getLocalDeletionTime());
-            minTimestampTracker.update(rangeTombstone.minTimestamp());
-            maxTimestampTracker.update(rangeTombstone.maxTimestamp());
+            minTimestampTracker.update(rangeTombstone.timestamp());
+            maxTimestampTracker.update(rangeTombstone.timestamp());
             maxDeletionTimeTracker.update(rangeTombstone.getLocalDeletionTime());
             minColumnNames = ColumnNameHelper.minComponents(minColumnNames, rangeTombstone.min, metadata.comparator);
             maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, rangeTombstone.max, metadata.comparator);
         }
 
-        Iterator<OnDiskAtom> iter = metadata.getOnDiskIterator(in, columnCount, ColumnSerializer.Flag.PRESERVE_SIZE, Integer.MIN_VALUE, version);
+        Iterator<OnDiskAtom> iter = metadata.getOnDiskIterator(in, ColumnSerializer.Flag.PRESERVE_SIZE, Integer.MIN_VALUE, version);
         try
         {
             while (iter.hasNext())
@@ -262,16 +285,18 @@
                 OnDiskAtom atom = iter.next();
                 if (atom == null)
                     break;
-                if (atom instanceof CounterColumn)
-                    atom = ((CounterColumn) atom).markLocalToBeCleared();
+
+                if (atom instanceof CounterCell)
+                {
+                    atom = ((CounterCell) atom).markLocalToBeCleared();
+                    hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) atom).hasLegacyShards();
+                }
 
                 int deletionTime = atom.getLocalDeletionTime();
                 if (deletionTime < Integer.MAX_VALUE)
-                {
                     tombstones.update(deletionTime);
-                }
-                minTimestampTracker.update(atom.minTimestamp());
-                maxTimestampTracker.update(atom.maxTimestamp());
+                minTimestampTracker.update(atom.timestamp());
+                maxTimestampTracker.update(atom.timestamp());
                 minColumnNames = ColumnNameHelper.minComponents(minColumnNames, atom.name(), metadata.comparator);
                 maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, atom.name(), metadata.comparator);
                 maxDeletionTimeTracker.update(atom.getLocalDeletionTime());
@@ -287,14 +312,15 @@
             throw new FSWriteError(e, dataFile.getPath());
         }
 
-        sstableMetadataCollector.updateMinTimestamp(minTimestampTracker.get());
-        sstableMetadataCollector.updateMaxTimestamp(maxTimestampTracker.get());
-        sstableMetadataCollector.updateMaxLocalDeletionTime(maxDeletionTimeTracker.get());
-        sstableMetadataCollector.addRowSize(dataFile.getFilePointer() - currentPosition);
-        sstableMetadataCollector.addColumnCount(columnIndexer.writtenAtomCount());
-        sstableMetadataCollector.mergeTombstoneHistogram(tombstones);
-        sstableMetadataCollector.updateMinColumnNames(minColumnNames);
-        sstableMetadataCollector.updateMaxColumnNames(maxColumnNames);
+        sstableMetadataCollector.updateMinTimestamp(minTimestampTracker.get())
+                                .updateMaxTimestamp(maxTimestampTracker.get())
+                                .updateMaxLocalDeletionTime(maxDeletionTimeTracker.get())
+                                .addRowSize(dataFile.getFilePointer() - currentPosition)
+                                .addColumnCount(columnIndexer.writtenAtomCount())
+                                .mergeTombstoneHistogram(tombstones)
+                                .updateMinColumnNames(minColumnNames)
+                                .updateMaxColumnNames(maxColumnNames)
+                                .updateHasLegacyCounterShards(hasLegacyCounterShards);
         afterAppend(key, currentPosition, RowIndexEntry.create(currentPosition, cf.deletionInfo().getTopLevelDeletion(), columnIndexer.build()));
         return currentPosition;
     }
@@ -304,9 +330,23 @@
      */
     public void abort()
     {
-        assert descriptor.temporary;
-        FileUtils.closeQuietly(iwriter);
-        FileUtils.closeQuietly(dataFile);
+        abort(true);
+    }
+
+    public void abort(boolean closeBf)
+    {
+        assert descriptor.type.isTemporary;
+        if (iwriter == null && dataFile == null)
+            return;
+        if (iwriter != null)
+        {
+            FileUtils.closeQuietly(iwriter.indexFile);
+            if (closeBf)
+            {
+                iwriter.bf.close();
+            }
+        }
+        if (dataFile != null)
+            FileUtils.closeQuietly(dataFile);
 
         Set<Component> components = SSTable.componentsFor(descriptor);
         try
@@ -321,6 +361,64 @@
         }
     }
 
+    // we use this method to ensure any managed data we may have retained references to during the write is no
+    // longer referenced, so that we do not need to enclose the expensive call to closeAndOpenReader() in a transaction
+    public void isolateReferences()
+    {
+        // currently we only maintain references to first/last/lastWrittenKey from the data provided; all other
+        // data retention is done through copying
+        first = getMinimalKey(first);
+        last = lastWrittenKey = getMinimalKey(last);
+    }
+
+    public SSTableReader openEarly(long maxDataAge)
+    {
+        StatsMetadata sstableMetadata = (StatsMetadata) sstableMetadataCollector.finalizeMetadata(partitioner.getClass().getCanonicalName(),
+                                                  metadata.getBloomFilterFpChance(),
+                                                  repairedAt).get(MetadataType.STATS);
+
+        // find the max (exclusive) readable key
+        DecoratedKey exclusiveUpperBoundOfReadableIndex = iwriter.getMaxReadableKey(0);
+        if (exclusiveUpperBoundOfReadableIndex == null)
+            return null;
+
+        // create temp links if they don't already exist
+        Descriptor link = descriptor.asType(Descriptor.Type.TEMPLINK);
+        if (!new File(link.filenameFor(Component.PRIMARY_INDEX)).exists())
+        {
+            FileUtils.createHardLink(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)), new File(link.filenameFor(Component.PRIMARY_INDEX)));
+            FileUtils.createHardLink(new File(descriptor.filenameFor(Component.DATA)), new File(link.filenameFor(Component.DATA)));
+        }
+
+        // open the reader early, giving it a FINAL descriptor type so that it is indistinguishable from a final sstable to other consumers
+        SegmentedFile ifile = iwriter.builder.openEarly(link.filenameFor(Component.PRIMARY_INDEX));
+        SegmentedFile dfile = dbuilder.openEarly(link.filenameFor(Component.DATA));
+        SSTableReader sstable = SSTableReader.internalOpen(descriptor.asType(Descriptor.Type.FINAL),
+                                                           components, metadata,
+                                                           partitioner, ifile,
+                                                           dfile, iwriter.summary.build(partitioner, exclusiveUpperBoundOfReadableIndex),
+                                                           iwriter.bf, maxDataAge, sstableMetadata, SSTableReader.OpenReason.EARLY);
+
+        // now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
+        sstable.first = getMinimalKey(first);
+        sstable.last = getMinimalKey(exclusiveUpperBoundOfReadableIndex);
+        DecoratedKey inclusiveUpperBoundOfReadableData = iwriter.getMaxReadableKey(1);
+        if (inclusiveUpperBoundOfReadableData == null)
+            return null;
+        int offset = 2;
+        while (true)
+        {
+            RowIndexEntry indexEntry = sstable.getPosition(inclusiveUpperBoundOfReadableData, SSTableReader.Operator.GT);
+            if (indexEntry != null && indexEntry.position <= dataFile.getLastFlushOffset())
+                break;
+            inclusiveUpperBoundOfReadableData = iwriter.getMaxReadableKey(offset++);
+            if (inclusiveUpperBoundOfReadableData == null)
+                return null;
+        }
+        sstable.last = getMinimalKey(inclusiveUpperBoundOfReadableData);
+        return sstable;
+    }
+
     public SSTableReader closeAndOpenReader()
     {
         return closeAndOpenReader(System.currentTimeMillis());
@@ -328,13 +426,18 @@
 
     public SSTableReader closeAndOpenReader(long maxDataAge)
     {
-        Pair<Descriptor, SSTableMetadata> p = close();
+        return closeAndOpenReader(maxDataAge, this.repairedAt);
+    }
+
+    public SSTableReader closeAndOpenReader(long maxDataAge, long repairedAt)
+    {
+        Pair<Descriptor, StatsMetadata> p = close(repairedAt);
         Descriptor newdesc = p.left;
-        SSTableMetadata sstableMetadata = p.right;
+        StatsMetadata sstableMetadata = p.right;
 
         // finalize in-memory state for the reader
-        SegmentedFile ifile = iwriter.builder.complete(newdesc.filenameFor(SSTable.COMPONENT_INDEX));
-        SegmentedFile dfile = dbuilder.complete(newdesc.filenameFor(SSTable.COMPONENT_DATA));
+        SegmentedFile ifile = iwriter.builder.complete(newdesc.filenameFor(Component.PRIMARY_INDEX));
+        SegmentedFile dfile = dbuilder.complete(newdesc.filenameFor(Component.DATA));
         SSTableReader sstable = SSTableReader.internalOpen(newdesc,
                                                            components,
                                                            metadata,
@@ -344,52 +447,67 @@
                                                            iwriter.summary.build(partitioner),
                                                            iwriter.bf,
                                                            maxDataAge,
-                                                           sstableMetadata);
+                                                           sstableMetadata,
+                                                           SSTableReader.OpenReason.NORMAL);
         sstable.first = getMinimalKey(first);
         sstable.last = getMinimalKey(last);
         // try to save the summaries to disk
-        SSTableReader.saveSummary(sstable, iwriter.builder, dbuilder);
+        sstable.saveSummary(iwriter.builder, dbuilder);
         iwriter = null;
         dbuilder = null;
         return sstable;
     }
 
     // Close the writer and return the descriptor to the new sstable and its metadata
-    public Pair<Descriptor, SSTableMetadata> close()
+    public Pair<Descriptor, StatsMetadata> close()
     {
+        return close(this.repairedAt);
+    }
+
+    private Pair<Descriptor, StatsMetadata> close(long repairedAt)
+    {
         // index and filter
         iwriter.close();
         // main data, close will truncate if necessary
         dataFile.close();
+        dataFile.writeFullChecksum(descriptor);
         // write sstable statistics
-        SSTableMetadata sstableMetadata = sstableMetadataCollector.finalizeMetadata(partitioner.getClass().getCanonicalName(),
-                                                                                    metadata.getBloomFilterFpChance());
-        writeMetadata(descriptor, sstableMetadata, sstableMetadataCollector.ancestors);
+        Map<MetadataType, MetadataComponent> metadataComponents = sstableMetadataCollector.finalizeMetadata(
+                                                                                    partitioner.getClass().getCanonicalName(),
+                                                                                    metadata.getBloomFilterFpChance(),
+                                                                                    repairedAt);
+        writeMetadata(descriptor, metadataComponents);
 
         // save the table of components
         SSTable.appendTOC(descriptor, components);
 
         // remove the 'tmp' marker from all components
-        return Pair.create(rename(descriptor, components), sstableMetadata);
+        return Pair.create(rename(descriptor, components), (StatsMetadata) metadataComponents.get(MetadataType.STATS));
     }
 
-    private static void writeMetadata(Descriptor desc, SSTableMetadata sstableMetadata,  Set<Integer> ancestors)
+    private static void writeMetadata(Descriptor desc, Map<MetadataType, MetadataComponent> components)
     {
-        SequentialWriter out = SequentialWriter.open(new File(desc.filenameFor(SSTable.COMPONENT_STATS)), true);
+        SequentialWriter out = SequentialWriter.open(new File(desc.filenameFor(Component.STATS)));
         try
         {
-            SSTableMetadata.serializer.serialize(sstableMetadata, ancestors, out.stream);
+            desc.getMetadataSerializer().serialize(components, out.stream);
         }
         catch (IOException e)
         {
             throw new FSWriteError(e, out.getPath());
         }
-        out.close();
+        finally
+        {
+            out.close();
+        }
     }
 
     static Descriptor rename(Descriptor tmpdesc, Set<Component> components)
     {
-        Descriptor newdesc = tmpdesc.asTemporary(false);
+        Descriptor newdesc = tmpdesc.asType(Descriptor.Type.FINAL);
         rename(tmpdesc, newdesc, components);
         return newdesc;
     }
@@ -431,21 +549,27 @@
 
         IndexWriter(long keyCount)
         {
-            indexFile = SequentialWriter.open(new File(descriptor.filenameFor(SSTable.COMPONENT_INDEX)),
-                                              !metadata.populateIoCacheOnFlush());
+            indexFile = SequentialWriter.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
             builder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
-            summary = new IndexSummaryBuilder(keyCount, metadata.getIndexInterval());
+            summary = new IndexSummaryBuilder(keyCount, metadata.getMinIndexInterval(), Downsampling.BASE_SAMPLING_LEVEL);
             bf = FilterFactory.getFilter(keyCount, metadata.getBloomFilterFpChance(), true);
         }
 
+        // finds the last decorated key, minus the given offset, that is guaranteed to occur fully in the flushed portion of the index file
+        DecoratedKey getMaxReadableKey(int offset)
+        {
+            long maxIndexLength = indexFile.getLastFlushOffset();
+            return summary.getMaxReadableKey(maxIndexLength, offset);
+        }
+
         public void append(DecoratedKey key, RowIndexEntry indexEntry)
         {
-            bf.add(key.key);
+            bf.add(key.getKey());
             long indexPosition = indexFile.getFilePointer();
             try
             {
-                ByteBufferUtil.writeWithShortLength(key.key, indexFile.stream);
-                RowIndexEntry.serializer.serialize(indexEntry, indexFile.stream);
+                ByteBufferUtil.writeWithShortLength(key.getKey(), indexFile.stream);
+                metadata.comparator.rowIndexEntrySerializer().serialize(indexEntry, indexFile.stream);
             }
             catch (IOException e)
             {
@@ -466,12 +590,12 @@
         {
             if (components.contains(Component.FILTER))
             {
-                String path = descriptor.filenameFor(SSTable.COMPONENT_FILTER);
+                String path = descriptor.filenameFor(Component.FILTER);
                 try
                 {
                     // bloom filter
                     FileOutputStream fos = new FileOutputStream(path);
-                    DataOutputStream stream = new DataOutputStream(fos);
+                    DataOutputStreamAndChannel stream = new DataOutputStreamAndChannel(fos);
                     FilterFactory.serialize(bf, stream);
                     stream.flush();
                     fos.getFD().sync();
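For context, a minimal sketch (not part of this patch) of the writer lifecycle with the new repairedAt-aware constructor; filename, keyCount and the rows iterator are assumed, and append(...) stands in for one of the writer's append overloads:

    SSTableWriter writer = new SSTableWriter(filename, keyCount, ActiveRepairService.UNREPAIRED_SSTABLE);
    try
    {
        while (rows.hasNext())
            writer.append(rows.next());                       // AbstractCompactedRow
        SSTableReader reader = writer.closeAndOpenReader();   // writes STATS/DIGEST components, renames tmp -> final
    }
    catch (Throwable t)
    {
        writer.abort();                                       // cleans up the temporary components
        throw t;
    }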
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java
new file mode 100644
index 0000000..f801dac
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
+import com.clearspring.analytics.stream.cardinality.ICardinality;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Compaction related SSTable metadata.
+ *
+ * Only loaded for <b>compacting</b> SSTables at the time of compaction.
+ */
+public class CompactionMetadata extends MetadataComponent
+{
+    public static final IMetadataComponentSerializer serializer = new CompactionMetadataSerializer();
+
+    public final Set<Integer> ancestors;
+
+    public final ICardinality cardinalityEstimator;
+
+    public CompactionMetadata(Set<Integer> ancestors, ICardinality cardinalityEstimator)
+    {
+        this.ancestors = ancestors;
+        this.cardinalityEstimator = cardinalityEstimator;
+    }
+
+    public MetadataType getType()
+    {
+        return MetadataType.COMPACTION;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        CompactionMetadata that = (CompactionMetadata) o;
+        return ancestors == null ? that.ancestors == null : ancestors.equals(that.ancestors);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return ancestors != null ? ancestors.hashCode() : 0;
+    }
+
+    public static class CompactionMetadataSerializer implements IMetadataComponentSerializer<CompactionMetadata>
+    {
+        public int serializedSize(CompactionMetadata component) throws IOException
+        {
+            int size = 0;
+            size += TypeSizes.NATIVE.sizeof(component.ancestors.size());
+            for (int g : component.ancestors)
+                size += TypeSizes.NATIVE.sizeof(g);
+            byte[] serializedCardinality = component.cardinalityEstimator.getBytes();
+            size += TypeSizes.NATIVE.sizeof(serializedCardinality.length) + serializedCardinality.length;
+            return size;
+        }
+
+        public void serialize(CompactionMetadata component, DataOutputPlus out) throws IOException
+        {
+            out.writeInt(component.ancestors.size());
+            for (int g : component.ancestors)
+                out.writeInt(g);
+            ByteBufferUtil.writeWithLength(component.cardinalityEstimator.getBytes(), out);
+        }
+
+        public CompactionMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
+        {
+            int nbAncestors = in.readInt();
+            Set<Integer> ancestors = new HashSet<>(nbAncestors);
+            for (int i = 0; i < nbAncestors; i++)
+                ancestors.add(in.readInt());
+            ICardinality cardinality = HyperLogLogPlus.Builder.build(ByteBufferUtil.readBytes(in, in.readInt()));
+            return new CompactionMetadata(ancestors, cardinality);
+        }
+    }
+}
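As in the SSTableReader hunk earlier in this patch, the compaction component is now loaded on demand through the per-descriptor metadata serializer; a sketch, assuming descriptor is an existing sstable Descriptor:

    CompactionMetadata compactionMetadata =
        (CompactionMetadata) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.COMPACTION);
    Set<Integer> ancestors = compactionMetadata.ancestors;               // generations of the sstables this one was compacted from
    ICardinality cardinality = compactionMetadata.cardinalityEstimator;  // per-sstable key cardinality estimate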
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java
new file mode 100644
index 0000000..49ae378
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Metadata component serializer
+ */
+public interface IMetadataComponentSerializer<T extends MetadataComponent>
+{
+    /**
+     * Calculate and return serialized size.
+     *
+     * @param component MetadataComponent to calculate serialized size
+     * @return serialized size of this component
+     * @throws IOException
+     */
+    int serializedSize(T component) throws IOException;
+
+    /**
+     * Serialize metadata component to given output.
+     *
+     * @param component MetadataComponent to serialize
+     * @param out  serialize destination
+     * @throws IOException
+     */
+    void serialize(T component, DataOutputPlus out) throws IOException;
+
+    /**
+     * Deserialize metadata component from given input.
+     *
+     * @param version serialize version
+     * @param in deserialize source
+     * @return Deserialized component
+     * @throws IOException
+     */
+    T deserialize(Descriptor.Version version, DataInput in) throws IOException;
+}
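A hypothetical component (not in this patch) illustrating the serializer contract; ExampleMetadata and its single int payload are invented purely for illustration:

    public class ExampleMetadata extends MetadataComponent
    {
        public static final IMetadataComponentSerializer<ExampleMetadata> serializer = new Serializer();

        public final int value;

        public ExampleMetadata(int value)
        {
            this.value = value;
        }

        public MetadataType getType()
        {
            return MetadataType.STATS; // placeholder; a real component would define its own MetadataType
        }

        public static class Serializer implements IMetadataComponentSerializer<ExampleMetadata>
        {
            public int serializedSize(ExampleMetadata component) throws IOException
            {
                return TypeSizes.NATIVE.sizeof(component.value);
            }

            public void serialize(ExampleMetadata component, DataOutputPlus out) throws IOException
            {
                out.writeInt(component.value);
            }

            public ExampleMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
            {
                return new ExampleMetadata(in.readInt());
            }
        }
    }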
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
new file mode 100644
index 0000000..df577df
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.Map;
+
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Interface for SSTable metadata serializer
+ */
+public interface IMetadataSerializer
+{
+    /**
+     * Serialize given metadata components
+     *
+     * @param components Metadata components to serialize
+     * @param out serialization destination
+     * @throws IOException
+     */
+    void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out) throws IOException;
+
+    /**
+     * Deserialize specified metadata components from given descriptor.
+     *
+     * @param descriptor SSTable descriptor
+     * @return Deserialized metadata components, keyed by their MetadataType.
+     * @throws IOException
+     */
+    Map<MetadataType, MetadataComponent> deserialize(Descriptor descriptor, EnumSet<MetadataType> types) throws IOException;
+
+    /**
+     * Deserialize only the metadata component of the specified type from the given descriptor.
+     *
+     * @param descriptor SSTable descriptor
+     * @param type Metadata component type to deserialize
+     * @return Deserialized metadata component. Can be null if specified type does not exist.
+     * @throws IOException
+     */
+    MetadataComponent deserialize(Descriptor descriptor, MetadataType type) throws IOException;
+
+    /**
+     * Mutate SSTable level
+     *
+     * @param descriptor SSTable descriptor
+     * @param newLevel new SSTable level
+     * @throws IOException
+     */
+    void mutateLevel(Descriptor descriptor, int newLevel) throws IOException;
+
+    /**
+     * Mutate repairedAt time
+     */
+    void mutateRepairedAt(Descriptor descriptor, long newRepairedAt) throws IOException;
+}
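A sketch of typical use of this interface, mirroring the calls made elsewhere in this patch (descriptor is assumed to be an existing sstable Descriptor):

    IMetadataSerializer serializer = descriptor.getMetadataSerializer();
    StatsMetadata stats = (StatsMetadata) serializer.deserialize(descriptor, MetadataType.STATS);
    serializer.mutateLevel(descriptor, 2);                                            // e.g. to reset or adjust the sstable level
    serializer.mutateRepairedAt(descriptor, ActiveRepairService.UNREPAIRED_SSTABLE);  // mark the sstable as unrepaired again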
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java
new file mode 100644
index 0000000..4bd060e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.Maps;
+
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.StreamingHistogram;
+
+/**
+ * Serializer for SSTable from legacy versions
+ */
+@Deprecated
+public class LegacyMetadataSerializer extends MetadataSerializer
+{
+    /**
+     * Legacy serialization is only used for SSTable level reset.
+     */
+    @Override
+    public void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out) throws IOException
+    {
+        ValidationMetadata validation = (ValidationMetadata) components.get(MetadataType.VALIDATION);
+        StatsMetadata stats = (StatsMetadata) components.get(MetadataType.STATS);
+        CompactionMetadata compaction = (CompactionMetadata) components.get(MetadataType.COMPACTION);
+
+        assert validation != null && stats != null && compaction != null && validation.partitioner != null;
+
+        EstimatedHistogram.serializer.serialize(stats.estimatedRowSize, out);
+        EstimatedHistogram.serializer.serialize(stats.estimatedColumnCount, out);
+        ReplayPosition.serializer.serialize(stats.replayPosition, out);
+        out.writeLong(stats.minTimestamp);
+        out.writeLong(stats.maxTimestamp);
+        out.writeInt(stats.maxLocalDeletionTime);
+        out.writeDouble(validation.bloomFilterFPChance);
+        out.writeDouble(stats.compressionRatio);
+        out.writeUTF(validation.partitioner);
+        out.writeInt(compaction.ancestors.size());
+        for (Integer g : compaction.ancestors)
+            out.writeInt(g);
+        StreamingHistogram.serializer.serialize(stats.estimatedTombstoneDropTime, out);
+        out.writeInt(stats.sstableLevel);
+        out.writeInt(stats.minColumnNames.size());
+        for (ByteBuffer columnName : stats.minColumnNames)
+            ByteBufferUtil.writeWithShortLength(columnName, out);
+        out.writeInt(stats.maxColumnNames.size());
+        for (ByteBuffer columnName : stats.maxColumnNames)
+            ByteBufferUtil.writeWithShortLength(columnName, out);
+    }
+
+    /**
+     * The legacy serializer deserializes all components regardless of which types are requested.
+     */
+    @Override
+    public Map<MetadataType, MetadataComponent> deserialize(Descriptor descriptor, EnumSet<MetadataType> types) throws IOException
+    {
+        Map<MetadataType, MetadataComponent> components = Maps.newHashMap();
+
+        File statsFile = new File(descriptor.filenameFor(Component.STATS));
+        if (!statsFile.exists() && types.contains(MetadataType.STATS))
+        {
+            components.put(MetadataType.STATS, MetadataCollector.defaultStatsMetadata());
+        }
+        else
+        {
+            try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(statsFile))))
+            {
+                EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(in);
+                EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(in);
+                ReplayPosition replayPosition = ReplayPosition.serializer.deserialize(in);
+                long minTimestamp = in.readLong();
+                long maxTimestamp = in.readLong();
+                int maxLocalDeletionTime = in.readInt();
+                double bloomFilterFPChance = in.readDouble();
+                double compressionRatio = in.readDouble();
+                String partitioner = in.readUTF();
+                int nbAncestors = in.readInt();
+                Set<Integer> ancestors = new HashSet<>(nbAncestors);
+                for (int i = 0; i < nbAncestors; i++)
+                    ancestors.add(in.readInt());
+                StreamingHistogram tombstoneHistogram = StreamingHistogram.serializer.deserialize(in);
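+                // assumption: the level field may be absent from older stats files,
+                // so default to 0 when the stream has no more data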
+                int sstableLevel = 0;
+                if (in.available() > 0)
+                    sstableLevel = in.readInt();
+
+                int colCount = in.readInt();
+                List<ByteBuffer> minColumnNames = new ArrayList<>(colCount);
+                for (int i = 0; i < colCount; i++)
+                    minColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+
+                colCount = in.readInt();
+                List<ByteBuffer> maxColumnNames = new ArrayList<>(colCount);
+                for (int i = 0; i < colCount; i++)
+                    maxColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+
+                if (types.contains(MetadataType.VALIDATION))
+                    components.put(MetadataType.VALIDATION,
+                                   new ValidationMetadata(partitioner, bloomFilterFPChance));
+                if (types.contains(MetadataType.STATS))
+                    components.put(MetadataType.STATS,
+                                   new StatsMetadata(rowSizes,
+                                                     columnCounts,
+                                                     replayPosition,
+                                                     minTimestamp,
+                                                     maxTimestamp,
+                                                     maxLocalDeletionTime,
+                                                     compressionRatio,
+                                                     tombstoneHistogram,
+                                                     sstableLevel,
+                                                     minColumnNames,
+                                                     maxColumnNames,
+                                                     true,
+                                                     ActiveRepairService.UNREPAIRED_SSTABLE));
+                if (types.contains(MetadataType.COMPACTION))
+                    components.put(MetadataType.COMPACTION,
+                                   new CompactionMetadata(ancestors, null));
+            }
+        }
+        return components;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
new file mode 100644
index 0000000..7ba2895
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Maps;
+
+import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
+import com.clearspring.analytics.stream.cardinality.ICardinality;
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.io.sstable.ColumnNameHelper;
+import org.apache.cassandra.io.sstable.ColumnStats;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.MurmurHash;
+import org.apache.cassandra.utils.StreamingHistogram;
+
+public class MetadataCollector
+{
+    public static final double NO_COMPRESSION_RATIO = -1.0;
+
+    static EstimatedHistogram defaultColumnCountHistogram()
+    {
+        // EH of 114 can track a max value of 2395318855, i.e., > 2B columns
+        return new EstimatedHistogram(114);
+    }
+
+    static EstimatedHistogram defaultRowSizeHistogram()
+    {
+        // EH of 150 can track a max value of 1697806495183, i.e., > 1.5PB
+        return new EstimatedHistogram(150);
+    }
+
+    static StreamingHistogram defaultTombstoneDropTimeHistogram()
+    {
+        return new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
+    }
+
+    public static StatsMetadata defaultStatsMetadata()
+    {
+        return new StatsMetadata(defaultRowSizeHistogram(),
+                                 defaultColumnCountHistogram(),
+                                 ReplayPosition.NONE,
+                                 Long.MIN_VALUE,
+                                 Long.MAX_VALUE,
+                                 Integer.MAX_VALUE,
+                                 NO_COMPRESSION_RATIO,
+                                 defaultTombstoneDropTimeHistogram(),
+                                 0,
+                                 Collections.<ByteBuffer>emptyList(),
+                                 Collections.<ByteBuffer>emptyList(),
+                                 true,
+                                 ActiveRepairService.UNREPAIRED_SSTABLE);
+    }
+
+    protected EstimatedHistogram estimatedRowSize = defaultRowSizeHistogram();
+    protected EstimatedHistogram estimatedColumnCount = defaultColumnCountHistogram();
+    protected ReplayPosition replayPosition = ReplayPosition.NONE;
+    protected long minTimestamp = Long.MAX_VALUE;
+    protected long maxTimestamp = Long.MIN_VALUE;
+    protected int maxLocalDeletionTime = Integer.MIN_VALUE;
+    protected double compressionRatio = NO_COMPRESSION_RATIO;
+    protected Set<Integer> ancestors = new HashSet<>();
+    protected StreamingHistogram estimatedTombstoneDropTime = defaultTombstoneDropTimeHistogram();
+    protected int sstableLevel;
+    protected List<ByteBuffer> minColumnNames = Collections.emptyList();
+    protected List<ByteBuffer> maxColumnNames = Collections.emptyList();
+    protected boolean hasLegacyCounterShards = false;
+
+    /**
+     * The default cardinality estimation method is HyperLogLog++.
+     * The parameters used here (p=13, sp=25) should give a reasonable estimate
+     * while keeping the memory needed to hold the sketch low.
+     * See CASSANDRA-5906 for details.
+     */
+    protected ICardinality cardinality = new HyperLogLogPlus(13, 25);
+    private final CellNameType columnNameComparator;
+
+    public MetadataCollector(CellNameType columnNameComparator)
+    {
+        this.columnNameComparator = columnNameComparator;
+    }
+
+    public MetadataCollector(Collection<SSTableReader> sstables, CellNameType columnNameComparator, int level)
+    {
+        this(columnNameComparator);
+
+        replayPosition(ReplayPosition.getReplayPosition(sstables));
+        sstableLevel(level);
+        // add the generation of each precompacted sstable and the generations
+        // of its ancestors whose data files are still present
+        for (SSTableReader sstable : sstables)
+        {
+            addAncestor(sstable.descriptor.generation);
+            for (Integer i : sstable.getAncestors())
+                if (new File(sstable.descriptor.withGeneration(i).filenameFor(Component.DATA)).exists())
+                    addAncestor(i);
+        }
+    }
+
+    public MetadataCollector addKey(ByteBuffer key)
+    {
+        long hashed = MurmurHash.hash2_64(key, key.position(), key.remaining(), 0);
+        cardinality.offerHashed(hashed);
+        return this;
+    }
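+
+    // Hypothetical writer-side usage (call site assumed, not part of this patch):
+    //   collector.addKey(partitionKey);          // feeds the HyperLogLog++ cardinality sketch
+    //   collector.update(rowSize, columnStats);  // folds per-row stats into the histograms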
+
+    public MetadataCollector addRowSize(long rowSize)
+    {
+        estimatedRowSize.add(rowSize);
+        return this;
+    }
+
+    public MetadataCollector addColumnCount(long columnCount)
+    {
+        estimatedColumnCount.add(columnCount);
+        return this;
+    }
+
+    public MetadataCollector mergeTombstoneHistogram(StreamingHistogram histogram)
+    {
+        estimatedTombstoneDropTime.merge(histogram);
+        return this;
+    }
+
+    /**
+     * Ratio is compressed/uncompressed; if it is 1.0 or higher,
+     * compression isn't helping.
+     */
+    public MetadataCollector addCompressionRatio(long compressed, long uncompressed)
+    {
+        compressionRatio = (double) compressed/uncompressed;
+        return this;
+    }
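+
+    // Illustrative usage with assumed values (not from any call site in this patch):
+    //   addCompressionRatio(50, 100)  -> ratio 0.5, compression halves the on-disk size
+    //   addCompressionRatio(120, 100) -> ratio 1.2, compression is not helping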
+
+    public MetadataCollector updateMinTimestamp(long potentialMin)
+    {
+        minTimestamp = Math.min(minTimestamp, potentialMin);
+        return this;
+    }
+
+    public MetadataCollector updateMaxTimestamp(long potentialMax)
+    {
+        maxTimestamp = Math.max(maxTimestamp, potentialMax);
+        return this;
+    }
+
+    public MetadataCollector updateMaxLocalDeletionTime(int maxLocalDeletionTime)
+    {
+        this.maxLocalDeletionTime = Math.max(this.maxLocalDeletionTime, maxLocalDeletionTime);
+        return this;
+    }
+
+    public MetadataCollector estimatedRowSize(EstimatedHistogram estimatedRowSize)
+    {
+        this.estimatedRowSize = estimatedRowSize;
+        return this;
+    }
+
+    public MetadataCollector estimatedColumnCount(EstimatedHistogram estimatedColumnCount)
+    {
+        this.estimatedColumnCount = estimatedColumnCount;
+        return this;
+    }
+
+    public MetadataCollector replayPosition(ReplayPosition replayPosition)
+    {
+        this.replayPosition = replayPosition;
+        return this;
+    }
+
+    public MetadataCollector addAncestor(int generation)
+    {
+        this.ancestors.add(generation);
+        return this;
+    }
+
+    public MetadataCollector sstableLevel(int sstableLevel)
+    {
+        this.sstableLevel = sstableLevel;
+        return this;
+    }
+
+    public MetadataCollector updateMinColumnNames(List<ByteBuffer> minColumnNames)
+    {
+        if (minColumnNames.size() > 0)
+            this.minColumnNames = ColumnNameHelper.mergeMin(this.minColumnNames, minColumnNames, columnNameComparator);
+        return this;
+    }
+
+    public MetadataCollector updateMaxColumnNames(List<ByteBuffer> maxColumnNames)
+    {
+        if (maxColumnNames.size() > 0)
+            this.maxColumnNames = ColumnNameHelper.mergeMax(this.maxColumnNames, maxColumnNames, columnNameComparator);
+        return this;
+    }
+
+    public MetadataCollector updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
+    {
+        this.hasLegacyCounterShards = this.hasLegacyCounterShards || hasLegacyCounterShards;
+        return this;
+    }
+
+    public MetadataCollector update(long rowSize, ColumnStats stats)
+    {
+        updateMinTimestamp(stats.minTimestamp);
+        updateMaxTimestamp(stats.maxTimestamp);
+        updateMaxLocalDeletionTime(stats.maxLocalDeletionTime);
+        addRowSize(rowSize);
+        addColumnCount(stats.columnCount);
+        mergeTombstoneHistogram(stats.tombstoneHistogram);
+        updateMinColumnNames(stats.minColumnNames);
+        updateMaxColumnNames(stats.maxColumnNames);
+        updateHasLegacyCounterShards(stats.hasLegacyCounterShards);
+        return this;
+    }
+
+    public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner, double bloomFilterFPChance, long repairedAt)
+    {
+        Map<MetadataType, MetadataComponent> components = Maps.newHashMap();
+        components.put(MetadataType.VALIDATION, new ValidationMetadata(partitioner, bloomFilterFPChance));
+        components.put(MetadataType.STATS, new StatsMetadata(estimatedRowSize,
+                                                             estimatedColumnCount,
+                                                             replayPosition,
+                                                             minTimestamp,
+                                                             maxTimestamp,
+                                                             maxLocalDeletionTime,
+                                                             compressionRatio,
+                                                             estimatedTombstoneDropTime,
+                                                             sstableLevel,
+                                                             ImmutableList.copyOf(minColumnNames),
+                                                             ImmutableList.copyOf(maxColumnNames),
+                                                             hasLegacyCounterShards,
+                                                             repairedAt));
+        components.put(MetadataType.COMPACTION, new CompactionMetadata(ancestors, cardinality));
+        return components;
+    }
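+
+    // Hypothetical end-of-write usage (names assumed, not from this patch):
+    //   Map<MetadataType, MetadataComponent> meta =
+    //       collector.finalizeMetadata(partitioner.getClass().getCanonicalName(), bloomFilterFPChance, repairedAt);
+    //   new MetadataSerializer().serialize(meta, statsOut);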
+}
diff --git a/src/java/org/apache/cassandra/tools/NodeToolHelp.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataComponent.java
similarity index 65%
copy from src/java/org/apache/cassandra/tools/NodeToolHelp.java
copy to src/java/org/apache/cassandra/io/sstable/metadata/MetadataComponent.java
index c89e48c..bf8a9af 100644
--- a/src/java/org/apache/cassandra/tools/NodeToolHelp.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataComponent.java
@@ -15,22 +15,20 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.tools;
+package org.apache.cassandra.io.sstable.metadata;
 
-import java.util.List;
-
-public class NodeToolHelp
+/**
+ * MetadataComponent is a component of SSTable metadata and is serialized to Stats.db.
+ */
+public abstract class MetadataComponent implements Comparable<MetadataComponent>
 {
-    public List<NodeToolCommand> commands;
+    /**
+     * @return Metadata component type
+     */
+    public abstract MetadataType getType();
 
-    public static class NodeToolCommand
+    public int compareTo(MetadataComponent o)
     {
-        public String name;
-        public String help;
-
-        public String toString()
-        {
-            return name;
-        }
+        return this.getType().compareTo(o.getType());
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
new file mode 100644
index 0000000..7414208
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.*;
+import java.util.*;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Metadata serializer for SSTables version >= 'k'.
+ *
+ * <pre>
+ * File format := | number of components (4 bytes) | toc | component1 | component2 | ... |
+ * toc         := | component type (4 bytes) | position of component (4 bytes) |
+ * </pre>
+ *
+ * MetadataType's ordinal() defines the order of serialization.
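+ *
+ * As an illustrative example (offsets derived from the layout above, component sizes assumed):
+ * a file holding the three current component types starts with the count 3, then the toc
+ * <pre>
+ * | 0, 28 | 1, 28 + size(VALIDATION) | 2, 28 + size(VALIDATION) + size(COMPACTION) |
+ * </pre>
+ * since the 4-byte count plus three 8-byte toc entries place the first component at offset 28.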
+ */
+public class MetadataSerializer implements IMetadataSerializer
+{
+    private static final Logger logger = LoggerFactory.getLogger(MetadataSerializer.class);
+
+    public void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out) throws IOException
+    {
+        // sort components by type
+        List<MetadataComponent> sortedComponents = Lists.newArrayList(components.values());
+        Collections.sort(sortedComponents);
+
+        // write number of components
+        out.writeInt(components.size());
+        // build and write toc
+        int lastPosition = 4 + (8 * sortedComponents.size()); // 4-byte count + 8 bytes (type + position) per toc entry
+        for (MetadataComponent component : sortedComponents)
+        {
+            MetadataType type = component.getType();
+            // serialize type
+            out.writeInt(type.ordinal());
+            // serialize position
+            out.writeInt(lastPosition);
+            lastPosition += type.serializer.serializedSize(component);
+        }
+        // serialize components
+        for (MetadataComponent component : sortedComponents)
+        {
+            component.getType().serializer.serialize(component, out);
+        }
+    }
+
+    public Map<MetadataType, MetadataComponent> deserialize(Descriptor descriptor, EnumSet<MetadataType> types) throws IOException
+    {
+        Map<MetadataType, MetadataComponent> components;
+        logger.debug("Load metadata for {}", descriptor);
+        File statsFile = new File(descriptor.filenameFor(Component.STATS));
+        if (!statsFile.exists())
+        {
+            logger.debug("No sstable stats for {}", descriptor);
+            components = Maps.newHashMap();
+            components.put(MetadataType.STATS, MetadataCollector.defaultStatsMetadata());
+        }
+        else
+        {
+            try (RandomAccessReader r = RandomAccessReader.open(statsFile))
+            {
+                components = deserialize(descriptor, r, types);
+            }
+        }
+        return components;
+    }
+
+    public MetadataComponent deserialize(Descriptor descriptor, MetadataType type) throws IOException
+    {
+        return deserialize(descriptor, EnumSet.of(type)).get(type);
+    }
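+
+    // Hypothetical call site (assumed, not part of this patch): load only the stats component, e.g.
+    //   StatsMetadata stats = (StatsMetadata) new MetadataSerializer().deserialize(descriptor, MetadataType.STATS);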
+
+    public Map<MetadataType, MetadataComponent> deserialize(Descriptor descriptor, FileDataInput in, EnumSet<MetadataType> types) throws IOException
+    {
+        Map<MetadataType, MetadataComponent> components = Maps.newHashMap();
+        // read number of components
+        int numComponents = in.readInt();
+        // read toc
+        Map<MetadataType, Integer> toc = new HashMap<>(numComponents);
+        for (int i = 0; i < numComponents; i++)
+        {
+            toc.put(MetadataType.values()[in.readInt()], in.readInt());
+        }
+        for (MetadataType type : types)
+        {
+            MetadataComponent component = null;
+            if (toc.containsKey(type))
+            {
+                in.seek(toc.get(type));
+                component = type.serializer.deserialize(descriptor.version, in);
+            }
+            components.put(type, component);
+        }
+        return components;
+    }
+
+    public void mutateLevel(Descriptor descriptor, int newLevel) throws IOException
+    {
+        logger.debug("Mutating {} to level {}", descriptor.filenameFor(Component.STATS), newLevel);
+        Map<MetadataType, MetadataComponent> currentComponents = deserialize(descriptor, EnumSet.allOf(MetadataType.class));
+        StatsMetadata stats = (StatsMetadata) currentComponents.remove(MetadataType.STATS);
+        // mutate level
+        currentComponents.put(MetadataType.STATS, stats.mutateLevel(newLevel));
+        rewriteSSTableMetadata(descriptor, currentComponents);
+    }
+
+    public void mutateRepairedAt(Descriptor descriptor, long newRepairedAt) throws IOException
+    {
+        logger.debug("Mutating {} to repairedAt time {}", descriptor.filenameFor(Component.STATS), newRepairedAt);
+        Map<MetadataType, MetadataComponent> currentComponents = deserialize(descriptor, EnumSet.allOf(MetadataType.class));
+        StatsMetadata stats = (StatsMetadata) currentComponents.remove(MetadataType.STATS);
+        // mutate repairedAt
+        currentComponents.put(MetadataType.STATS, stats.mutateRepairedAt(newRepairedAt));
+        rewriteSSTableMetadata(descriptor, currentComponents);
+    }
+
+    private void rewriteSSTableMetadata(Descriptor descriptor, Map<MetadataType, MetadataComponent> currentComponents) throws IOException
+    {
+        Descriptor tmpDescriptor = descriptor.asType(Descriptor.Type.TEMP);
+
+        try (DataOutputStreamAndChannel out = new DataOutputStreamAndChannel(new FileOutputStream(tmpDescriptor.filenameFor(Component.STATS))))
+        {
+            serialize(currentComponents, out);
+            out.flush();
+        }
+        // we can't move a file on top of another file on Windows:
+        if (!FBUtilities.isUnix())
+            FileUtils.delete(descriptor.filenameFor(Component.STATS));
+        FileUtils.renameWithConfirm(tmpDescriptor.filenameFor(Component.STATS), descriptor.filenameFor(Component.STATS));
+
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java
new file mode 100644
index 0000000..9717da1
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+/**
+ * Defines Metadata component type.
+ */
+public enum MetadataType
+{
+    /** Metadata only used for SSTable validation */
+    VALIDATION(ValidationMetadata.serializer),
+    /** Metadata only used at compaction */
+    COMPACTION(CompactionMetadata.serializer),
+    /** Metadata always kept in memory */
+    STATS(StatsMetadata.serializer);
+
+    public final IMetadataComponentSerializer<MetadataComponent> serializer;
+
+    private MetadataType(IMetadataComponentSerializer<MetadataComponent> serializer)
+    {
+        this.serializer = serializer;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
new file mode 100644
index 0000000..a557b88
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.builder.EqualsBuilder;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.StreamingHistogram;
+
+/**
+ * SSTable metadata that always stays on heap.
+ */
+public class StatsMetadata extends MetadataComponent
+{
+    public static final IMetadataComponentSerializer serializer = new StatsMetadataSerializer();
+
+    public final EstimatedHistogram estimatedRowSize;
+    public final EstimatedHistogram estimatedColumnCount;
+    public final ReplayPosition replayPosition;
+    public final long minTimestamp;
+    public final long maxTimestamp;
+    public final int maxLocalDeletionTime;
+    public final double compressionRatio;
+    public final StreamingHistogram estimatedTombstoneDropTime;
+    public final int sstableLevel;
+    public final List<ByteBuffer> maxColumnNames;
+    public final List<ByteBuffer> minColumnNames;
+    public final boolean hasLegacyCounterShards;
+    public final long repairedAt;
+
+    public StatsMetadata(EstimatedHistogram estimatedRowSize,
+                         EstimatedHistogram estimatedColumnCount,
+                         ReplayPosition replayPosition,
+                         long minTimestamp,
+                         long maxTimestamp,
+                         int maxLocalDeletionTime,
+                         double compressionRatio,
+                         StreamingHistogram estimatedTombstoneDropTime,
+                         int sstableLevel,
+                         List<ByteBuffer> minColumnNames,
+                         List<ByteBuffer> maxColumnNames,
+                         boolean hasLegacyCounterShards,
+                         long repairedAt)
+    {
+        this.estimatedRowSize = estimatedRowSize;
+        this.estimatedColumnCount = estimatedColumnCount;
+        this.replayPosition = replayPosition;
+        this.minTimestamp = minTimestamp;
+        this.maxTimestamp = maxTimestamp;
+        this.maxLocalDeletionTime = maxLocalDeletionTime;
+        this.compressionRatio = compressionRatio;
+        this.estimatedTombstoneDropTime = estimatedTombstoneDropTime;
+        this.sstableLevel = sstableLevel;
+        this.minColumnNames = minColumnNames;
+        this.maxColumnNames = maxColumnNames;
+        this.hasLegacyCounterShards = hasLegacyCounterShards;
+        this.repairedAt = repairedAt;
+    }
+
+    public MetadataType getType()
+    {
+        return MetadataType.STATS;
+    }
+
+    /**
+     * @param gcBefore gc time in seconds
+     * @return estimated droppable tombstone ratio at given gcBefore time.
+     */
+    public double getEstimatedDroppableTombstoneRatio(int gcBefore)
+    {
+        long estimatedColumnCount = this.estimatedColumnCount.mean() * this.estimatedColumnCount.count();
+        if (estimatedColumnCount > 0)
+        {
+            double droppable = getDroppableTombstonesBefore(gcBefore);
+            return droppable / estimatedColumnCount;
+        }
+        return 0.0f;
+    }
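+
+    // Worked example with assumed numbers: if the column-count histogram implies ~1,000,000 cells
+    // and estimatedTombstoneDropTime.sum(gcBefore) reports 250,000 droppable tombstones,
+    // the estimated droppable ratio is 250,000 / 1,000,000 = 0.25.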
+
+    /**
+     * @param gcBefore gc time in seconds
+     * @return amount of droppable tombstones
+     */
+    public double getDroppableTombstonesBefore(int gcBefore)
+    {
+        return estimatedTombstoneDropTime.sum(gcBefore);
+    }
+
+    public StatsMetadata mutateLevel(int newLevel)
+    {
+        return new StatsMetadata(estimatedRowSize,
+                                 estimatedColumnCount,
+                                 replayPosition,
+                                 minTimestamp,
+                                 maxTimestamp,
+                                 maxLocalDeletionTime,
+                                 compressionRatio,
+                                 estimatedTombstoneDropTime,
+                                 newLevel,
+                                 minColumnNames,
+                                 maxColumnNames,
+                                 hasLegacyCounterShards,
+                                 repairedAt);
+    }
+
+    public StatsMetadata mutateRepairedAt(long newRepairedAt)
+    {
+        return new StatsMetadata(estimatedRowSize,
+                                 estimatedColumnCount,
+                                 replayPosition,
+                                 minTimestamp,
+                                 maxTimestamp,
+                                 maxLocalDeletionTime,
+                                 compressionRatio,
+                                 estimatedTombstoneDropTime,
+                                 sstableLevel,
+                                 minColumnNames,
+                                 maxColumnNames,
+                                 hasLegacyCounterShards,
+                                 newRepairedAt);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        StatsMetadata that = (StatsMetadata) o;
+        return new EqualsBuilder()
+                       .append(estimatedRowSize, that.estimatedRowSize)
+                       .append(estimatedColumnCount, that.estimatedColumnCount)
+                       .append(replayPosition, that.replayPosition)
+                       .append(minTimestamp, that.minTimestamp)
+                       .append(maxTimestamp, that.maxTimestamp)
+                       .append(maxLocalDeletionTime, that.maxLocalDeletionTime)
+                       .append(compressionRatio, that.compressionRatio)
+                       .append(estimatedTombstoneDropTime, that.estimatedTombstoneDropTime)
+                       .append(sstableLevel, that.sstableLevel)
+                       .append(repairedAt, that.repairedAt)
+                       .append(maxColumnNames, that.maxColumnNames)
+                       .append(minColumnNames, that.minColumnNames)
+                       .append(hasLegacyCounterShards, that.hasLegacyCounterShards)
+                       .build();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return new HashCodeBuilder()
+                       .append(estimatedRowSize)
+                       .append(estimatedColumnCount)
+                       .append(replayPosition)
+                       .append(minTimestamp)
+                       .append(maxTimestamp)
+                       .append(maxLocalDeletionTime)
+                       .append(compressionRatio)
+                       .append(estimatedTombstoneDropTime)
+                       .append(sstableLevel)
+                       .append(repairedAt)
+                       .append(maxColumnNames)
+                       .append(minColumnNames)
+                       .append(hasLegacyCounterShards)
+                       .build();
+    }
+
+    public static class StatsMetadataSerializer implements IMetadataComponentSerializer<StatsMetadata>
+    {
+        public int serializedSize(StatsMetadata component) throws IOException
+        {
+            int size = 0;
+            size += EstimatedHistogram.serializer.serializedSize(component.estimatedRowSize, TypeSizes.NATIVE);
+            size += EstimatedHistogram.serializer.serializedSize(component.estimatedColumnCount, TypeSizes.NATIVE);
+            size += ReplayPosition.serializer.serializedSize(component.replayPosition, TypeSizes.NATIVE);
+            size += 8 + 8 + 4 + 8 + 8; // min/max timestamp (long), maxLocalDeletionTime (int), compressionRatio (double), repairedAt (long)
+            size += StreamingHistogram.serializer.serializedSize(component.estimatedTombstoneDropTime, TypeSizes.NATIVE);
+            size += TypeSizes.NATIVE.sizeof(component.sstableLevel);
+            // min column names
+            size += 4;
+            for (ByteBuffer columnName : component.minColumnNames)
+                size += 2 + columnName.remaining(); // with short length
+            // max column names
+            size += 4;
+            for (ByteBuffer columnName : component.maxColumnNames)
+                size += 2 + columnName.remaining(); // with short length
+            size += TypeSizes.NATIVE.sizeof(component.hasLegacyCounterShards);
+            return size;
+        }
+
+        public void serialize(StatsMetadata component, DataOutputPlus out) throws IOException
+        {
+            EstimatedHistogram.serializer.serialize(component.estimatedRowSize, out);
+            EstimatedHistogram.serializer.serialize(component.estimatedColumnCount, out);
+            ReplayPosition.serializer.serialize(component.replayPosition, out);
+            out.writeLong(component.minTimestamp);
+            out.writeLong(component.maxTimestamp);
+            out.writeInt(component.maxLocalDeletionTime);
+            out.writeDouble(component.compressionRatio);
+            StreamingHistogram.serializer.serialize(component.estimatedTombstoneDropTime, out);
+            out.writeInt(component.sstableLevel);
+            out.writeLong(component.repairedAt);
+            out.writeInt(component.minColumnNames.size());
+            for (ByteBuffer columnName : component.minColumnNames)
+                ByteBufferUtil.writeWithShortLength(columnName, out);
+            out.writeInt(component.maxColumnNames.size());
+            for (ByteBuffer columnName : component.maxColumnNames)
+                ByteBufferUtil.writeWithShortLength(columnName, out);
+            out.writeBoolean(component.hasLegacyCounterShards);
+        }
+
+        public StatsMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
+        {
+            EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(in);
+            EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(in);
+            ReplayPosition replayPosition = ReplayPosition.serializer.deserialize(in);
+            long minTimestamp = in.readLong();
+            long maxTimestamp = in.readLong();
+            int maxLocalDeletionTime = in.readInt();
+            double compressionRatio = in.readDouble();
+            StreamingHistogram tombstoneHistogram = StreamingHistogram.serializer.deserialize(in);
+            int sstableLevel = in.readInt();
+            long repairedAt = 0;
+            if (version.hasRepairedAt)
+                repairedAt = in.readLong();
+
+            int colCount = in.readInt();
+            List<ByteBuffer> minColumnNames = new ArrayList<>(colCount);
+            for (int i = 0; i < colCount; i++)
+                minColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+
+            colCount = in.readInt();
+            List<ByteBuffer> maxColumnNames = new ArrayList<>(colCount);
+            for (int i = 0; i < colCount; i++)
+                maxColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+
+            boolean hasLegacyCounterShards = true;
+            if (version.tracksLegacyCounterShards)
+                hasLegacyCounterShards = in.readBoolean();
+
+            return new StatsMetadata(rowSizes,
+                                     columnCounts,
+                                     replayPosition,
+                                     minTimestamp,
+                                     maxTimestamp,
+                                     maxLocalDeletionTime,
+                                     compressionRatio,
+                                     tombstoneHistogram,
+                                     sstableLevel,
+                                     minColumnNames,
+                                     maxColumnNames,
+                                     hasLegacyCounterShards,
+                                     repairedAt);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java
new file mode 100644
index 0000000..e00c55c
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * SSTable metadata component used only for validating an SSTable.
+ *
+ * This part is read before opening the main Data.db file for validation
+ * and is discarded immediately afterwards.
+ */
+public class ValidationMetadata extends MetadataComponent
+{
+    public static final IMetadataComponentSerializer serializer = new ValidationMetadataSerializer();
+
+    public final String partitioner;
+    public final double bloomFilterFPChance;
+
+    public ValidationMetadata(String partitioner, double bloomFilterFPChance)
+    {
+        this.partitioner = partitioner;
+        this.bloomFilterFPChance = bloomFilterFPChance;
+    }
+
+    public MetadataType getType()
+    {
+        return MetadataType.VALIDATION;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        ValidationMetadata that = (ValidationMetadata) o;
+        return Double.compare(that.bloomFilterFPChance, bloomFilterFPChance) == 0 && partitioner.equals(that.partitioner);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int result;
+        long temp;
+        result = partitioner.hashCode();
+        temp = Double.doubleToLongBits(bloomFilterFPChance);
+        result = 31 * result + (int) (temp ^ (temp >>> 32));
+        return result;
+    }
+
+    public static class ValidationMetadataSerializer implements IMetadataComponentSerializer<ValidationMetadata>
+    {
+        public int serializedSize(ValidationMetadata component) throws IOException
+        {
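+            // UTF length-prefixed partitioner class name plus 8 bytes for the bloomFilterFPChance double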
+            return TypeSizes.NATIVE.sizeof(component.partitioner) + 8;
+        }
+
+        public void serialize(ValidationMetadata component, DataOutputPlus out) throws IOException
+        {
+            out.writeUTF(component.partitioner);
+            out.writeDouble(component.bloomFilterFPChance);
+        }
+
+        public ValidationMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
+        {
+
+            return new ValidationMetadata(in.readUTF(), in.readDouble());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/AbstractDataOutput.java b/src/java/org/apache/cassandra/io/util/AbstractDataOutput.java
index 149d82a..3e38293 100644
--- a/src/java/org/apache/cassandra/io/util/AbstractDataOutput.java
+++ b/src/java/org/apache/cassandra/io/util/AbstractDataOutput.java
@@ -17,12 +17,14 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.io.DataOutput;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.io.UTFDataFormatException;
+import java.nio.ByteBuffer;
 
-public abstract class AbstractDataOutput extends OutputStream implements DataOutput
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class AbstractDataOutput extends OutputStream implements DataOutputPlus
 {
     /*
     !! DataOutput methods below are copied from the implementation in Apache Harmony RandomAccessFile.
@@ -207,7 +209,7 @@
         write((val >>> 24) & 0xFF);
         write((val >>> 16) & 0xFF);
         write((val >>>  8) & 0xFF);
-        write((val >>>  0) & 0xFF);
+        write((val >>> 0) & 0xFF);
     }
 
     /**
@@ -229,7 +231,7 @@
         write((int)(val >>> 24) & 0xFF);
         write((int)(val >>> 16) & 0xFF);
         write((int)(val >>>  8) & 0xFF);
-        write((int)(val >>>  0) & 0xFF);
+        write((int) (val >>> 0) & 0xFF);
     }
 
     /**
@@ -291,4 +293,37 @@
         utfBytes[1] = (byte) utfCount;
         write(utfBytes);
     }
+
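+    // scratch buffer, lazily allocated and reused across write(ByteBuffer) calls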
+    private byte[] buf;
+    public synchronized void write(ByteBuffer buffer) throws IOException
+    {
+        int len = buffer.remaining();
+        if (len < 16)
+        {
+            int offset = buffer.position();
+            for (int i = 0 ; i < len ; i++)
+                write(buffer.get(i + offset));
+            return;
+        }
+
+        byte[] buf = this.buf;
+        if (buf == null)
+            this.buf = buf = new byte[256];
+
+        int offset = 0;
+        while (len > 0)
+        {
+            int sublen = Math.min(buf.length, len);
+            ByteBufferUtil.arrayCopy(buffer, buffer.position() + offset, buf, 0, sublen);
+            write(buf, 0, sublen);
+            offset += sublen;
+            len -= sublen;
+        }
+    }
+
+    public void write(Memory memory) throws IOException
+    {
+        for (ByteBuffer buffer : memory.asByteBuffers())
+            write(buffer);
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java
index 6a23fde..b284f61 100644
--- a/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java
@@ -38,6 +38,11 @@
             long length = new File(path).length();
             return new BufferedPoolingSegmentedFile(path, length);
         }
+
+        public SegmentedFile openEarly(String path)
+        {
+            return complete(path);
+        }
     }
 
     protected RandomAccessReader createReader(String path)
diff --git a/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java
index 790b42b..aa031e3 100644
--- a/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java
@@ -38,6 +38,11 @@
             long length = new File(path).length();
             return new BufferedSegmentedFile(path, length);
         }
+
+        public SegmentedFile openEarly(String path)
+        {
+            return complete(path);
+        }
     }
 
     public FileDataInput getSegment(long position)
diff --git a/src/java/org/apache/cassandra/io/util/ChecksummedOutputStream.java b/src/java/org/apache/cassandra/io/util/ChecksummedOutputStream.java
deleted file mode 100644
index 8cb4403..0000000
--- a/src/java/org/apache/cassandra/io/util/ChecksummedOutputStream.java
+++ /dev/null
@@ -1,56 +0,0 @@
-package org.apache.cassandra.io.util;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.zip.Checksum;
-
-public class ChecksummedOutputStream extends OutputStream
-{
-    private final OutputStream out;
-    private final Checksum checksum;
-
-    public ChecksummedOutputStream(OutputStream out, Checksum checksum)
-    {
-        this.out = out;
-        this.checksum = checksum;
-    }
-
-    public void resetChecksum()
-    {
-        checksum.reset();
-    }
-
-    public void write(int b) throws IOException
-    {
-        out.write(b);
-        checksum.update(b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len) throws IOException
-    {
-        out.write(b, off, len);
-        checksum.update(b, off, len);
-    }
-}
diff --git a/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java b/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java
new file mode 100644
index 0000000..b95bf32
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.File;
+
+import org.apache.cassandra.io.sstable.Descriptor;
+
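+/**
+ * SequentialWriter that, in addition to its data file, maintains an incremental per-chunk
+ * checksum file (written through DataIntegrityMetadata.ChecksumWriter to the given crcPath)
+ * and can emit a full-file checksum via writeFullChecksum.
+ */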
+public class ChecksummedSequentialWriter extends SequentialWriter
+{
+    private final SequentialWriter crcWriter;
+    private final DataIntegrityMetadata.ChecksumWriter crcMetadata;
+
+    public ChecksummedSequentialWriter(File file, int bufferSize, File crcPath)
+    {
+        super(file, bufferSize);
+        crcWriter = new SequentialWriter(crcPath, 8 * 1024);
+        crcMetadata = new DataIntegrityMetadata.ChecksumWriter(crcWriter.stream);
+        crcMetadata.writeChunkSize(buffer.length);
+    }
+
+    protected void flushData()
+    {
+        super.flushData();
+        crcMetadata.append(buffer, 0, validBufferBytes);
+    }
+
+    public void writeFullChecksum(Descriptor descriptor)
+    {
+        crcMetadata.writeFullChecksum(descriptor);
+    }
+
+    public void close()
+    {
+        super.close();
+        crcWriter.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java
index 121bdb2..1803e69 100644
--- a/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.io.util;
 
 import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 
 public class CompressedPoolingSegmentedFile extends PoolingSegmentedFile implements ICompressedFile
@@ -30,8 +31,13 @@
         this.metadata = metadata;
     }
 
-    public static class Builder extends SegmentedFile.Builder
+    public static class Builder extends CompressedSegmentedFile.Builder
     {
+        public Builder(CompressedSequentialWriter writer)
+        {
+            super(writer);
+        }
+
         public void addPotentialBoundary(long boundary)
         {
             // only one segment in a standard-io file
@@ -39,7 +45,12 @@
 
         public SegmentedFile complete(String path)
         {
-            return new CompressedPoolingSegmentedFile(path, CompressionMetadata.create(path));
+            return new CompressedPoolingSegmentedFile(path, metadata(path, false));
+        }
+
+        public SegmentedFile openEarly(String path)
+        {
+            return new CompressedPoolingSegmentedFile(path, metadata(path, true));
         }
     }
 
diff --git a/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java
index d0ea3fd..4afe0a0 100644
--- a/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.io.util;
 
 import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 
 public class CompressedSegmentedFile extends SegmentedFile implements ICompressedFile
@@ -32,14 +33,35 @@
 
     public static class Builder extends SegmentedFile.Builder
     {
+        protected final CompressedSequentialWriter writer;
+        public Builder(CompressedSequentialWriter writer)
+        {
+            this.writer = writer;
+        }
+
         public void addPotentialBoundary(long boundary)
         {
             // only one segment in a standard-io file
         }
 
+        protected CompressionMetadata metadata(String path, boolean early)
+        {
+            if (writer == null)
+                return CompressionMetadata.create(path);
+            else if (early)
+                return writer.openEarly();
+            else
+                return writer.openAfterClose();
+        }
+
         public SegmentedFile complete(String path)
         {
-            return new CompressedSegmentedFile(path, CompressionMetadata.create(path));
+            return new CompressedSegmentedFile(path, metadata(path, false));
+        }
+
+        public SegmentedFile openEarly(String path)
+        {
+            return new CompressedSegmentedFile(path, metadata(path, true));
         }
     }
 
diff --git a/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java b/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java
index f334d08..797b964 100644
--- a/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java
+++ b/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java
@@ -17,20 +17,21 @@
  */
 package org.apache.cassandra.io.util;
 
+import java.io.BufferedWriter;
 import java.io.Closeable;
+import java.io.DataOutput;
 import java.io.File;
 import java.io.IOError;
 import java.io.IOException;
-import java.nio.channels.ClosedChannelException;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.util.regex.Pattern;
+import java.nio.file.Files;
+import java.util.zip.Adler32;
 import java.util.zip.Checksum;
 
+import com.google.common.base.Charsets;
+
+import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.utils.Hex;
 import org.apache.cassandra.utils.PureJavaCrc32;
 
 public class DataIntegrityMetadata
@@ -42,24 +43,23 @@
 
     public static class ChecksumValidator implements Closeable
     {
-        private final Checksum checksum = new PureJavaCrc32();
+        private final Checksum checksum;
         private final RandomAccessReader reader;
         private final Descriptor descriptor;
         public final int chunkSize;
 
-        public ChecksumValidator(Descriptor desc) throws IOException
+        public ChecksumValidator(Descriptor descriptor) throws IOException
         {
-            this.descriptor = desc;
-            reader = RandomAccessReader.open(new File(desc.filenameFor(Component.CRC)));
+            this.descriptor = descriptor;
+            checksum = descriptor.version.hasAllAdlerChecksums ? new Adler32() : new PureJavaCrc32();
+            reader = RandomAccessReader.open(new File(descriptor.filenameFor(Component.CRC)));
             chunkSize = reader.readInt();
         }
 
         public void seek(long offset)
         {
             long start = chunkStart(offset);
-            reader.seek(((start / chunkSize) * 4L) + 4); // 8 byte checksum per
-                                                         // chunk + 4 byte
-                                                         // header/chunkLength
+            reader.seek(((start / chunkSize) * 4L) + 4); // 4 byte checksum per chunk + 4 byte header/chunkLength
         }
 
         public long chunkStart(long offset)
@@ -84,38 +84,22 @@
         }
     }
 
-    public static ChecksumWriter checksumWriter(Descriptor desc)
+    public static class ChecksumWriter
     {
-        return new ChecksumWriter(desc);
-    }
+        private final Checksum incrementalChecksum = new Adler32();
+        private final DataOutput incrementalOut;
+        private final Checksum fullChecksum = new Adler32();
 
-    public static class ChecksumWriter implements Closeable
-    {
-        private final Checksum checksum = new PureJavaCrc32();
-        private final MessageDigest digest;
-        private final SequentialWriter writer;
-        private final Descriptor descriptor;
-
-        public ChecksumWriter(Descriptor desc)
+        public ChecksumWriter(DataOutput incrementalOut)
         {
-            this.descriptor = desc;
-            writer = SequentialWriter.open(new File(desc.filenameFor(Component.CRC)), true);
-            try
-            {
-                digest = MessageDigest.getInstance("SHA-1");
-            }
-            catch (NoSuchAlgorithmException e)
-            {
-                // SHA-1 is standard in java 6
-                throw new RuntimeException(e);
-            }
+            this.incrementalOut = incrementalOut;
         }
 
         public void writeChunkSize(int length)
         {
             try
             {
-                writer.stream.writeInt(length);
+                incrementalOut.writeInt(length);
             }
             catch (IOException e)
             {
@@ -127,11 +111,11 @@
         {
             try
             {
-                checksum.update(buffer, start, end);
-                writer.stream.writeInt((int) checksum.getValue());
-                checksum.reset();
+                incrementalChecksum.update(buffer, start, end);
+                incrementalOut.writeInt((int) incrementalChecksum.getValue());
+                incrementalChecksum.reset();
 
-                digest.update(buffer, start, end);
+                fullChecksum.update(buffer, start, end);
             }
             catch (IOException e)
             {
@@ -139,24 +123,18 @@
             }
         }
 
-        public void close()
+        public void writeFullChecksum(Descriptor descriptor)
         {
-            FileUtils.closeQuietly(writer);
-            byte[] bytes = digest.digest();
-            if (bytes == null)
-                return;
-            SequentialWriter out = SequentialWriter.open(new File(descriptor.filenameFor(SSTable.COMPONENT_DIGEST)), true);
-            // Writting output compatible with sha1sum
-            Descriptor newdesc = descriptor.asTemporary(false);
-            String[] tmp = newdesc.filenameFor(SSTable.COMPONENT_DATA).split(Pattern.quote(File.separator));
-            String dataFileName = tmp[tmp.length - 1];
+            File outFile = new File(descriptor.filenameFor(Component.DIGEST));
+            BufferedWriter out = null;
             try
             {
-                out.write(String.format("%s  %s", Hex.bytesToHex(bytes), dataFileName).getBytes());
+                out = Files.newBufferedWriter(outFile.toPath(), Charsets.UTF_8);
+                out.write(String.valueOf(fullChecksum.getValue()));
             }
-            catch (ClosedChannelException e)
+            catch (IOException e)
             {
-                throw new AssertionError(); // can't happen.
+                throw new FSWriteError(e, outFile);
             }
             finally
             {
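
A hedged sketch of the reworked checksum flow (the crcOut, chunkSize and descriptor names are assumptions, and the per-chunk append method's signature sits outside this hunk): per-chunk Adler32 values are written to whatever DataOutput the caller supplies for the CRC component, and a single Adler32 over the whole file replaces the old SHA-1 digest in the Digest component. On the read side, ChecksumValidator picks Adler32 or PureJavaCrc32 from descriptor.version.hasAllAdlerChecksums, so older sstables still validate.

    SequentialWriter crcOut = SequentialWriter.open(new File(descriptor.filenameFor(Component.CRC)));
    DataIntegrityMetadata.ChecksumWriter checksummer = new DataIntegrityMetadata.ChecksumWriter(crcOut.stream);
    checksummer.writeChunkSize(chunkSize);        // 4-byte header later read back by ChecksumValidator
    // ... for each flushed chunk: update both checksums and emit a 4-byte incremental value ...
    checksummer.writeFullChecksum(descriptor);    // decimal Adler32 of the whole file -> Digest component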
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
index 4dcdda3..7577567 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
@@ -17,8 +17,9 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.io.DataOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
 
 
 /**
@@ -27,7 +28,7 @@
  *
  * This class is completely thread unsafe.
  */
-public final class DataOutputBuffer extends DataOutputStream
+public final class DataOutputBuffer extends DataOutputStreamPlus
 {
     public DataOutputBuffer()
     {
@@ -65,6 +66,11 @@
         }
     }
 
+    public void write(ByteBuffer buffer) throws IOException
+    {
+        ((FastByteArrayOutputStream) out).write(buffer);
+    }
+
     /**
      * Returns the current contents of the buffer. Data is only valid to
      * {@link #getLength()}.
@@ -74,6 +80,19 @@
         return ((FastByteArrayOutputStream) out).buf;
     }
 
+    public byte[] toByteArray()
+    {
+        FastByteArrayOutputStream out = (FastByteArrayOutputStream) this.out;
+        return Arrays.copyOfRange(out.buf, 0, out.count);
+
+    }
+
+    public ByteBuffer asByteBuffer()
+    {
+        FastByteArrayOutputStream out = (FastByteArrayOutputStream) this.out;
+        return ByteBuffer.wrap(out.buf, 0, out.count);
+    }
+
     /** Returns the length of the valid data currently in the buffer. */
     public int getLength()
     {
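
A short sketch of the new accessors (a throwaway buffer; nothing from the surrounding code is assumed): getData() still exposes the raw backing array, valid only up to getLength(), while toByteArray() copies the valid region and asByteBuffer() wraps it without copying.

    DataOutputBuffer out = new DataOutputBuffer();
    out.writeLong(42L);
    out.write(ByteBuffer.wrap(new byte[]{ 1, 2, 3 }));   // new ByteBuffer overload
    byte[] copy = out.toByteArray();                      // copy of buf[0, count)
    ByteBuffer view = out.asByteBuffer();                 // zero-copy view of the same region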
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputByteBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputByteBuffer.java
new file mode 100644
index 0000000..b40d30e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataOutputByteBuffer.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+
+/**
+ * An implementation of DataOutput that writes directly into a caller-supplied ByteBuffer,
+ * avoiding an intermediate buffer and the copies that would go with it.
+ *
+ * This class is completely thread unsafe.
+ */
+public final class DataOutputByteBuffer extends AbstractDataOutput
+{
+
+    final ByteBuffer buffer;
+    public DataOutputByteBuffer(ByteBuffer buffer)
+    {
+        this.buffer = buffer;
+    }
+
+    @Override
+    public void write(int b)
+    {
+        buffer.put((byte) b);
+    }
+
+    @Override
+    public void write(byte[] b, int off, int len)
+    {
+        buffer.put(b, off, len);
+    }
+
+    public void write(ByteBuffer buffer) throws IOException
+    {
+        int len = buffer.remaining();
+        ByteBufferUtil.arrayCopy(buffer, buffer.position(), this.buffer, this.buffer.position(), len);
+        this.buffer.position(this.buffer.position() + len);
+    }
+}
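
A minimal sketch (the destination size is an assumption): DataOutputByteBuffer layers DataOutput semantics over a caller-supplied ByteBuffer, and its write(ByteBuffer) copies without disturbing the source buffer's position.

    ByteBuffer dest = ByteBuffer.allocate(16);
    DataOutputByteBuffer out = new DataOutputByteBuffer(dest);
    out.writeInt(7);                                     // multi-byte primitives come via AbstractDataOutput
    out.write(ByteBuffer.wrap(new byte[]{ 1, 2, 3 }));   // source position is left untouched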
diff --git a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
similarity index 72%
rename from src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
rename to src/java/org/apache/cassandra/io/util/DataOutputPlus.java
index afd268d..36c25ee 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SimpleAbstractColumnIterator.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
@@ -15,15 +15,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.io.util;
 
+import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.db.OnDiskAtom;
-
-public abstract class SimpleAbstractColumnIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
+public interface DataOutputPlus extends DataOutput
 {
-    public void close() throws IOException {}
+
+    // write the buffer without modifying its position
+    void write(ByteBuffer buffer) throws IOException;
+
+    void write(Memory memory) throws IOException;
+
 }
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputStreamAndChannel.java b/src/java/org/apache/cassandra/io/util/DataOutputStreamAndChannel.java
new file mode 100644
index 0000000..30cf38b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataOutputStreamAndChannel.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.WritableByteChannel;
+
+public class DataOutputStreamAndChannel extends DataOutputStreamPlus
+{
+    private final WritableByteChannel channel;
+    public DataOutputStreamAndChannel(OutputStream os, WritableByteChannel channel)
+    {
+        super(os);
+        this.channel = channel;
+    }
+    public DataOutputStreamAndChannel(WritableByteChannel channel)
+    {
+        this(Channels.newOutputStream(channel), channel);
+    }
+    public DataOutputStreamAndChannel(FileOutputStream fos)
+    {
+        this(fos, fos.getChannel());
+    }
+
+    public void write(ByteBuffer buffer) throws IOException
+    {
+        buffer = buffer.duplicate();
+        while (buffer.remaining() > 0)
+            channel.write(buffer);
+    }
+
+    public WritableByteChannel getChannel()
+    {
+        return channel;
+    }
+}
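
A minimal sketch (the file name is a placeholder): wrapping a FileOutputStream yields one object that serves both DataOutput-style primitive writes and channel-backed ByteBuffer writes, which the streaming and sstable-writing paths can share.

    try (FileOutputStream fos = new FileOutputStream("example-Data.db"))
    {
        DataOutputStreamAndChannel out = new DataOutputStreamAndChannel(fos);
        out.writeInt(0xCA55);                        // stream path
        out.write(ByteBuffer.wrap(new byte[64]));    // channel path; the input buffer is duplicated first
        out.flush();
    }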
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputStreamPlus.java b/src/java/org/apache/cassandra/io/util/DataOutputStreamPlus.java
new file mode 100644
index 0000000..6f2e21a
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataOutputStreamPlus.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * When possible use {@link DataOutputStreamAndChannel} instead of this class, as it will
+ * be more efficient. This class is only for situations where it cannot be used.
+ */
+public class DataOutputStreamPlus extends AbstractDataOutput implements DataOutputPlus
+{
+    protected final OutputStream out;
+    public DataOutputStreamPlus(OutputStream out)
+    {
+        this.out = out;
+    }
+
+    public void write(byte[] buffer, int offset, int count) throws IOException
+    {
+        out.write(buffer, offset, count);
+    }
+
+    public void write(int oneByte) throws IOException
+    {
+        out.write(oneByte);
+    }
+
+    public void close() throws IOException
+    {
+        out.close();
+    }
+
+    public void flush() throws IOException
+    {
+        out.flush();
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/FastByteArrayOutputStream.java b/src/java/org/apache/cassandra/io/util/FastByteArrayOutputStream.java
index 60cc64a..c831508 100644
--- a/src/java/org/apache/cassandra/io/util/FastByteArrayOutputStream.java
+++ b/src/java/org/apache/cassandra/io/util/FastByteArrayOutputStream.java
@@ -21,6 +21,9 @@
 import java.io.IOException;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 /*
  * This file has been modified from Apache Harmony's ByteArrayOutputStream
@@ -225,6 +228,14 @@
         this.count += len;
     }
 
+    public void write(ByteBuffer buffer)
+    {
+        int len = buffer.remaining();
+        expand(len);
+        ByteBufferUtil.arrayCopy(buffer, buffer.position(), buf, this.count, len);
+        this.count += len;
+    }
+
     /**
      * Writes the specified byte {@code oneByte} to the OutputStream. Only the
      * low order byte of {@code oneByte} is written.
diff --git a/src/java/org/apache/cassandra/io/util/FileUtils.java b/src/java/org/apache/cassandra/io/util/FileUtils.java
index a12745c..e590918 100644
--- a/src/java/org/apache/cassandra/io/util/FileUtils.java
+++ b/src/java/org/apache/cassandra/io/util/FileUtils.java
@@ -17,17 +17,24 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.io.*;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
+import java.io.Closeable;
+import java.io.DataInput;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
-import java.nio.file.Files;
 import java.nio.file.AtomicMoveNotSupportedException;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.text.DecimalFormat;
 import java.util.Arrays;
 
+import sun.nio.ch.DirectBuffer;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -40,6 +47,7 @@
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 public class FileUtils
 {
@@ -50,23 +58,28 @@
     private static final double TB = 1024*1024*1024*1024d;
 
     private static final DecimalFormat df = new DecimalFormat("#.##");
-
-    private static final Method cleanerMethod;
+    private static final boolean canCleanDirectBuffers;
 
     static
     {
-        Method m;
+        boolean canClean = false;
         try
         {
-            m = Class.forName("sun.nio.ch.DirectBuffer").getMethod("cleaner");
+            ByteBuffer buf = ByteBuffer.allocateDirect(1);
+            ((DirectBuffer) buf).cleaner().clean();
+            canClean = true;
         }
-        catch (Exception e)
+        catch (Throwable t)
         {
-            // Perhaps a non-sun-derived JVM - contributions welcome
-            logger.info("Cannot initialize un-mmaper.  (Are you using a non-SUN JVM?)  Compacted data files will not be removed promptly.  Consider using a SUN JVM or using standard disk access mode");
-            m = null;
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.info("Cannot initialize un-mmaper.  (Are you using a non-Oracle JVM?)  Compacted data files will not be removed promptly.  Consider using an Oracle JVM or using standard disk access mode");
         }
-        cleanerMethod = m;
+        canCleanDirectBuffers = canClean;
+    }
+
+    public static void createHardLink(String from, String to)
+    {
+        createHardLink(new File(from), new File(to));
     }
 
     public static void createHardLink(File from, File to)
@@ -112,7 +125,7 @@
     {
         assert file.exists() : "attempted to delete non-existing file " + file.getName();
         if (logger.isDebugEnabled())
-            logger.debug("Deleting " + file.getName());
+            logger.debug("Deleting {}", file.getName());
         try
         {
             Files.delete(file.toPath());
@@ -213,7 +226,7 @@
         }
         catch (Exception e)
         {
-            logger.warn("Failed closing " + c, e);
+            logger.warn("Failed closing {}", c, e);
         }
     }
 
@@ -235,7 +248,7 @@
             catch (IOException ex)
             {
                 e = ex;
-                logger.warn("Failed closing stream " + c, ex);
+                logger.warn("Failed closing stream {}", c, ex);
             }
         }
         if (e != null)
@@ -268,28 +281,12 @@
 
     public static boolean isCleanerAvailable()
     {
-        return cleanerMethod != null;
+        return canCleanDirectBuffers;
     }
 
     public static void clean(MappedByteBuffer buffer)
     {
-        try
-        {
-            Object cleaner = cleanerMethod.invoke(buffer);
-            cleaner.getClass().getMethod("clean").invoke(cleaner);
-        }
-        catch (IllegalAccessException e)
-        {
-            throw new RuntimeException(e);
-        }
-        catch (InvocationTargetException e)
-        {
-            throw new RuntimeException(e);
-        }
-        catch (NoSuchMethodException e)
-        {
-            throw new RuntimeException(e);
-        }
+        ((DirectBuffer) buffer).cleaner().clean();
     }
 
     public static void createDirectory(String directory)
@@ -396,19 +393,6 @@
         }
     }
 
-    public static void skipBytesFully(DataInput in, long bytes) throws IOException
-    {
-        long n = 0;
-        while (n < bytes)
-        {
-            int m = (int) Math.min(Integer.MAX_VALUE, bytes - n);
-            int skipped = in.skipBytes(m);
-            if (skipped == 0)
-                throw new EOFException("EOF after " + n + " bytes out of " + bytes);
-            n += skipped;
-        }
-    }
-
     public static void handleCorruptSSTable(CorruptSSTableException e)
     {
         if (DatabaseDescriptor.getDiskFailurePolicy() == Config.DiskFailurePolicy.stop_paranoid)
@@ -441,4 +425,21 @@
         }
     }
 
+    /**
+     * Get the size of a directory in bytes
+     * @param directory The directory for which we need size.
+     * @return The size of the directory
+     */
+    public static long folderSize(File directory)
+    {
+        long length = 0;
+        for (File file : directory.listFiles())
+        {
+            if (file.isFile())
+                length += file.length();
+            else
+                length += folderSize(file);
+        }
+        return length;
+    }
 }
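
A short sketch of the additions (the paths and file names are hypothetical): the direct-buffer cleaner is now probed once at class-load time with a real 1-byte buffer, so callers only need the boolean check, while folderSize() and the String-based createHardLink overload round out the utility surface.

    File cfDir = new File("/var/lib/cassandra/data/ks/cf");          // hypothetical layout
    long onDisk = FileUtils.folderSize(cfDir);                       // recursive file-size sum
    FileUtils.createHardLink(cfDir + "/ks-cf-ka-1-Data.db",          // new String-based overload
                             cfDir + "/snapshots/s1/ks-cf-ka-1-Data.db");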
diff --git a/src/java/org/apache/cassandra/io/util/MappedFileDataInput.java b/src/java/org/apache/cassandra/io/util/MappedFileDataInput.java
index 786d312..f397ddc 100644
--- a/src/java/org/apache/cassandra/io/util/MappedFileDataInput.java
+++ b/src/java/org/apache/cassandra/io/util/MappedFileDataInput.java
@@ -150,9 +150,10 @@
     }
 
     @Override
-    public final void readFully(byte[] buffer) throws IOException
+    public final void readFully(byte[] bytes) throws IOException
     {
-        throw new UnsupportedOperationException("use readBytes instead");
+        ByteBufferUtil.arrayCopy(buffer, buffer.position() + position, bytes, 0, bytes.length);
+        position += bytes.length;
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/io/util/Memory.java b/src/java/org/apache/cassandra/io/util/Memory.java
index 263205b..5306433 100644
--- a/src/java/org/apache/cassandra/io/util/Memory.java
+++ b/src/java/org/apache/cassandra/io/util/Memory.java
@@ -17,10 +17,15 @@
  */
 package org.apache.cassandra.io.util;
 
+import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 
+import com.sun.jna.Native;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.memory.MemoryUtil;
 import sun.misc.Unsafe;
+import sun.nio.ch.DirectBuffer;
 
 /**
  * An off-heap region of memory that must be manually free'd when no longer needed.
@@ -142,6 +147,24 @@
         }
     }
 
+    public void setBytes(long memoryOffset, ByteBuffer buffer)
+    {
+        if (buffer == null)
+            throw new NullPointerException();
+        else if (buffer.remaining() == 0)
+            return;
+        checkPosition(memoryOffset + buffer.remaining());
+        if (buffer.hasArray())
+        {
+            setBytes(memoryOffset, buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
+        }
+        else if (buffer instanceof DirectBuffer)
+        {
+            unsafe.copyMemory(((DirectBuffer) buffer).address() + buffer.position(), peer + memoryOffset, buffer.remaining());
+        }
+        else
+            throw new IllegalStateException();
+    }
     /**
      * Transfers count bytes from buffer to Memory
      *
@@ -251,7 +274,7 @@
         long end = memoryOffset + count;
         checkPosition(end - 1);
 
-        unsafe.copyMemory(null, peer + memoryOffset, buffer, BYTE_ARRAY_BASE_OFFSET + bufferOffset, count);
+        FastByteOperations.UnsafeOperations.copy(null, peer + memoryOffset, buffer, bufferOffset, count);
     }
 
     private void checkPosition(long offset)
@@ -260,6 +283,18 @@
         assert offset >= 0 && offset < size : "Illegal offset: " + offset + ", size: " + size;
     }
 
+    public void put(long trgOffset, Memory memory, long srcOffset, long size)
+    {
+        unsafe.copyMemory(memory.peer + srcOffset, peer + trgOffset, size);
+    }
+
+    public Memory copy(long newSize)
+    {
+        Memory copy = Memory.allocate(newSize);
+        copy.put(0, this, 0, Math.min(size(), newSize));
+        return copy;
+    }
+
     public void free()
     {
         assert peer != 0;
@@ -285,5 +320,21 @@
             return true;
         return false;
     }
-}
 
+    public ByteBuffer[] asByteBuffers()
+    {
+        if (size() == 0)
+            return new ByteBuffer[0];
+
+        ByteBuffer[] result = new ByteBuffer[(int) (size() / Integer.MAX_VALUE) + 1];
+        long offset = 0;
+        int size = (int) (size() / result.length);
+        for (int i = 0 ; i < result.length - 1 ; i++)
+        {
+            result[i] = MemoryUtil.getByteBuffer(peer + offset, size);
+            offset += size;
+        }
+        result[result.length - 1] = MemoryUtil.getByteBuffer(peer + offset, (int) (size() - offset));
+        return result;
+    }
+}
\ No newline at end of file
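
A minimal sketch of the new Memory helpers: setBytes() accepts heap or direct ByteBuffers, copy() reallocates and bulk-copies, and asByteBuffers() exposes the region as one or more views, each no larger than Integer.MAX_VALUE bytes.

    Memory small = Memory.allocate(16);
    small.setBytes(0, ByteBuffer.wrap(new byte[]{ 1, 2, 3, 4 }));
    Memory bigger = small.copy(32);              // first min(16, 32) bytes carried over
    ByteBuffer[] views = bigger.asByteBuffers(); // a single view here; >2GB regions get several
    small.free();
    bigger.free();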
diff --git a/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java
index 39a4160..450553b 100644
--- a/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java
@@ -161,28 +161,36 @@
         public SegmentedFile complete(String path)
         {
             long length = new File(path).length();
-            // add a sentinel value == length
-            if (length != boundaries.get(boundaries.size() - 1))
-                boundaries.add(length);
             // create the segments
             return new MmappedSegmentedFile(path, length, createSegments(path));
         }
 
+        public SegmentedFile openEarly(String path)
+        {
+            return complete(path);
+        }
+
         private Segment[] createSegments(String path)
         {
-            int segcount = boundaries.size() - 1;
-            Segment[] segments = new Segment[segcount];
             RandomAccessFile raf;
-
+            long length;
             try
             {
                 raf = new RandomAccessFile(path, "r");
+                length = raf.length();
             }
-            catch (FileNotFoundException e)
+            catch (IOException e)
             {
                 throw new RuntimeException(e);
             }
 
+            // add a sentinel value == length
+            List<Long> boundaries = new ArrayList<>(this.boundaries);
+            if (length != boundaries.get(boundaries.size() - 1))
+                boundaries.add(length);
+            int segcount = boundaries.size() - 1;
+            Segment[] segments = new Segment[segcount];
+
             try
             {
                 for (int i = 0; i < segcount; i++)
@@ -221,7 +229,7 @@
             super.deserializeBounds(in);
 
             int size = in.readInt();
-            List<Long> temp = new ArrayList<Long>(size);
+            List<Long> temp = new ArrayList<>(size);
             
             for (int i = 0; i < size; i++)
                 temp.add(in.readLong());
diff --git a/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java
index 892611c..01f4e31 100644
--- a/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java
@@ -21,6 +21,7 @@
 
 public abstract class PoolingSegmentedFile extends SegmentedFile
 {
+    final FileCacheService.CacheKey cacheKey = new FileCacheService.CacheKey();
     protected PoolingSegmentedFile(String path, long length)
     {
         super(path, length);
@@ -33,7 +34,7 @@
 
     public FileDataInput getSegment(long position)
     {
-        RandomAccessReader reader = FileCacheService.instance.get(path);
+        RandomAccessReader reader = FileCacheService.instance.get(cacheKey);
 
         if (reader == null)
             reader = createReader(path);
@@ -46,11 +47,11 @@
 
     public void recycle(RandomAccessReader reader)
     {
-        FileCacheService.instance.put(reader);
+        FileCacheService.instance.put(cacheKey, reader);
     }
 
     public void cleanup()
     {
-        FileCacheService.instance.invalidate(path);
+        FileCacheService.instance.invalidate(cacheKey, path);
     }
 }
diff --git a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
index 09ecac0..81e45b5 100644
--- a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
@@ -368,6 +368,11 @@
         return fileLength;
     }
 
+    public long getPosition()
+    {
+        return current;
+    }
+
     @Override
     public void write(int value)
     {
diff --git a/src/java/org/apache/cassandra/io/util/SegmentedFile.java b/src/java/org/apache/cassandra/io/util/SegmentedFile.java
index d4da177..be549a6 100644
--- a/src/java/org/apache/cassandra/io/util/SegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/SegmentedFile.java
@@ -28,6 +28,7 @@
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -75,7 +76,12 @@
 
     public static Builder getCompressedBuilder()
     {
-        return new CompressedPoolingSegmentedFile.Builder();
+        return getCompressedBuilder(null);
+    }
+
+    public static Builder getCompressedBuilder(CompressedSequentialWriter writer)
+    {
+        return new CompressedPoolingSegmentedFile.Builder(writer);
     }
 
     public abstract FileDataInput getSegment(long position);
@@ -111,6 +117,12 @@
          */
         public abstract SegmentedFile complete(String path);
 
+        /**
+         * Like complete(), but may be called before writing has finished, so the file can be read while it is still being written.
+         * @param path The file on disk.
+         */
+        public abstract SegmentedFile openEarly(String path);
+
         public void serializeBounds(DataOutput out) throws IOException
         {
             out.writeUTF(DatabaseDescriptor.getDiskAccessMode().name());
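
Since openEarly() joins complete() as an abstract method, every Builder must now pick an early-open strategy. A hedged sketch of the trivial choice, mirroring what MmappedSegmentedFile.Builder does earlier in this patch, is simply to delegate:

    public SegmentedFile openEarly(String path)
    {
        return complete(path);   // no incremental-open optimisation for this builder
    }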
diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index dc95676..7a7eb63 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
@@ -18,18 +18,25 @@
 package org.apache.cassandra.io.util;
 
 import java.io.*;
+import java.nio.ByteBuffer;
 import java.nio.channels.ClosedChannelException;
+import java.nio.channels.WritableByteChannel;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
+import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CLibrary;
 
 /**
  * Adds buffering, mark, and fsyncing to OutputStream.  We always fsync on close; we may also
  * fsync incrementally if Config.trickle_fsync is enabled.
  */
-public class SequentialWriter extends OutputStream
+public class SequentialWriter extends OutputStream implements WritableByteChannel
 {
     // isDirty - true if this.buffer contains any un-synced bytes
     protected boolean isDirty = false, syncNeeded = false;
@@ -37,11 +44,7 @@
     // absolute path to the given file
     private final String filePath;
 
-    // so we can use the write(int) path w/o tons of new byte[] allocations
-    private final byte[] singleByteBuffer = new byte[1];
-
     protected byte[] buffer;
-    private final boolean skipIOCache;
     private final int fd;
     private final int directoryFD;
     // directory should be synced only after first file sync, in other words, only once per file
@@ -52,19 +55,16 @@
 
     protected final RandomAccessFile out;
 
-    // used if skip I/O cache was enabled
-    private long ioCacheStartOffset = 0, bytesSinceCacheFlush = 0;
-
     // whether to do trickling fsync() to avoid sudden bursts of dirty buffer flushing by kernel causing read
     // latency spikes
     private boolean trickleFsync;
     private int trickleFsyncByteInterval;
     private int bytesSinceTrickleFsync = 0;
 
-    public final DataOutputStream stream;
-    private DataIntegrityMetadata.ChecksumWriter metadata;
+    public final DataOutputPlus stream;
+    protected long lastFlushOffset;
 
-    public SequentialWriter(File file, int bufferSize, boolean skipIOCache)
+    public SequentialWriter(File file, int bufferSize)
     {
         try
         {
@@ -78,7 +78,6 @@
         filePath = file.getAbsolutePath();
 
         buffer = new byte[bufferSize];
-        this.skipIOCache = skipIOCache;
         this.trickleFsync = DatabaseDescriptor.getTrickleFsync();
         this.trickleFsyncByteInterval = DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024;
 
@@ -92,28 +91,46 @@
         }
 
         directoryFD = CLibrary.tryOpenDirectory(file.getParent());
-        stream = new DataOutputStream(this);
+        stream = new DataOutputStreamAndChannel(this, this);
     }
 
     public static SequentialWriter open(File file)
     {
-        return open(file, RandomAccessReader.DEFAULT_BUFFER_SIZE, false);
+        return open(file, RandomAccessReader.DEFAULT_BUFFER_SIZE);
     }
 
-    public static SequentialWriter open(File file, boolean skipIOCache)
+    public static SequentialWriter open(File file, int bufferSize)
     {
-        return open(file, RandomAccessReader.DEFAULT_BUFFER_SIZE, skipIOCache);
+        return new SequentialWriter(file, bufferSize);
     }
 
-    public static SequentialWriter open(File file, int bufferSize, boolean skipIOCache)
+    public static ChecksummedSequentialWriter open(File file, File crcPath)
     {
-        return new SequentialWriter(file, bufferSize, skipIOCache);
+        return new ChecksummedSequentialWriter(file, RandomAccessReader.DEFAULT_BUFFER_SIZE, crcPath);
+    }
+
+    public static CompressedSequentialWriter open(String dataFilePath,
+                                                  String offsetsPath,
+                                                  CompressionParameters parameters,
+                                                  MetadataCollector sstableMetadataCollector)
+    {
+        return new CompressedSequentialWriter(new File(dataFilePath), offsetsPath, parameters, sstableMetadataCollector);
     }
 
     public void write(int value) throws ClosedChannelException
     {
-        singleByteBuffer[0] = (byte) value;
-        write(singleByteBuffer, 0, 1);
+        if (current >= bufferOffset + buffer.length)
+            reBuffer();
+
+        assert current < bufferOffset + buffer.length
+                : String.format("File (%s) offset %d, buffer offset %d.", getPath(), current, bufferOffset);
+
+        buffer[bufferCursor()] = (byte) value;
+
+        validBufferBytes += 1;
+        current += 1;
+        isDirty = true;
+        syncNeeded = true;
     }
 
     public void write(byte[] buffer) throws ClosedChannelException
@@ -136,8 +153,55 @@
         }
     }
 
+    public int write(ByteBuffer src) throws IOException
+    {
+        if (buffer == null)
+            throw new ClosedChannelException();
+        int length = src.remaining();
+        int remaining = length;
+        int offset = src.position();
+        while (remaining > 0)
+        {
+            int n = writeAtMost(src, offset, remaining);
+            offset += n;
+            remaining -= n;
+            isDirty = true;
+            syncNeeded = true;
+        }
+        src.position(offset);
+        return length; // number of bytes written, per the WritableByteChannel contract
+    }
+
     /*
-     * Write at most "length" bytes from "b" starting at position "offset", and
+     * Write at most "length" bytes from "data" starting at position "offset", and
+     * return the number of bytes written. caller is responsible for setting
+     * isDirty.
+     */
+    private int writeAtMost(ByteBuffer data, int offset, int length)
+    {
+        if (current >= bufferOffset + buffer.length)
+            reBuffer();
+
+        assert current < bufferOffset + buffer.length
+                : String.format("File (%s) offset %d, buffer offset %d.", getPath(), current, bufferOffset);
+
+
+        int toCopy = Math.min(length, buffer.length - bufferCursor());
+
+        // copy bytes from external buffer
+        ByteBufferUtil.arrayCopy(data, offset, buffer, bufferCursor(), toCopy);
+
+        assert current <= bufferOffset + buffer.length
+                : String.format("File (%s) offset %d, buffer offset %d.", getPath(), current, bufferOffset);
+
+        validBufferBytes = Math.max(validBufferBytes, bufferCursor() + toCopy);
+        current += toCopy;
+
+        return toCopy;
+    }
+
+    /*
+     * Write at most "length" bytes from "data" starting at position "offset", and
      * return the number of bytes written. caller is responsible for setting
      * isDirty.
      */
@@ -228,23 +292,6 @@
                 }
             }
 
-            if (skipIOCache)
-            {
-                // we don't know when the data reaches disk since we aren't
-                // calling flush
-                // so we continue to clear pages we don't need from the first
-                // offset we see
-                // periodically we update this starting offset
-                bytesSinceCacheFlush += validBufferBytes;
-
-                if (bytesSinceCacheFlush >= RandomAccessReader.CACHE_FLUSH_INTERVAL_IN_BYTES)
-                {
-                    CLibrary.trySkipCache(this.fd, ioCacheStartOffset, 0);
-                    ioCacheStartOffset = bufferOffset;
-                    bytesSinceCacheFlush = 0;
-                }
-            }
-
             // Remember that we wrote, so we don't write it again on next flush().
             resetBuffer();
 
@@ -261,14 +308,12 @@
         try
         {
             out.write(buffer, 0, validBufferBytes);
+            lastFlushOffset += validBufferBytes;
         }
         catch (IOException e)
         {
             throw new FSWriteError(e, getPath());
         }
-
-        if (metadata != null)
-            metadata.append(buffer, 0, validBufferBytes);
     }
 
     public long getFilePointer()
@@ -360,6 +405,11 @@
         resetBuffer();
     }
 
+    public long getLastFlushOffset()
+    {
+        return lastFlushOffset;
+    }
+
     public void truncate(long toSize)
     {
         try
@@ -372,6 +422,11 @@
         }
     }
 
+    public boolean isOpen()
+    {
+        return out.getChannel().isOpen();
+    }
+
     @Override
     public void close()
     {
@@ -382,9 +437,6 @@
 
         buffer = null;
 
-        if (skipIOCache && bytesSinceCacheFlush > 0)
-            CLibrary.trySkipCache(fd, 0, 0);
-
         try
         {
             out.close();
@@ -394,21 +446,12 @@
             throw new FSWriteError(e, getPath());
         }
 
-        FileUtils.closeQuietly(metadata);
         CLibrary.tryCloseFD(directoryFD);
     }
 
-    /**
-     * Turn on digest computation on this writer.
-     * This can only be called before any data is written to this write,
-     * otherwise an IllegalStateException is thrown.
-     */
-    public void setDataIntegrityWriter(DataIntegrityMetadata.ChecksumWriter writer)
+    // hack to make life easier for subclasses
+    public void writeFullChecksum(Descriptor descriptor)
     {
-        if (current != 0)
-            throw new IllegalStateException();
-        metadata = writer;
-        metadata.writeChunkSize(buffer.length);
     }
 
     /**
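
A minimal sketch of the reshaped surface (file names and the 64KB buffer are placeholders): SequentialWriter is now itself a WritableByteChannel, the skipIOCache flag is gone, and the checksummed and compressed variants are created through the new open() overloads instead of setDataIntegrityWriter().

    SequentialWriter plain = SequentialWriter.open(new File("tmp-Data.db"), 64 * 1024);
    plain.write(ByteBuffer.wrap(new byte[]{ 1, 2, 3 }));   // channel-style write
    long flushed = plain.getLastFlushOffset();             // bytes handed to the OS so far
    plain.close();

    ChecksummedSequentialWriter checked =
        SequentialWriter.open(new File("tmp-Data.db"), new File("tmp-CRC.db"));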
diff --git a/src/java/org/apache/cassandra/locator/Ec2MultiRegionSnitch.java b/src/java/org/apache/cassandra/locator/Ec2MultiRegionSnitch.java
index bd5e091..511cbb5 100644
--- a/src/java/org/apache/cassandra/locator/Ec2MultiRegionSnitch.java
+++ b/src/java/org/apache/cassandra/locator/Ec2MultiRegionSnitch.java
@@ -48,10 +48,11 @@
     {
         super();
         localPublicAddress = InetAddress.getByName(awsApiCall(PUBLIC_IP_QUERY_URL));
-        logger.info("EC2Snitch using publicIP as identifier: " + localPublicAddress);
+        logger.info("EC2Snitch using publicIP as identifier: {}", localPublicAddress);
         localPrivateAddress = awsApiCall(PRIVATE_IP_QUERY_URL);
         // use the Public IP to broadcast Address to other nodes.
         DatabaseDescriptor.setBroadcastAddress(localPublicAddress);
+        DatabaseDescriptor.setBroadcastRpcAddress(localPublicAddress);
     }
 
     public void gossiperStarting()
diff --git a/src/java/org/apache/cassandra/locator/Ec2Snitch.java b/src/java/org/apache/cassandra/locator/Ec2Snitch.java
index 8b404d1..59eb27b 100644
--- a/src/java/org/apache/cassandra/locator/Ec2Snitch.java
+++ b/src/java/org/apache/cassandra/locator/Ec2Snitch.java
@@ -62,9 +62,9 @@
         if (ec2region.endsWith("1"))
             ec2region = az.substring(0, az.length() - 3);
 
-        String datacenterSuffix = SnitchProperties.get("dc_suffix", "");
+        String datacenterSuffix = (new SnitchProperties()).get("dc_suffix", "");
         ec2region = ec2region.concat(datacenterSuffix);
-        logger.info("EC2Snitch using region: " + ec2region + ", zone: " + ec2zone + ".");
+        logger.info("EC2Snitch using region: {}, zone: {}.", ec2region, ec2zone);
     }
 
     String awsApiCall(String url) throws IOException, ConfigurationException
diff --git a/src/java/org/apache/cassandra/locator/GossipingPropertyFileSnitch.java b/src/java/org/apache/cassandra/locator/GossipingPropertyFileSnitch.java
index dd1637d..2c0980a 100644
--- a/src/java/org/apache/cassandra/locator/GossipingPropertyFileSnitch.java
+++ b/src/java/org/apache/cassandra/locator/GossipingPropertyFileSnitch.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.locator;
 
 import java.net.InetAddress;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.Map;
 
 import org.slf4j.Logger;
@@ -29,8 +30,10 @@
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.EndpointState;
 import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ResourceWatcher;
+import org.apache.cassandra.utils.WrappedRunnable;
 
 
 public class GossipingPropertyFileSnitch extends AbstractNetworkTopologySnitch// implements IEndpointStateChangeSubscriber
@@ -38,31 +41,55 @@
     private static final Logger logger = LoggerFactory.getLogger(GossipingPropertyFileSnitch.class);
 
     private PropertyFileSnitch psnitch;
-    private String myDC;
-    private String myRack;
-    private Map<InetAddress, Map<String, String>> savedEndpoints;
-    private String DEFAULT_DC = "UNKNOWN_DC";
-    private String DEFAULT_RACK = "UNKNOWN_RACK";
-    private final boolean preferLocal;
 
+    private volatile String myDC;
+    private volatile String myRack;
+    private volatile boolean preferLocal;
+    private AtomicReference<ReconnectableSnitchHelper> snitchHelperReference;
+    private volatile boolean gossipStarted;
+
+    private Map<InetAddress, Map<String, String>> savedEndpoints;
+    private static final String DEFAULT_DC = "UNKNOWN_DC";
+    private static final String DEFAULT_RACK = "UNKNOWN_RACK";
+
+    private static final int DEFAULT_REFRESH_PERIOD_IN_SECONDS = 60;
+    
     public GossipingPropertyFileSnitch() throws ConfigurationException
     {
-        myDC = SnitchProperties.get("dc", null);
-        myRack = SnitchProperties.get("rack", null);
-        if (myDC == null || myRack == null)
-            throw new ConfigurationException("DC or rack not found in snitch properties, check your configuration in: " + SnitchProperties.RACKDC_PROPERTY_FILENAME);
+        this(DEFAULT_REFRESH_PERIOD_IN_SECONDS);
+    }
 
-        myDC = myDC.trim();
-        myRack = myRack.trim();
-        preferLocal = Boolean.parseBoolean(SnitchProperties.get("prefer_local", "false"));
+    public GossipingPropertyFileSnitch(int refreshPeriodInSeconds) throws ConfigurationException
+    {
+        snitchHelperReference = new AtomicReference<ReconnectableSnitchHelper>();
+
+        reloadConfiguration();
+
         try
         {
             psnitch = new PropertyFileSnitch();
-            logger.info("Loaded " + PropertyFileSnitch.SNITCH_PROPERTIES_FILENAME + " for compatibility");
+            logger.info("Loaded {} for compatibility", PropertyFileSnitch.SNITCH_PROPERTIES_FILENAME);
         }
         catch (ConfigurationException e)
         {
-            logger.info("Unable to load " + PropertyFileSnitch.SNITCH_PROPERTIES_FILENAME + "; compatibility mode disabled");
+            logger.info("Unable to load {}; compatibility mode disabled", PropertyFileSnitch.SNITCH_PROPERTIES_FILENAME);
+        }
+
+        try
+        {
+            FBUtilities.resourceToFile(SnitchProperties.RACKDC_PROPERTY_FILENAME);
+            Runnable runnable = new WrappedRunnable()
+            {
+                protected void runMayThrow() throws ConfigurationException
+                {
+                    reloadConfiguration();
+                }
+            };
+            ResourceWatcher.watch(SnitchProperties.RACKDC_PROPERTY_FILENAME, runnable, refreshPeriodInSeconds * 1000);
+        }
+        catch (ConfigurationException ex)
+        {
+            logger.error("{} found, but does not look like a plain file. Will not watch it for changes", SnitchProperties.RACKDC_PROPERTY_FILENAME);
         }
     }
 
@@ -125,8 +152,55 @@
     public void gossiperStarting()
     {
         super.gossiperStarting();
+
         Gossiper.instance.addLocalApplicationState(ApplicationState.INTERNAL_IP,
-                                                   StorageService.instance.valueFactory.internalIP(FBUtilities.getLocalAddress().getHostAddress()));
-        Gossiper.instance.register(new ReconnectableSnitchHelper(this, myDC, preferLocal));
+                StorageService.instance.valueFactory.internalIP(FBUtilities.getLocalAddress().getHostAddress()));
+
+        reloadGossiperState();
+
+        gossipStarted = true;
+    }
+    
+    private void reloadConfiguration() throws ConfigurationException
+    {
+        final SnitchProperties properties = new SnitchProperties();
+
+        String newDc = properties.get("dc", null);
+        String newRack = properties.get("rack", null);
+        if (newDc == null || newRack == null)
+            throw new ConfigurationException("DC or rack not found in snitch properties, check your configuration in: " + SnitchProperties.RACKDC_PROPERTY_FILENAME);
+
+        newDc = newDc.trim();
+        newRack = newRack.trim();
+        final boolean newPreferLocal = Boolean.parseBoolean(properties.get("prefer_local", "false"));
+
+        if (!newDc.equals(myDC) || !newRack.equals(myRack) || (preferLocal != newPreferLocal))
+        {
+            myDC = newDc;
+            myRack = newRack;
+            preferLocal = newPreferLocal;
+
+            reloadGossiperState();
+
+            if (StorageService.instance != null)
+                StorageService.instance.getTokenMetadata().invalidateCachedRings();
+
+            if (gossipStarted)
+                StorageService.instance.gossipSnitchInfo();
+        }
+    }
+
+    private void reloadGossiperState()
+    {
+        if (Gossiper.instance != null)
+        {
+            ReconnectableSnitchHelper pendingHelper = new ReconnectableSnitchHelper(this, myDC, preferLocal);
+            Gossiper.instance.register(pendingHelper);
+            
+            pendingHelper = snitchHelperReference.getAndSet(pendingHelper);
+            if (pendingHelper != null)
+                Gossiper.instance.unregister(pendingHelper);
+        }
+        // else this will eventually rerun at gossiperStarting()
     }
 }
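
A hedged sketch of what a reload now reads (values come from cassandra-rackdc.properties; the local variable names are only for illustration): SnitchProperties is per-instance rather than a static singleton, and a ResourceWatcher re-runs reloadConfiguration() every refresh period, pushing changed dc/rack/prefer_local values into gossip and invalidating cached rings only when something actually differs.

    SnitchProperties properties = new SnitchProperties();   // re-reads cassandra-rackdc.properties
    String dc = properties.get("dc", null);
    String rack = properties.get("rack", null);
    boolean preferLocal = Boolean.parseBoolean(properties.get("prefer_local", "false"));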
diff --git a/src/java/org/apache/cassandra/locator/PropertyFileSnitch.java b/src/java/org/apache/cassandra/locator/PropertyFileSnitch.java
index 9138bc2..d235369 100644
--- a/src/java/org/apache/cassandra/locator/PropertyFileSnitch.java
+++ b/src/java/org/apache/cassandra/locator/PropertyFileSnitch.java
@@ -73,7 +73,7 @@
         }
         catch (ConfigurationException ex)
         {
-            logger.debug(SNITCH_PROPERTIES_FILENAME + " found, but does not look like a plain file. Will not watch it for changes");
+            logger.error("{} found, but does not look like a plain file. Will not watch it for changes", SNITCH_PROPERTIES_FILENAME);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/locator/SimpleSeedProvider.java b/src/java/org/apache/cassandra/locator/SimpleSeedProvider.java
index eda9fff..6f36cd0 100644
--- a/src/java/org/apache/cassandra/locator/SimpleSeedProvider.java
+++ b/src/java/org/apache/cassandra/locator/SimpleSeedProvider.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.locator;
 
-import java.io.IOException;
 import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URL;
@@ -30,7 +29,6 @@
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.SeedProviderDef;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.yaml.snakeyaml.Loader;
@@ -41,34 +39,19 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(SimpleSeedProvider.class);
 
-    List<InetAddress> seeds;
-    public SimpleSeedProvider(Map<String, String> args) {
+    public SimpleSeedProvider(Map<String, String> args) {}
+
+    public List<InetAddress> getSeeds()
+    {
+        Config conf;
         try
         {
-            seeds = loadSeeds();
+            conf = DatabaseDescriptor.loadConfig();
         }
         catch (Exception e)
         {
             throw new AssertionError(e);
         }
-    }
-
-    public List<InetAddress> getSeeds()
-    {
-        try
-        {
-            seeds = loadSeeds();
-        }
-        catch (Exception e)
-        {
-            logger.warn("Could not refresh seeds from configuration file: {}", e);
-        }
-        return Collections.unmodifiableList(seeds);
-    }
-
-    private List<InetAddress> loadSeeds() throws IOException, ConfigurationException
-    {
-        Config conf = DatabaseDescriptor.loadConfig();
         String[] hosts = conf.seed_provider.parameters.get("seeds").split(",", -1);
         List<InetAddress> seeds = new ArrayList<InetAddress>(hosts.length);
         for (String host : hosts)
@@ -80,9 +63,9 @@
             catch (UnknownHostException ex)
             {
                 // not fatal... DD will bark if there end up being zero seeds.
-                logger.warn("Seed provider couldn't lookup host " + host);
+                logger.warn("Seed provider couldn't lookup host {}", host);
             }
         }
-        return seeds;
+        return Collections.unmodifiableList(seeds);
     }
 }
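
A minimal sketch of the behavioural change: seeds are re-parsed from cassandra.yaml on every call, so editing the seed_provider parameters no longer needs a restart to be picked up here. Constructing the provider directly, as below, is only for illustration; Cassandra normally instantiates it from the yaml.

    SimpleSeedProvider provider = new SimpleSeedProvider(Collections.<String, String>emptyMap());
    List<InetAddress> seeds = provider.getSeeds();   // unmodifiable, re-read via DatabaseDescriptor.loadConfig()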
diff --git a/src/java/org/apache/cassandra/locator/SnitchProperties.java b/src/java/org/apache/cassandra/locator/SnitchProperties.java
index cb0946a..be89fcf 100644
--- a/src/java/org/apache/cassandra/locator/SnitchProperties.java
+++ b/src/java/org/apache/cassandra/locator/SnitchProperties.java
@@ -29,9 +29,10 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(SnitchProperties.class);
     public static final String RACKDC_PROPERTY_FILENAME = "cassandra-rackdc.properties";
-    private static Properties properties = new Properties();
 
-    static
+    private Properties properties;
+
+    public SnitchProperties()
     {
         properties = new Properties();
         InputStream stream = null;
@@ -59,9 +60,9 @@
     }
 
     /**
-     * Get a snitch property value or return null if not defined.
+     * Get a snitch property value or return defaultValue if not defined.
      */
-    public static String get(String propertyName, String defaultValue)
+    public String get(String propertyName, String defaultValue)
     {
         return properties.getProperty(propertyName, defaultValue);
     }
diff --git a/src/java/org/apache/cassandra/locator/TokenMetadata.java b/src/java/org/apache/cassandra/locator/TokenMetadata.java
index a673c94..f848e3b 100644
--- a/src/java/org/apache/cassandra/locator/TokenMetadata.java
+++ b/src/java/org/apache/cassandra/locator/TokenMetadata.java
@@ -195,7 +195,7 @@
                     if (!endpoint.equals(prev))
                     {
                         if (prev != null)
-                            logger.warn("Token " + token + " changing ownership from " + prev + " to " + endpoint);
+                            logger.warn("Token {} changing ownership from {} to {}", token, prev, endpoint);
                         shouldSortTokens = true;
                     }
                 }
diff --git a/src/java/org/apache/cassandra/locator/YamlFileNetworkTopologySnitch.java b/src/java/org/apache/cassandra/locator/YamlFileNetworkTopologySnitch.java
index 3237979..93e76f0 100644
--- a/src/java/org/apache/cassandra/locator/YamlFileNetworkTopologySnitch.java
+++ b/src/java/org/apache/cassandra/locator/YamlFileNetworkTopologySnitch.java
@@ -102,6 +102,7 @@
     YamlFileNetworkTopologySnitch(final String topologyConfigFilename)
             throws ConfigurationException
     {
+        logger.warn("YamlFileNetworkTopologySnitch is deprecated; switch to GossipingPropertyFileSnitch instead");
         this.topologyConfigFilename = topologyConfigFilename;
         loadTopologyConfiguration();
 
diff --git a/src/java/org/apache/cassandra/metrics/CQLMetrics.java b/src/java/org/apache/cassandra/metrics/CQLMetrics.java
new file mode 100644
index 0000000..a7076dd
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/CQLMetrics.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import org.apache.cassandra.cql3.QueryProcessor;
+import com.yammer.metrics.Metrics;
+import com.yammer.metrics.core.Counter;
+import com.yammer.metrics.core.Gauge;
+import com.yammer.metrics.util.RatioGauge;
+
+public class CQLMetrics
+{
+    private static final MetricNameFactory factory = new DefaultNameFactory("CQL");
+
+    public final Counter regularStatementsExecuted;
+    public final Counter preparedStatementsExecuted;
+    public final Counter preparedStatementsEvicted;
+
+    public final Gauge<Integer> preparedStatementsCount;
+    public final Gauge<Double> preparedStatementsRatio;
+
+    public CQLMetrics()
+    {
+        regularStatementsExecuted = Metrics.newCounter(factory.createMetricName("RegularStatementsExecuted"));
+        preparedStatementsExecuted = Metrics.newCounter(factory.createMetricName("PreparedStatementsExecuted"));
+        preparedStatementsEvicted = Metrics.newCounter(factory.createMetricName("PreparedStatementsEvicted"));
+
+        preparedStatementsCount = Metrics.newGauge(factory.createMetricName("PreparedStatementsCount"), new Gauge<Integer>()
+        {
+            public Integer value()
+            {
+                return QueryProcessor.preparedStatementsCount();
+            }
+        });
+        preparedStatementsRatio = Metrics.newGauge(factory.createMetricName("PreparedStatementsRatio"), new RatioGauge()
+        {
+            public double getNumerator()
+            {
+                return preparedStatementsExecuted.count();
+            }
+
+            public double getDenominator()
+            {
+                return regularStatementsExecuted.count() + preparedStatementsExecuted.count();
+            }
+        });
+    }
+}
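
A minimal usage sketch (the wiring into QueryProcessor is assumed): the execution paths bump the counters, while the gauges compute prepared-statement cache size and usage ratio lazily for JMX consumers under the "CQL" metric group.

    CQLMetrics metrics = new CQLMetrics();
    metrics.regularStatementsExecuted.inc();
    metrics.preparedStatementsExecuted.inc();
    metrics.preparedStatementsEvicted.inc();
    double ratio = metrics.preparedStatementsRatio.value();   // prepared / (prepared + regular)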
diff --git a/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java b/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java
index a3838a0..8ab432e 100644
--- a/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java
@@ -24,8 +24,8 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.utils.EstimatedHistogram;
 
 import com.google.common.collect.Maps;
@@ -39,10 +39,18 @@
  */
 public class ColumnFamilyMetrics
 {
-    /** Total amount of data stored in the memtable, including column related overhead. */
-    public final Gauge<Long> memtableDataSize;
-    /** Total amount of data stored in the memtables (2i and pending flush memtables included). */
-    public final Gauge<Long> allMemtablesDataSize;
+    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and overwritten rows. */
+    public final Gauge<Long> memtableOnHeapSize;
+    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and overwritten rows. */
+    public final Gauge<Long> memtableOffHeapSize;
+    /** Total amount of live data stored in the memtable, excluding any data structure overhead */
+    public final Gauge<Long> memtableLiveDataSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides on-heap. */
+    public final Gauge<Long> allMemtablesOnHeapSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides off-heap. */
+    public final Gauge<Long> allMemtablesOffHeapSize;
+    /** Total amount of live data stored in the memtables (2i and pending flush memtables included), excluding any data structure overhead */
+    public final Gauge<Long> allMemtablesLiveDataSize;
     /** Total number of columns present in the memtable. */
     public final Gauge<Long> memtableColumnsCount;
     /** Number of times flush has resulted in the memtable being switched out. */
@@ -62,7 +70,7 @@
     /** (Local) write metrics */
     public final LatencyMetrics writeLatency;
     /** Estimated number of tasks pending for this column family */
-    public final Gauge<Integer> pendingTasks;
+    public final Counter pendingFlushes;
     /** Estimate of number of pending compactios for this CF */
     public final Gauge<Integer> pendingCompactions;
     /** Number of SSTables on disk for this CF */
@@ -93,6 +101,14 @@
     public final ColumnFamilyHistogram tombstoneScannedHistogram;
     /** Live cells scanned in queries on this CF */
     public final ColumnFamilyHistogram liveScannedHistogram;
+    /** Disk space used by snapshot files */
+    public final Gauge<Long> trueSnapshotsSize;
+    /** Row cache hits, but result out of range */
+    public final Counter rowCacheHitOutOfRange;
+    /** Number of row cache hits */
+    public final Counter rowCacheHit;
+    /** Number of row cache misses */
+    public final Counter rowCacheMiss;
     /** CAS Prepare metrics */
     public final LatencyMetrics casPrepare;
     /** CAS Propose metrics */
@@ -103,6 +119,9 @@
     public final Timer coordinatorReadLatency;
     public final Timer coordinatorScanLatency;
 
+    /** Time spent waiting for free memtable space, either on- or off-heap */
+    public final Timer waitingOnFreeMemtableSpace;
+
     private final MetricNameFactory factory;
     private static final MetricNameFactory globalNameFactory = new AllColumnFamilyMetricNameFactory();;
 
@@ -139,21 +158,58 @@
         {
             public Long value()
             {
-                return cfs.getDataTracker().getMemtable().getOperations();
+                return cfs.getDataTracker().getView().getCurrentMemtable().getOperations();
             }
         });
-        memtableDataSize = createColumnFamilyGauge("MemtableDataSize", new Gauge<Long>()
+        memtableOnHeapSize = createColumnFamilyGauge("MemtableOnHeapSize", new Gauge<Long>()
         {
             public Long value()
             {
-                return cfs.getDataTracker().getMemtable().getLiveSize();
+                return cfs.getDataTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
             }
         });
-        allMemtablesDataSize = createColumnFamilyGauge("AllMemtablesDataSize", new Gauge<Long>()
+        memtableOffHeapSize = createColumnFamilyGauge("MemtableOffHeapSize", new Gauge<Long>()
         {
             public Long value()
             {
-                return cfs.getTotalAllMemtablesLiveSize();
+                return cfs.getDataTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
+            }
+        });
+        memtableLiveDataSize = createColumnFamilyGauge("MemtableLiveDataSize", new Gauge<Long>()
+        {
+            public Long value()
+            {
+                return cfs.getDataTracker().getView().getCurrentMemtable().getLiveDataSize();
+            }
+        });
+        allMemtablesOnHeapSize = createColumnFamilyGauge("AllMemtablesHeapSize", new Gauge<Long>()
+        {
+            public Long value()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getDataTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
+                return size;
+            }
+        });
+        allMemtablesOffHeapSize = createColumnFamilyGauge("AllMemtablesOffHeapSize", new Gauge<Long>()
+        {
+            public Long value()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getDataTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
+                return size;
+            }
+        });
+        allMemtablesLiveDataSize = createColumnFamilyGauge("AllMemtablesLiveDataSize", new Gauge<Long>()
+        {
+            public Long value()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getDataTracker().getView().getCurrentMemtable().getLiveDataSize();
+                return size;
             }
         });
         memtableSwitchCount = createColumnFamilyCounter("MemtableSwitchCount");
@@ -194,7 +250,7 @@
                 int total = 0;
                 for (SSTableReader sstable : cfs.getSSTables())
                 {
-                    if (sstable.getCompressionRatio() != SSTableMetadata.NO_COMPRESSION_RATIO)
+                    if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
                     {
                         sum += sstable.getCompressionRatio();
                         total++;
@@ -212,7 +268,7 @@
                 {
                     for (SSTableReader sstable : keyspace.getAllSSTables())
                     {
-                        if (sstable.getCompressionRatio() != SSTableMetadata.NO_COMPRESSION_RATIO)
+                        if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
                         {
                             sum += sstable.getCompressionRatio();
                             total++;
@@ -225,6 +281,7 @@
         readLatency = new LatencyMetrics(factory, "Read", cfs.keyspace.metric.readLatency, globalReadLatency);
         writeLatency = new LatencyMetrics(factory, "Write", cfs.keyspace.metric.writeLatency, globalWriteLatency);
         rangeLatency = new LatencyMetrics(factory, "Range", cfs.keyspace.metric.rangeLatency, globalRangeLatency);
+        pendingFlushes = createColumnFamilyCounter("PendingFlushes");
         pendingCompactions = createColumnFamilyGauge("PendingCompactions", new Gauge<Integer>()
         {
             public Integer value()
@@ -232,14 +289,6 @@
                 return cfs.getCompactionStrategy().getEstimatedRemainingTasks();
             }
         });
-        pendingTasks = Metrics.newGauge(factory.createMetricName("PendingTasks"), new Gauge<Integer>()
-        {
-            public Integer value()
-            {
-                // TODO this actually isn't a good measure of pending tasks
-                return Keyspace.switchLock.getQueueLength();
-            }
-        });
         liveSSTableCount = createColumnFamilyGauge("LiveSSTableCount", new Gauge<Integer>()
         {
             public Integer value()
@@ -450,6 +499,19 @@
         liveScannedHistogram = createColumnFamilyHistogram("LiveScannedHistogram", cfs.keyspace.metric.liveScannedHistogram);
         coordinatorReadLatency = Metrics.newTimer(factory.createMetricName("CoordinatorReadLatency"), TimeUnit.MICROSECONDS, TimeUnit.SECONDS);
         coordinatorScanLatency = Metrics.newTimer(factory.createMetricName("CoordinatorScanLatency"), TimeUnit.MICROSECONDS, TimeUnit.SECONDS);
+        waitingOnFreeMemtableSpace = Metrics.newTimer(factory.createMetricName("WaitingOnFreeMemtableSpace"), TimeUnit.MICROSECONDS, TimeUnit.SECONDS);
+
+        trueSnapshotsSize = createColumnFamilyGauge("SnapshotsSize", new Gauge<Long>()
+        {
+            public Long value()
+            {
+                return cfs.trueSnapshotsSize();
+            }
+        });
+        rowCacheHitOutOfRange = createColumnFamilyCounter("RowCacheHitOutOfRange");
+        rowCacheHit = createColumnFamilyCounter("RowCacheHit");
+        rowCacheMiss = createColumnFamilyCounter("RowCacheMiss");
+
         casPrepare = new LatencyMetrics(factory, "CasPrepare", cfs.keyspace.metric.casPrepare);
         casPropose = new LatencyMetrics(factory, "CasPropose", cfs.keyspace.metric.casPropose);
         casCommit = new LatencyMetrics(factory, "CasCommit", cfs.keyspace.metric.casCommit);
@@ -480,9 +542,10 @@
         Metrics.defaultRegistry().removeMetric(factory.createMetricName("KeyCacheHitRate"));
         Metrics.defaultRegistry().removeMetric(factory.createMetricName("CoordinatorReadLatency"));
         Metrics.defaultRegistry().removeMetric(factory.createMetricName("CoordinatorScanLatency"));
-        Metrics.defaultRegistry().removeMetric(factory.createMetricName("PendingTasks"));
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("WaitingOnFreeMemtableSpace"));
     }
 
+
     /**
      * Create a gauge that will be part of a merged version of all column families.  The global gauge
      * will merge each CF gauge by adding their values 
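
Earlier in this file the per-table PendingTasks gauge, which polled Keyspace.switchLock's queue length, is replaced by a PendingFlushes counter. A small sketch of that counter pattern follows, assuming the counter is incremented when a flush is scheduled and decremented when it completes (the call sites are not shown in this diff); AtomicLong stands in for the yammer Counter.

import java.util.concurrent.atomic.AtomicLong;

// Sketch of counter-based pending-flush tracking; the increment/decrement placement is assumed.
class PendingFlushesSketch
{
    final AtomicLong pendingFlushes = new AtomicLong();

    void onFlushScheduled() { pendingFlushes.incrementAndGet(); }
    void onFlushCompleted() { pendingFlushes.decrementAndGet(); }

    public static void main(String[] args)
    {
        PendingFlushesSketch metrics = new PendingFlushesSketch();
        metrics.onFlushScheduled();
        metrics.onFlushScheduled();
        metrics.onFlushCompleted();
        System.out.println(metrics.pendingFlushes.get()); // 1 flush still pending
    }
}
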
diff --git a/src/java/org/apache/cassandra/metrics/CommitLogMetrics.java b/src/java/org/apache/cassandra/metrics/CommitLogMetrics.java
index c18b3a2..e9c0719 100644
--- a/src/java/org/apache/cassandra/metrics/CommitLogMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CommitLogMetrics.java
@@ -20,8 +20,11 @@
 import com.yammer.metrics.Metrics;
 import com.yammer.metrics.core.Gauge;
 
-import org.apache.cassandra.db.commitlog.CommitLogAllocator;
-import org.apache.cassandra.db.commitlog.ICommitLogExecutorService;
+import com.yammer.metrics.core.Timer;
+import org.apache.cassandra.db.commitlog.AbstractCommitLogService;
+import org.apache.cassandra.db.commitlog.CommitLogSegmentManager;
+
+import java.util.concurrent.TimeUnit;
 
 /**
  * Metrics for commit log
@@ -36,21 +39,25 @@
     public final Gauge<Long> pendingTasks;
     /** Current size used by all the commit log segments */
     public final Gauge<Long> totalCommitLogSize;
+    /** Time spent waiting for a commit log segment to be allocated - under normal conditions this should be zero */
+    public final Timer waitingOnSegmentAllocation;
+    /** Time spent waiting on commit log sync; for the periodic sync mode this only occurs when the sync is lagging behind its sync interval */
+    public final Timer waitingOnCommit;
 
-    public CommitLogMetrics(final ICommitLogExecutorService executor, final CommitLogAllocator allocator)
+    public CommitLogMetrics(final AbstractCommitLogService service, final CommitLogSegmentManager allocator)
     {
         completedTasks = Metrics.newGauge(factory.createMetricName("CompletedTasks"), new Gauge<Long>()
         {
             public Long value()
             {
-                return executor.getCompletedTasks();
+                return service.getCompletedTasks();
             }
         });
         pendingTasks = Metrics.newGauge(factory.createMetricName("PendingTasks"), new Gauge<Long>()
         {
             public Long value()
             {
-                return executor.getPendingTasks();
+                return service.getPendingTasks();
             }
         });
         totalCommitLogSize = Metrics.newGauge(factory.createMetricName("TotalCommitLogSize"), new Gauge<Long>()
@@ -60,5 +67,7 @@
                 return allocator.bytesUsed();
             }
         });
+        waitingOnSegmentAllocation = Metrics.newTimer(factory.createMetricName("WaitingOnSegmentAllocation"), TimeUnit.MICROSECONDS, TimeUnit.SECONDS);
+        waitingOnCommit = Metrics.newTimer(factory.createMetricName("WaitingOnCommit"), TimeUnit.MICROSECONDS, TimeUnit.SECONDS);
     }
 }
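
The two new timers record how long writers wait on commit log segment allocation and on sync. A sketch of how such a wait can be measured and fed to a microsecond-scaled timer; the AtomicLong here is only a stand-in for Timer.update, and the 5 ms park simulates waiting on a segment.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.LockSupport;

// Measure a wait with nanoTime and record it in the timer's unit (microseconds here).
class WaitTimingSketch
{
    static final AtomicLong recordedMicros = new AtomicLong(); // stand-in for Timer.update

    static void recordWait(long startNanos)
    {
        long waitedNanos = System.nanoTime() - startNanos;
        recordedMicros.addAndGet(TimeUnit.NANOSECONDS.toMicros(waitedNanos));
    }

    public static void main(String[] args)
    {
        long start = System.nanoTime();
        LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(5)); // simulate waiting for a segment
        recordWait(start);
        System.out.println("waited ~" + recordedMicros.get() + " us");
    }
}
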
diff --git a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java
index 7a768b8..6fa64e9 100644
--- a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java
@@ -32,16 +32,24 @@
  */
 public class KeyspaceMetrics
 {
-    /** Total amount of data stored in the memtable, including column related overhead. */
-    public final Gauge<Long> memtableDataSize;
-    /** Total amount of data stored in the memtables (2i and pending flush memtables included). */
-    public final Gauge<Long> allMemtablesDataSize;
+    /** Total amount of live data stored in the memtable, excluding any data structure overhead */
+    public final Gauge<Long> memtableLiveDataSize;
+    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and overwritten rows. */
+    public final Gauge<Long> memtableOnHeapDataSize;
+    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and overwritten rows. */
+    public final Gauge<Long> memtableOffHeapDataSize;
+    /** Total amount of live data stored in the memtables (2i and pending flush memtables included), excluding any data structure overhead */
+    public final Gauge<Long> allMemtablesLiveDataSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides on-heap. */
+    public final Gauge<Long> allMemtablesOnHeapDataSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides off-heap. */
+    public final Gauge<Long> allMemtablesOffHeapDataSize;
     /** Total number of columns present in the memtable. */
     public final Gauge<Long> memtableColumnsCount;
     /** Number of times flush has resulted in the memtable being switched out. */
     public final Gauge<Long> memtableSwitchCount;
     /** Estimated number of tasks pending for this column family */
-    public final Gauge<Integer> pendingTasks;
+    public final Gauge<Long> pendingFlushes;
     /** Estimate of number of pending compactios for this CF */
     public final Gauge<Long> pendingCompactions;
     /** Disk space used by SSTables belonging to this CF */
@@ -68,8 +76,8 @@
     public final LatencyMetrics casPropose;
     /** CAS Commit metrics */
     public final LatencyMetrics casCommit;
-
-    private final MetricNameFactory factory;
+    
+    public final MetricNameFactory factory;
     private Keyspace keyspace;
     
     /** set containing names of all the metrics stored here, for releasing later */
@@ -91,18 +99,46 @@
                 return metric.memtableColumnsCount.value();
             }
         });
-        memtableDataSize = createKeyspaceGauge("MemtableDataSize", new MetricValue()
+        memtableLiveDataSize = createKeyspaceGauge("MemtableLiveDataSize", new MetricValue()
         {
             public Long getValue(ColumnFamilyMetrics metric)
             {
-                return metric.memtableDataSize.value();
+                return metric.memtableLiveDataSize.value();
             }
         }); 
-        allMemtablesDataSize = createKeyspaceGauge("AllMemtablesDataSize", new MetricValue()
+        memtableOnHeapDataSize = createKeyspaceGauge("MemtableOnHeapDataSize", new MetricValue()
         {
             public Long getValue(ColumnFamilyMetrics metric)
             {
-                return metric.allMemtablesDataSize.value();
+                return metric.memtableOnHeapSize.value();
+            }
+        });
+        memtableOffHeapDataSize = createKeyspaceGauge("MemtableOffHeapDataSize", new MetricValue()
+        {
+            public Long getValue(ColumnFamilyMetrics metric)
+            {
+                return metric.memtableOffHeapSize.value();
+            }
+        });
+        allMemtablesLiveDataSize = createKeyspaceGauge("AllMemtablesLiveDataSize", new MetricValue()
+        {
+            public Long getValue(ColumnFamilyMetrics metric)
+            {
+                return metric.allMemtablesLiveDataSize.value();
+            }
+        });
+        allMemtablesOnHeapDataSize = createKeyspaceGauge("AllMemtablesOnHeapDataSize", new MetricValue()
+        {
+            public Long getValue(ColumnFamilyMetrics metric)
+            {
+                return metric.allMemtablesOnHeapSize.value();
+            }
+        });
+        allMemtablesOffHeapDataSize = createKeyspaceGauge("AllMemtablesOffHeapDataSize", new MetricValue()
+        {
+            public Long getValue(ColumnFamilyMetrics metric)
+            {
+                return metric.allMemtablesOffHeapSize.value();
             }
         });
         memtableSwitchCount = createKeyspaceGauge("MemtableSwitchCount", new MetricValue()
@@ -119,11 +155,11 @@
                 return (long) metric.pendingCompactions.value();
             }
         });
-        pendingTasks = Metrics.newGauge(factory.createMetricName("PendingTasks"), new Gauge<Integer>()
+        pendingFlushes = createKeyspaceGauge("PendingFlushes", new MetricValue()
         {
-            public Integer value()
+            public Long getValue(ColumnFamilyMetrics metric)
             {
-                return Keyspace.switchLock.getQueueLength();
+                return (long) metric.pendingFlushes.count();
             }
         });
         liveDiskSpaceUsed = createKeyspaceGauge("LiveDiskSpaceUsed", new MetricValue()
@@ -176,7 +212,6 @@
         readLatency.release();
         writeLatency.release();
         rangeLatency.release();
-        Metrics.defaultRegistry().removeMetric(factory.createMetricName("PendingTasks"));
     }
     
     /**
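
Each keyspace-level gauge above passes a MetricValue callback that extracts one value from a table's ColumnFamilyMetrics; createKeyspaceGauge (not shown in this diff) is assumed to sum that value over every table in the keyspace. A sketch of that shape, with hypothetical stand-in types:

import java.util.Arrays;
import java.util.List;

// Hypothetical stand-ins for ColumnFamilyMetrics and the MetricValue callback above.
class KeyspaceGaugeSketch
{
    static class TableMetrics
    {
        final long pendingFlushes;
        TableMetrics(long pendingFlushes) { this.pendingFlushes = pendingFlushes; }
    }

    interface MetricValue
    {
        long getValue(TableMetrics metric);
    }

    // assumed shape of createKeyspaceGauge: sum one per-table value over the keyspace
    static long sumOverTables(List<TableMetrics> tables, MetricValue value)
    {
        long sum = 0;
        for (TableMetrics t : tables)
            sum += value.getValue(t);
        return sum;
    }

    public static void main(String[] args)
    {
        List<TableMetrics> keyspace = Arrays.asList(new TableMetrics(2), new TableMetrics(1));
        long pendingFlushes = sumOverTables(keyspace, new MetricValue()
        {
            public long getValue(TableMetrics metric) { return metric.pendingFlushes; }
        });
        System.out.println(pendingFlushes); // 3
    }
}
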
diff --git a/src/java/org/apache/cassandra/metrics/MetricNameFactory.java b/src/java/org/apache/cassandra/metrics/MetricNameFactory.java
index 5c1a5c2..78f8b14 100644
--- a/src/java/org/apache/cassandra/metrics/MetricNameFactory.java
+++ b/src/java/org/apache/cassandra/metrics/MetricNameFactory.java
@@ -22,10 +22,10 @@
 public interface MetricNameFactory
 {
     /**
-     * Create {@link MetricName} from given metric name.
+     * Create a qualified name from given metric name.
      *
-     * @param metricName Name part of {@link MetricName}.
-     * @return new MetricName with given metric name.
+     * @param metricName part of the qualified metric name.
+     * @return a new MetricName built from the given metric name.
      */
     MetricName createMetricName(String metricName);
 }
diff --git a/src/java/org/apache/cassandra/metrics/SEPMetrics.java b/src/java/org/apache/cassandra/metrics/SEPMetrics.java
new file mode 100644
index 0000000..fbccc3b
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/SEPMetrics.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import com.yammer.metrics.Metrics;
+import com.yammer.metrics.core.Gauge;
+import org.apache.cassandra.concurrent.SEPExecutor;
+
+public class SEPMetrics
+{
+    /** Number of active tasks. */
+    public final Gauge<Integer> activeTasks;
+    /** Number of tasks that had blocked before being accepted (or rejected). */
+    public final Gauge<Integer> totalBlocked;
+    /**
+     * Number of tasks currently blocked, waiting to be accepted by
+     * the executor (because all threads are busy and the backing queue is full).
+     */
+    public final Gauge<Long> currentBlocked;
+    /** Number of completed tasks. */
+    public final Gauge<Long> completedTasks;
+
+    /** Number of tasks waiting to be executed. */
+    public final Gauge<Long> pendingTasks;
+
+    private MetricNameFactory factory;
+
+    /**
+     * Create metrics for the given SEPExecutor.
+     *
+     * @param executor Thread pool
+     * @param path Type of thread pool
+     * @param poolName Name of thread pool to identify metrics
+     */
+    public SEPMetrics(final SEPExecutor executor, String path, String poolName)
+    {
+        this.factory = new ThreadPoolMetricNameFactory("ThreadPools", path, poolName);
+        activeTasks = Metrics.newGauge(factory.createMetricName("ActiveTasks"), new Gauge<Integer>()
+        {
+            public Integer value()
+            {
+                return executor.getActiveCount();
+            }
+        });
+        pendingTasks = Metrics.newGauge(factory.createMetricName("PendingTasks"), new Gauge<Long>()
+        {
+            public Long value()
+            {
+                return executor.getPendingTasks();
+            }
+        });
+        totalBlocked = Metrics.newGauge(factory.createMetricName("TotalBlockedTasks"), new Gauge<Integer>()
+        {
+            public Integer value()
+            {
+                return executor.getTotalBlockedTasks();
+            }
+        });
+        currentBlocked = Metrics.newGauge(factory.createMetricName("CurrentlyBlockedTasks"), new Gauge<Long>()
+        {
+            public Long value()
+            {
+                return (long) executor.getCurrentlyBlockedTasks();
+            }
+        });
+        completedTasks = Metrics.newGauge(factory.createMetricName("CompletedTasks"), new Gauge<Long>()
+        {
+            public Long value()
+            {
+                return executor.getCompletedTasks();
+            }
+        });
+    }
+
+    public void release()
+    {
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("ActiveTasks"));
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("PendingTasks"));
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("CompletedTasks"));
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("TotalBlockedTasks"));
+        Metrics.defaultRegistry().removeMetric(factory.createMetricName("CurrentlyBlockedTasks"));
+    }
+}
diff --git a/src/java/org/apache/cassandra/metrics/ThreadPoolMetricNameFactory.java b/src/java/org/apache/cassandra/metrics/ThreadPoolMetricNameFactory.java
new file mode 100644
index 0000000..4afc4d3
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/ThreadPoolMetricNameFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import com.yammer.metrics.core.MetricName;
+
+class ThreadPoolMetricNameFactory implements MetricNameFactory
+{
+    private final String type;
+    private final String path;
+    private final String poolName;
+
+    ThreadPoolMetricNameFactory(String type, String path, String poolName)
+    {
+        this.type = type;
+        this.path = path;
+        this.poolName = poolName;
+    }
+
+    public MetricName createMetricName(String metricName)
+    {
+        String groupName = ThreadPoolMetrics.class.getPackage().getName();
+        StringBuilder mbeanName = new StringBuilder();
+        mbeanName.append(groupName).append(":");
+        mbeanName.append("type=").append(type);
+        mbeanName.append(",path=").append(path);
+        mbeanName.append(",scope=").append(poolName);
+        mbeanName.append(",name=").append(metricName);
+
+        return new MetricName(groupName, type, metricName, path + "." + poolName, mbeanName.toString());
+    }
+}
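
createMetricName above builds JMX object names of the form org.apache.cassandra.metrics:type=ThreadPools,path=<path>,scope=<pool>,name=<metric>. A sketch of querying one of them from a JMX client; the pool chosen ("request"/"MutationStage") and the "Value" attribute exposed by the metrics JMX reporter are assumptions, not something this diff defines.

import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;

// Build the same mbean name layout as ThreadPoolMetricNameFactory and read it, if registered.
public class ThreadPoolMetricNameSketch
{
    public static void main(String[] args) throws Exception
    {
        String group = "org.apache.cassandra.metrics";
        // mirrors createMetricName("PendingTasks") for a hypothetical request/MutationStage pool
        ObjectName name = new ObjectName(group + ":type=ThreadPools,path=request,scope=MutationStage,name=PendingTasks");
        System.out.println(name);

        MBeanServer server = ManagementFactory.getPlatformMBeanServer();
        if (server.isRegistered(name))                      // only present inside a running node
            System.out.println(server.getAttribute(name, "Value")); // attribute name is assumed
    }
}
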
diff --git a/src/java/org/apache/cassandra/metrics/ThreadPoolMetrics.java b/src/java/org/apache/cassandra/metrics/ThreadPoolMetrics.java
index af54cdb..3cebf07 100644
--- a/src/java/org/apache/cassandra/metrics/ThreadPoolMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/ThreadPoolMetrics.java
@@ -52,7 +52,7 @@
      */
     public ThreadPoolMetrics(final ThreadPoolExecutor executor, String path, String poolName)
     {
-        this.factory = new ThreadPoolMetricNameFactory(path, poolName);
+        this.factory = new ThreadPoolMetricNameFactory("ThreadPools", path, poolName);
 
         activeTasks = Metrics.newGauge(factory.createMetricName("ActiveTasks"), new Gauge<Integer>()
         {
@@ -87,30 +87,4 @@
         Metrics.defaultRegistry().removeMetric(factory.createMetricName("TotalBlockedTasks"));
         Metrics.defaultRegistry().removeMetric(factory.createMetricName("CurrentlyBlockedTasks"));
     }
-
-    class ThreadPoolMetricNameFactory implements MetricNameFactory
-    {
-        private final String path;
-        private final String poolName;
-
-        ThreadPoolMetricNameFactory(String path, String poolName)
-        {
-            this.path = path;
-            this.poolName = poolName;
-        }
-
-        public MetricName createMetricName(String metricName)
-        {
-            String groupName = ThreadPoolMetrics.class.getPackage().getName();
-            String type = "ThreadPools";
-            StringBuilder mbeanName = new StringBuilder();
-            mbeanName.append(groupName).append(":");
-            mbeanName.append("type=").append(type);
-            mbeanName.append(",path=").append(path);
-            mbeanName.append(",scope=").append(poolName);
-            mbeanName.append(",name=").append(metricName);
-
-            return new MetricName(groupName, type, metricName, path + "." + poolName, mbeanName.toString());
-        }
-    }
 }
diff --git a/src/java/org/apache/cassandra/net/IAsyncCallbackWithFailure.java b/src/java/org/apache/cassandra/net/IAsyncCallbackWithFailure.java
index 1f95579..744bb62 100644
--- a/src/java/org/apache/cassandra/net/IAsyncCallbackWithFailure.java
+++ b/src/java/org/apache/cassandra/net/IAsyncCallbackWithFailure.java
@@ -21,6 +21,7 @@
 
 public interface IAsyncCallbackWithFailure<T> extends IAsyncCallback<T>
 {
+
     /**
      * Called when there is an exception on the remote node or timeout happens
      */
diff --git a/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java b/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java
index 20392f2..003bbf9 100644
--- a/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java
+++ b/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java
@@ -62,7 +62,7 @@
             // The receiving side distinguish two connections by looking at StreamInitMessage#isForOutgoing.
             // Note: we cannot use the same socket for incoming and outgoing streams because we want to
             // parallelize said streams and the socket is blocking, so we might deadlock.
-            StreamResultFuture.initReceivingSide(init.planId, init.description, init.from, socket, init.isForOutgoing, version);
+            StreamResultFuture.initReceivingSide(init.sessionIndex, init.planId, init.description, init.from, socket, init.isForOutgoing, version);
         }
         catch (IOException e)
         {
diff --git a/src/java/org/apache/cassandra/net/IncomingTcpConnection.java b/src/java/org/apache/cassandra/net/IncomingTcpConnection.java
index 3296cfd..ee44493 100644
--- a/src/java/org/apache/cassandra/net/IncomingTcpConnection.java
+++ b/src/java/org/apache/cassandra/net/IncomingTcpConnection.java
@@ -21,9 +21,15 @@
 import java.net.InetAddress;
 import java.net.Socket;
 import java.net.SocketException;
+import java.util.zip.Checksum;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import net.jpountz.lz4.LZ4BlockInputStream;
+import net.jpountz.lz4.LZ4FastDecompressor;
+import net.jpountz.lz4.LZ4Factory;
+import net.jpountz.xxhash.XXHashFactory;
 import org.xerial.snappy.SnappyInputStream;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -68,8 +74,10 @@
     {
         try
         {
-            if (version < MessagingService.VERSION_12)
-                throw new UnsupportedOperationException("Unable to read obsolete message version " + version + "; the earliest version supported is 1.2.0");
+            if (version < MessagingService.VERSION_20)
+                throw new UnsupportedOperationException(String.format("Unable to read obsolete message version %s; "
+                                                                      + "The earliest version supported is 2.0.0",
+                                                                      version));
 
             receiveMessages();
         }
@@ -109,7 +117,18 @@
         if (compressed)
         {
             logger.debug("Upgrading incoming connection to be compressed");
-            in = new DataInputStream(new SnappyInputStream(socket.getInputStream()));
+            if (version < MessagingService.VERSION_21)
+            {
+                in = new DataInputStream(new SnappyInputStream(socket.getInputStream()));
+            }
+            else
+            {
+                LZ4FastDecompressor decompressor = LZ4Factory.fastestInstance().fastDecompressor();
+                Checksum checksum = XXHashFactory.fastestInstance().newStreamingHash32(OutboundTcpConnection.LZ4_HASH_SEED).asChecksum();
+                in = new DataInputStream(new LZ4BlockInputStream(socket.getInputStream(),
+                                                                 decompressor,
+                                                                 checksum));
+            }
         }
         else
         {
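
For VERSION_21 peers this hunk replaces Snappy with an LZ4 block stream checksummed with xxhash; the matching LZ4BlockOutputStream is set up in OutboundTcpConnection below using the same LZ4_HASH_SEED. A self-contained round trip with the same constructor arguments, with in-memory buffers standing in for the socket streams:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.Checksum;

import net.jpountz.lz4.LZ4BlockInputStream;
import net.jpountz.lz4.LZ4BlockOutputStream;
import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Factory;
import net.jpountz.lz4.LZ4FastDecompressor;
import net.jpountz.xxhash.XXHashFactory;

// Writer and reader must agree on the xxhash seed, just like the two connection classes do.
public class Lz4StreamSketch
{
    private static final int LZ4_HASH_SEED = 0x9747b28c;

    public static void main(String[] args) throws Exception
    {
        byte[] payload = "internode message bytes".getBytes(StandardCharsets.UTF_8);

        LZ4Compressor compressor = LZ4Factory.fastestInstance().fastCompressor();
        Checksum outChecksum = XXHashFactory.fastestInstance().newStreamingHash32(LZ4_HASH_SEED).asChecksum();
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        LZ4BlockOutputStream out = new LZ4BlockOutputStream(compressed, 1 << 14, compressor, outChecksum, true);
        out.write(payload);
        out.close();

        LZ4FastDecompressor decompressor = LZ4Factory.fastestInstance().fastDecompressor();
        Checksum inChecksum = XXHashFactory.fastestInstance().newStreamingHash32(LZ4_HASH_SEED).asChecksum();
        LZ4BlockInputStream in = new LZ4BlockInputStream(new ByteArrayInputStream(compressed.toByteArray()), decompressor, inChecksum);
        byte[] roundTripped = new byte[payload.length];
        int read = in.read(roundTripped);   // a single read suffices for this tiny, one-block payload
        in.close();

        System.out.println(read + " bytes: " + new String(roundTripped, StandardCharsets.UTF_8));
    }
}
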
diff --git a/src/java/org/apache/cassandra/net/MessageOut.java b/src/java/org/apache/cassandra/net/MessageOut.java
index dd6cae8..c78b13f 100644
--- a/src/java/org/apache/cassandra/net/MessageOut.java
+++ b/src/java/org/apache/cassandra/net/MessageOut.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
@@ -100,7 +101,7 @@
         return sbuf.toString();
     }
 
-    public void serialize(DataOutputStream out, int version) throws IOException
+    public void serialize(DataOutputPlus out, int version) throws IOException
     {
         CompactEndpointSerializationHelper.serialize(from, out);
 
diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java
index d2e65d8..c6b0098 100644
--- a/src/java/org/apache/cassandra/net/MessagingService.java
+++ b/src/java/org/apache/cassandra/net/MessagingService.java
@@ -50,6 +50,7 @@
 import org.apache.cassandra.gms.GossipDigestAck2;
 import org.apache.cassandra.gms.GossipDigestSyn;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.locator.ILatencySubscriber;
 import org.apache.cassandra.metrics.ConnectionMetrics;
@@ -63,17 +64,17 @@
 import org.apache.cassandra.tracing.TraceState;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 public final class MessagingService implements MessagingServiceMBean
 {
     public static final String MBEAN_NAME = "org.apache.cassandra.net:type=MessagingService";
 
     // 8 bits version, so don't waste versions
-    public static final int VERSION_12  = 6;
-    public static final int VERSION_20  = 7;
-    public static final int current_version = VERSION_20;
-
-    public boolean allNodesAtLeast20 = true;
+    public static final int VERSION_12 = 6;
+    public static final int VERSION_20 = 7;
+    public static final int VERSION_21 = 8;
+    public static final int current_version = VERSION_21;
 
     public static final String FAILURE_CALLBACK_PARAM = "CAL_BAC";
     public static final byte[] ONE_BYTE = new byte[1];
@@ -84,6 +85,8 @@
      */
     public static final int PROTOCOL_MAGIC = 0xCA552DFA;
 
+    private boolean allNodesAtLeast21 = true;
+
     /* All verb handler identifiers */
     public enum Verb
     {
@@ -135,9 +138,9 @@
     public static final EnumMap<MessagingService.Verb, Stage> verbStages = new EnumMap<MessagingService.Verb, Stage>(MessagingService.Verb.class)
     {{
         put(Verb.MUTATION, Stage.MUTATION);
+        put(Verb.COUNTER_MUTATION, Stage.COUNTER_MUTATION);
         put(Verb.READ_REPAIR, Stage.MUTATION);
         put(Verb.TRUNCATE, Stage.MUTATION);
-        put(Verb.COUNTER_MUTATION, Stage.MUTATION);
         put(Verb.PAXOS_PREPARE, Stage.MUTATION);
         put(Verb.PAXOS_PROPOSE, Stage.MUTATION);
         put(Verb.PAXOS_COMMIT, Stage.MUTATION);
@@ -193,8 +196,8 @@
         put(Verb.REQUEST_RESPONSE, CallbackDeterminedSerializer.instance);
         put(Verb.INTERNAL_RESPONSE, CallbackDeterminedSerializer.instance);
 
-        put(Verb.MUTATION, RowMutation.serializer);
-        put(Verb.READ_REPAIR, RowMutation.serializer);
+        put(Verb.MUTATION, Mutation.serializer);
+        put(Verb.READ_REPAIR, Mutation.serializer);
         put(Verb.READ, ReadCommand.serializer);
         put(Verb.RANGE_SLICE, RangeSliceCommand.serializer);
         put(Verb.PAGED_RANGE, PagedRangeCommand.serializer);
@@ -253,7 +256,7 @@
             throw new UnsupportedOperationException();
         }
 
-        public void serialize(Object o, DataOutput out, int version) throws IOException
+        public void serialize(Object o, DataOutputPlus out, int version) throws IOException
         {
             throw new UnsupportedOperationException();
         }
@@ -348,8 +351,9 @@
 
                 if (expiredCallbackInfo.shouldHint())
                 {
-                    RowMutation rm = (RowMutation) ((WriteCallbackInfo) expiredCallbackInfo).sentMessage.payload;
-                    return StorageProxy.submitHint(rm, expiredCallbackInfo.target, null);
+                    Mutation mutation = (Mutation) ((WriteCallbackInfo) expiredCallbackInfo).sentMessage.payload;
+
+                    return StorageProxy.submitHint(mutation, expiredCallbackInfo.target, null);
                 }
 
                 return null;
@@ -393,7 +397,7 @@
      */
     public void convict(InetAddress ep)
     {
-        logger.debug("Resetting pool for " + ep);
+        logger.debug("Resetting pool for {}", ep);
         getConnectionPool(ep).reset();
     }
 
@@ -567,6 +571,7 @@
     {
         assert message.verb == Verb.MUTATION || message.verb == Verb.COUNTER_MUTATION;
         int messageId = nextId();
+
         CallbackInfo previous = callbacks.put(messageId,
                                               new WriteCallbackInfo(to,
                                                                     cb,
@@ -605,7 +610,6 @@
      * @param cb      callback interface which is used to pass the responses or
      *                suggest that a timeout occurred to the invoker of the send().
      * @param timeout the timeout used for expiration
-     * @param failureCallback true if given cb has failure callback
      * @return an reference to message id used to match with the result
      */
     public int sendRR(MessageOut message, InetAddress to, IAsyncCallback cb, long timeout, boolean failureCallback)
@@ -772,19 +776,24 @@
         return packed >>> (start + 1) - count & ~(-1 << count);
     }
 
+    public boolean areAllNodesAtLeast21()
+    {
+        return allNodesAtLeast21;
+    }
+
     /**
      * @return the last version associated with address, or @param version if this is the first such version
      */
     public int setVersion(InetAddress endpoint, int version)
     {
         logger.debug("Setting version {} for {}", version, endpoint);
-        if (version < VERSION_20)
-            allNodesAtLeast20 = false;
+        if (version < VERSION_21)
+            allNodesAtLeast21 = false;
         Integer v = versions.put(endpoint, version);
 
         // if the version was increased to 2.0 or later, see if all nodes are >= 2.0 now
-        if (v != null && v < VERSION_20 && version >= VERSION_20)
-            refreshAllNodesAtLeast20();
+        if (v != null && v < VERSION_21 && version >= VERSION_21)
+            refreshAllNodesAtLeast21();
 
         return v == null ? version : v;
     }
@@ -793,21 +802,21 @@
     {
         logger.debug("Resetting version for {}", endpoint);
         Integer removed = versions.remove(endpoint);
-        if (removed != null && removed <= VERSION_20)
-            refreshAllNodesAtLeast20();
+        if (removed != null && removed <= VERSION_21)
+            refreshAllNodesAtLeast21();
     }
 
-    private void refreshAllNodesAtLeast20()
+    private void refreshAllNodesAtLeast21()
     {
         for (Integer version: versions.values())
         {
-            if (version < VERSION_20)
+            if (version < VERSION_21)
             {
-                allNodesAtLeast20 = false;
+                allNodesAtLeast21 = false;
                 return;
             }
         }
-        allNodesAtLeast20 = true;
+        allNodesAtLeast21 = true;
     }
 
     public int getVersion(InetAddress endpoint)
@@ -841,6 +850,7 @@
         return versions.containsKey(endpoint);
     }
 
+
     public void incrementDroppedMessages(Verb verb)
     {
         assert DROPPABLE_VERBS.contains(verb) : "Verb " + verb + " should not legally be dropped";
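
The allNodesAtLeast21 flag above is cleared as soon as any endpoint reports a version below VERSION_21 and recomputed only when an endpoint upgrades across that threshold (or is removed). A sketch of that bookkeeping, with String keys standing in for InetAddress endpoints:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Minimal version-tracking sketch mirroring setVersion/refreshAllNodesAtLeast21 above.
class VersionTrackingSketch
{
    static final int VERSION_20 = 7;
    static final int VERSION_21 = 8;

    private final Map<String, Integer> versions = new ConcurrentHashMap<>();
    private volatile boolean allNodesAtLeast21 = true;

    void setVersion(String endpoint, int version)
    {
        if (version < VERSION_21)
            allNodesAtLeast21 = false;
        Integer previous = versions.put(endpoint, version);
        // only rescan when an endpoint crosses the threshold upwards
        if (previous != null && previous < VERSION_21 && version >= VERSION_21)
            refresh();
    }

    private void refresh()
    {
        for (int v : versions.values())
        {
            if (v < VERSION_21)
            {
                allNodesAtLeast21 = false;
                return;
            }
        }
        allNodesAtLeast21 = true;
    }

    public static void main(String[] args)
    {
        VersionTrackingSketch tracker = new VersionTrackingSketch();
        tracker.setVersion("10.0.0.1", VERSION_20);
        System.out.println(tracker.allNodesAtLeast21); // false, one node still on 2.0
        tracker.setVersion("10.0.0.1", VERSION_21);
        System.out.println(tracker.allNodesAtLeast21); // true again after the upgrade
    }
}
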
diff --git a/src/java/org/apache/cassandra/net/OutboundTcpConnection.java b/src/java/org/apache/cassandra/net/OutboundTcpConnection.java
index c1de563..e1cb7a2 100644
--- a/src/java/org/apache/cassandra/net/OutboundTcpConnection.java
+++ b/src/java/org/apache/cassandra/net/OutboundTcpConnection.java
@@ -19,12 +19,15 @@
 
 import java.io.BufferedOutputStream;
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
+import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.Socket;
 import java.net.SocketException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
 import java.util.UUID;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.CountDownLatch;
@@ -32,10 +35,16 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.zip.Checksum;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import net.jpountz.lz4.LZ4BlockOutputStream;
+import net.jpountz.lz4.LZ4Compressor;
+import net.jpountz.lz4.LZ4Factory;
+import net.jpountz.xxhash.XXHashFactory;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.tracing.TraceState;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
@@ -58,17 +67,17 @@
     private static final int WAIT_FOR_VERSION_MAX_TIME = 5000;
     private static final int NO_VERSION = Integer.MIN_VALUE;
 
-    // sending thread reads from "active" (one of queue1, queue2) until it is empty.
-    // then it swaps it with "backlog."
-    private volatile BlockingQueue<QueuedMessage> backlog = new LinkedBlockingQueue<QueuedMessage>();
-    private volatile BlockingQueue<QueuedMessage> active = new LinkedBlockingQueue<QueuedMessage>();
+    static final int LZ4_HASH_SEED = 0x9747b28c;
+
+    private final BlockingQueue<QueuedMessage> backlog = new LinkedBlockingQueue<>();
 
     private final OutboundTcpConnectionPool poolReference;
 
-    private DataOutputStream out;
+    private DataOutputStreamPlus out;
     private Socket socket;
     private volatile long completed;
     private final AtomicLong dropped = new AtomicLong();
+    private volatile int currentMsgBufferCount = 0;
     private int targetVersion;
 
     public OutboundTcpConnection(OutboundTcpConnectionPool pool)
@@ -86,7 +95,8 @@
 
     public void enqueue(MessageOut<?> message, int id)
     {
-        expireMessages();
+        if (backlog.size() > 1024)
+            expireMessages();
         try
         {
             backlog.put(new QueuedMessage(message, id));
@@ -99,7 +109,6 @@
 
     void closeSocket(boolean destroyThread)
     {
-        active.clear();
         backlog.clear();
         isStopped = destroyThread; // Exit loop to stop the thread
         enqueue(CLOSE_SENTINEL, -1);
@@ -117,47 +126,61 @@
 
     public void run()
     {
+        // keeping list (batch) size small for now; that way we don't have an unbounded array (that we never resize)
+        final List<QueuedMessage> drainedMessages = new ArrayList<>(128);
+        outer:
         while (true)
         {
-            QueuedMessage qm = active.poll();
-            if (qm == null)
+            if (backlog.drainTo(drainedMessages, drainedMessages.size()) == 0)
             {
-                // exhausted the active queue.  switch to backlog, once there's something to process there
                 try
                 {
-                    qm = backlog.take();
+                    drainedMessages.add(backlog.take());
                 }
                 catch (InterruptedException e)
                 {
                     throw new AssertionError(e);
                 }
 
-                BlockingQueue<QueuedMessage> tmp = backlog;
-                backlog = active;
-                active = tmp;
             }
+            currentMsgBufferCount = drainedMessages.size();
 
-            MessageOut<?> m = qm.message;
-            if (m == CLOSE_SENTINEL)
+            int count = drainedMessages.size();
+            for (QueuedMessage qm : drainedMessages)
             {
-                disconnect();
-                if (isStopped)
-                    break;
-                continue;
+                try
+                {
+                    MessageOut<?> m = qm.message;
+                    if (m == CLOSE_SENTINEL)
+                    {
+                        disconnect();
+                        if (isStopped)
+                            break outer;
+                        continue;
+                    }
+                    if (qm.isTimedOut(m.getTimeout()))
+                        dropped.incrementAndGet();
+                    else if (socket != null || connect())
+                        writeConnected(qm, count == 1 && backlog.size() == 0);
+                    else
+                        // clear out the queue, else gossip messages back up.
+                        backlog.clear();
+                }
+                catch (Exception e)
+                {
+                    // really shouldn't get here, as exception handling in writeConnected() is reasonably robust
+                    // but we want to catch anything bad so we don't drop the remaining messages in the current batch
+                    logger.error("error processing a message intended for {}", poolReference.endPoint(), e);
+                }
+                currentMsgBufferCount = --count;
             }
-            if (qm.isTimedOut(m.getTimeout()))
-                dropped.incrementAndGet();
-            else if (socket != null || connect())
-                writeConnected(qm);
-            else
-                // clear out the queue, else gossip messages back up.
-                active.clear();
+            drainedMessages.clear();
         }
     }
 
     public int getPendingMessages()
     {
-        return active.size() + backlog.size();
+        return backlog.size() + currentMsgBufferCount;
     }
 
     public long getCompletedMesssages()
@@ -177,7 +200,7 @@
                || (DatabaseDescriptor.internodeCompression() == Config.InternodeCompression.dc && !isLocalDC(poolReference.endPoint()));
     }
 
-    private void writeConnected(QueuedMessage qm)
+    private void writeConnected(QueuedMessage qm, boolean flush)
     {
         try
         {
@@ -196,14 +219,14 @@
                 {
                     state.trace(message);
                     if (qm.message.verb == MessagingService.Verb.REQUEST_RESPONSE)
-                        Tracing.instance.stopNonLocal(state);
+                        Tracing.instance.doneWithNonLocalSession(state);
                 }
             }
 
             writeInternal(qm.message, qm.id, qm.timestamp);
 
             completed++;
-            if (active.peek() == null)
+            if (flush)
                 out.flush();
         }
         catch (Exception e)
@@ -212,7 +235,7 @@
             if (e instanceof IOException)
             {
                 if (logger.isDebugEnabled())
-                    logger.debug("error writing to " + poolReference.endPoint(), e);
+                    logger.debug("error writing to {}", poolReference.endPoint(), e);
 
                 // if the message was important, such as a repair acknowledgement, put it back on the queue
                 // to retry after re-connecting.  See CASSANDRA-5393
@@ -231,7 +254,7 @@
             else
             {
                 // Non IO exceptions are likely a programming error so let's not silence them
-                logger.error("error writing to " + poolReference.endPoint(), e);
+                logger.error("error writing to {}", poolReference.endPoint(), e);
             }
         }
     }
@@ -251,7 +274,7 @@
         message.serialize(out, targetVersion);
     }
 
-    private static void writeHeader(DataOutputStream out, int version, boolean compressionEnabled) throws IOException
+    private static void writeHeader(DataOutput out, int version, boolean compressionEnabled) throws IOException
     {
         // 2 bits: unused.  used to be "serializer type," which was always Binary
         // 1 bit: compression
@@ -287,7 +310,7 @@
     private boolean connect()
     {
         if (logger.isDebugEnabled())
-            logger.debug("attempting to connect to " + poolReference.endPoint());
+            logger.debug("attempting to connect to {}", poolReference.endPoint());
 
         long start = System.nanoTime();
         long timeout = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getRpcTimeout());
@@ -317,7 +340,7 @@
                         logger.warn("Failed to set send buffer size on internode socket.", se);
                     }
                 }
-                out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream(), 4096));
+                out = new DataOutputStreamPlus(new BufferedOutputStream(socket.getOutputStream(), 4096));
 
                 out.writeInt(MessagingService.PROTOCOL_MAGIC);
                 writeHeader(out, targetVersion, shouldCompressConnection());
@@ -359,7 +382,22 @@
                 {
                     out.flush();
                     logger.trace("Upgrading OutputStream to be compressed");
-                    out = new DataOutputStream(new SnappyOutputStream(new BufferedOutputStream(socket.getOutputStream())));
+                    if (targetVersion < MessagingService.VERSION_21)
+                    {
+                        // Snappy is buffered, so no need for extra buffering output stream
+                        out = new DataOutputStreamPlus(new SnappyOutputStream(socket.getOutputStream()));
+                    }
+                    else
+                    {
+                        // TODO: custom LZ4 OutputStream that supports ByteBuffer write methods
+                        LZ4Compressor compressor = LZ4Factory.fastestInstance().fastCompressor();
+                        Checksum checksum = XXHashFactory.fastestInstance().newStreamingHash32(LZ4_HASH_SEED).asChecksum();
+                        out = new DataOutputStreamPlus(new LZ4BlockOutputStream(socket.getOutputStream(),
+                                                                            1 << 14,  // 16k block size
+                                                                            compressor,
+                                                                            checksum,
+                                                                            true)); // no async flushing
+                    }
                 }
 
                 return true;
@@ -418,23 +456,13 @@
 
     private void expireMessages()
     {
-        while (true)
+        Iterator<QueuedMessage> iter = backlog.iterator();
+        while (iter.hasNext())
         {
-            QueuedMessage qm = backlog.peek();
-            if (qm == null || qm.timestamp >= System.currentTimeMillis() - qm.message.getTimeout())
-                break;
-
-            QueuedMessage qm2 = backlog.poll();
-            if (qm2 != qm)
-            {
-                // sending thread switched queues.  add this entry (from the "new" backlog)
-                // at the end of the active queue, which keeps it in the same position relative to the other entries
-                // without having to contend with other clients for the head-of-backlog lock.
-                if (qm2 != null)
-                    active.add(qm2);
-                break;
-            }
-
+            QueuedMessage qm = iter.next();
+            if (qm.timestamp >= System.currentTimeMillis() - qm.message.getTimeout())
+                return;
+            iter.remove();
             dropped.incrementAndGet();
         }
     }
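
The reworked run() loop above replaces the two-queue active/backlog scheme with a single backlog that is drained in batches, blocking for one message when nothing is queued and flushing only after the last message of a batch when the backlog is empty. A simplified, single-batch sketch of that pattern; Strings stand in for QueuedMessage and the 128 bound mirrors the drained-list capacity used above.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Drain whatever is queued, block for one message if nothing is available, flush on the last one.
class BatchDrainSketch
{
    public static void main(String[] args) throws InterruptedException
    {
        BlockingQueue<String> backlog = new LinkedBlockingQueue<>();
        backlog.put("a");
        backlog.put("b");
        backlog.put("c");

        List<String> drained = new ArrayList<>(128);
        if (backlog.drainTo(drained, 128) == 0)
            drained.add(backlog.take());       // nothing queued: block for the next message

        int remaining = drained.size();
        for (String message : drained)
        {
            boolean flush = remaining == 1 && backlog.isEmpty();
            System.out.println("send " + message + (flush ? " + flush" : ""));
            remaining--;
        }
        drained.clear();
    }
}
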
diff --git a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
index 1d9aa98..0ec91c6 100644
--- a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
+++ b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
@@ -19,6 +19,7 @@
 
 import java.util.concurrent.TimeUnit;
 
+import org.apache.cassandra.db.IMutation;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/src/java/org/apache/cassandra/notifications/SSTableRepairStatusChanged.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to src/java/org/apache/cassandra/notifications/SSTableRepairStatusChanged.java
index e42574b..a473a43 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/src/java/org/apache/cassandra/notifications/SSTableRepairStatusChanged.java
@@ -1,6 +1,4 @@
-package org.apache.cassandra.io.util;
 /*
- * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +6,28 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
  */
+package org.apache.cassandra.notifications;
 
+import java.util.Collection;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import org.apache.cassandra.io.sstable.SSTableReader;
 
-public class ByteBufferOutputStream extends OutputStream
+public class SSTableRepairStatusChanged implements INotification
 {
-    private final ByteBuffer buffer;
+    public final Collection<SSTableReader> sstable;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public SSTableRepairStatusChanged(Collection<SSTableReader> repairStatusChanged)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
+        this.sstable = repairStatusChanged;
     }
 }
diff --git a/src/java/org/apache/cassandra/repair/NodePair.java b/src/java/org/apache/cassandra/repair/NodePair.java
index fb7a72a..bb6be04 100644
--- a/src/java/org/apache/cassandra/repair/NodePair.java
+++ b/src/java/org/apache/cassandra/repair/NodePair.java
@@ -18,13 +18,13 @@
 package org.apache.cassandra.repair;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 
 import com.google.common.base.Objects;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
 /**
@@ -63,7 +63,7 @@
 
     public static class NodePairSerializer implements IVersionedSerializer<NodePair>
     {
-        public void serialize(NodePair nodePair, DataOutput out, int version) throws IOException
+        public void serialize(NodePair nodePair, DataOutputPlus out, int version) throws IOException
         {
             CompactEndpointSerializationHelper.serialize(nodePair.endpoint1, out);
             CompactEndpointSerializationHelper.serialize(nodePair.endpoint2, out);
diff --git a/src/java/org/apache/cassandra/repair/RepairJob.java b/src/java/org/apache/cassandra/repair/RepairJob.java
index 931f95a..8057ed5 100644
--- a/src/java/org/apache/cassandra/repair/RepairJob.java
+++ b/src/java/org/apache/cassandra/repair/RepairJob.java
@@ -33,7 +33,7 @@
 import org.apache.cassandra.repair.messages.ValidationRequest;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MerkleTree;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 /**
  * RepairJob runs repair on given ColumnFamily.
@@ -64,6 +64,7 @@
      * Create repair job to run on specific columnfamily
      */
     public RepairJob(IRepairJobEventListener listener,
+                     UUID parentSessionId,
                      UUID sessionId,
                      String keyspace,
                      String columnFamily,
@@ -72,7 +73,7 @@
                      ListeningExecutorService taskExecutor)
     {
         this.listener = listener;
-        this.desc = new RepairJobDesc(sessionId, keyspace, columnFamily, range);
+        this.desc = new RepairJobDesc(parentSessionId, sessionId, keyspace, columnFamily, range);
         this.isSequential = isSequential;
         this.taskExecutor = taskExecutor;
         this.treeRequests = new RequestCoordinator<InetAddress>(isSequential)
@@ -122,6 +123,7 @@
 
                 public void onFailure(Throwable throwable)
                 {
+                    // TODO need to propagate error to RepairSession
                     logger.error("Error occurred during snapshot phase", throwable);
                     listener.failedSnapshot();
                     failed = true;
diff --git a/src/java/org/apache/cassandra/repair/RepairJobDesc.java b/src/java/org/apache/cassandra/repair/RepairJobDesc.java
index 596540f..5ce5969 100644
--- a/src/java/org/apache/cassandra/repair/RepairJobDesc.java
+++ b/src/java/org/apache/cassandra/repair/RepairJobDesc.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.repair;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.UUID;
 
@@ -29,6 +28,8 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
 
 /**
@@ -40,6 +41,7 @@
 {
     public static final IVersionedSerializer<RepairJobDesc> serializer = new RepairJobDescSerializer();
 
+    public final UUID parentSessionId;
     /** RepairSession id */
     public final UUID sessionId;
     public final String keyspace;
@@ -47,8 +49,9 @@
     /** repairing range  */
     public final Range<Token> range;
 
-    public RepairJobDesc(UUID sessionId, String keyspace, String columnFamily, Range<Token> range)
+    public RepairJobDesc(UUID parentSessionId, UUID sessionId, String keyspace, String columnFamily, Range<Token> range)
     {
+        this.parentSessionId = parentSessionId;
         this.sessionId = sessionId;
         this.keyspace = keyspace;
         this.columnFamily = columnFamily;
@@ -58,13 +61,7 @@
     @Override
     public String toString()
     {
-        StringBuilder sb = new StringBuilder("[repair #");
-        sb.append(sessionId);
-        sb.append(" on ");
-        sb.append(keyspace).append("/").append(columnFamily);
-        sb.append(", ").append(range);
-        sb.append("]");
-        return sb.toString();
+        return "[repair #" + sessionId + " on " + keyspace + "/" + columnFamily + ", " + range + "]";
     }
 
     @Override
@@ -79,6 +76,7 @@
         if (!keyspace.equals(that.keyspace)) return false;
         if (range != null ? !range.equals(that.range) : that.range != null) return false;
         if (!sessionId.equals(that.sessionId)) return false;
+        if (parentSessionId != null ? !parentSessionId.equals(that.parentSessionId) : that.parentSessionId != null) return false;
 
         return true;
     }
@@ -91,8 +89,14 @@
 
     private static class RepairJobDescSerializer implements IVersionedSerializer<RepairJobDesc>
     {
-        public void serialize(RepairJobDesc desc, DataOutput out, int version) throws IOException
+        public void serialize(RepairJobDesc desc, DataOutputPlus out, int version) throws IOException
         {
+            if (version >= MessagingService.VERSION_21)
+            {
+                out.writeBoolean(desc.parentSessionId != null);
+                if (desc.parentSessionId != null)
+                    UUIDSerializer.serializer.serialize(desc.parentSessionId, out, version);
+            }
             UUIDSerializer.serializer.serialize(desc.sessionId, out, version);
             out.writeUTF(desc.keyspace);
             out.writeUTF(desc.columnFamily);
@@ -101,16 +105,28 @@
 
         public RepairJobDesc deserialize(DataInput in, int version) throws IOException
         {
+            UUID parentSessionId = null;
+            if (version >= MessagingService.VERSION_21)
+            {
+                if (in.readBoolean())
+                    parentSessionId = UUIDSerializer.serializer.deserialize(in, version);
+            }
             UUID sessionId = UUIDSerializer.serializer.deserialize(in, version);
             String keyspace = in.readUTF();
             String columnFamily = in.readUTF();
             Range<Token> range = (Range<Token>)AbstractBounds.serializer.deserialize(in, version);
-            return new RepairJobDesc(sessionId, keyspace, columnFamily, range);
+            return new RepairJobDesc(parentSessionId, sessionId, keyspace, columnFamily, range);
         }
 
         public long serializedSize(RepairJobDesc desc, int version)
         {
             int size = 0;
+            if (version >= MessagingService.VERSION_21)
+            {
+                size += TypeSizes.NATIVE.sizeof(desc.parentSessionId != null);
+                if (desc.parentSessionId != null)
+                    size += UUIDSerializer.serializer.serializedSize(desc.parentSessionId, version);
+            }
             size += UUIDSerializer.serializer.serializedSize(desc.sessionId, version);
             size += TypeSizes.NATIVE.sizeof(desc.keyspace);
             size += TypeSizes.NATIVE.sizeof(desc.columnFamily);
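
Note: RepairJobDescSerializer above only carries parentSessionId when the peer speaks messaging version 2.1 or later, writing a presence flag first so a null parent session stays representable and older nodes see an unchanged layout. A minimal, self-contained sketch of that optional-field pattern follows; it uses plain java.io streams instead of Cassandra's DataOutputPlus, and the numeric value of VERSION_21 is assumed for illustration only.

    import java.io.*;
    import java.util.UUID;

    // Sketch only: version-gated optional field, as used for RepairJobDesc.parentSessionId.
    final class OptionalUuidCodec
    {
        static final int VERSION_21 = 8; // assumed wire-version constant, for illustration

        static void serialize(UUID maybeId, DataOutput out, int version) throws IOException
        {
            if (version >= VERSION_21)
            {
                out.writeBoolean(maybeId != null);           // presence flag
                if (maybeId != null)
                {
                    out.writeLong(maybeId.getMostSignificantBits());
                    out.writeLong(maybeId.getLeastSignificantBits());
                }
            }
            // pre-2.1 peers never see the field at all
        }

        static UUID deserialize(DataInput in, int version) throws IOException
        {
            if (version >= VERSION_21 && in.readBoolean())
                return new UUID(in.readLong(), in.readLong());
            return null; // absent on old versions, or unset
        }
    }
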
diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java
index acc8aab..54117a3 100644
--- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java
+++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java
@@ -17,15 +17,33 @@
  */
 package org.apache.cassandra.repair;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.Future;
+
+import com.google.common.base.Predicate;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.net.IVerbHandler;
 import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.repair.messages.RepairMessage;
-import org.apache.cassandra.repair.messages.SyncRequest;
-import org.apache.cassandra.repair.messages.ValidationRequest;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.repair.messages.*;
 import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
 
 /**
  * Handles all repair related message.
@@ -34,16 +52,50 @@
  */
 public class RepairMessageVerbHandler implements IVerbHandler<RepairMessage>
 {
+    private static final Logger logger = LoggerFactory.getLogger(RepairMessageVerbHandler.class);
     public void doVerb(MessageIn<RepairMessage> message, int id)
     {
         // TODO add cancel/interrupt message
         RepairJobDesc desc = message.payload.desc;
         switch (message.payload.messageType)
         {
+            case PREPARE_MESSAGE:
+                PrepareMessage prepareMessage = (PrepareMessage) message.payload;
+                List<ColumnFamilyStore> columnFamilyStores = new ArrayList<>(prepareMessage.cfIds.size());
+                for (UUID cfId : prepareMessage.cfIds)
+                {
+                    Pair<String, String> kscf = Schema.instance.getCF(cfId);
+                    ColumnFamilyStore columnFamilyStore = Keyspace.open(kscf.left).getColumnFamilyStore(kscf.right);
+                    columnFamilyStores.add(columnFamilyStore);
+                }
+                ActiveRepairService.instance.registerParentRepairSession(prepareMessage.parentRepairSession,
+                                                                         columnFamilyStores,
+                                                                         prepareMessage.ranges);
+                MessagingService.instance().sendReply(new MessageOut(MessagingService.Verb.INTERNAL_RESPONSE), id, message.from);
+                break;
+
+            case SNAPSHOT:
+                ColumnFamilyStore cfs = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily);
+                final Range<Token> repairingRange = desc.range;
+                cfs.snapshot(desc.sessionId.toString(), new Predicate<SSTableReader>()
+                {
+                    public boolean apply(SSTableReader sstable)
+                    {
+                        return sstable != null &&
+                               !(sstable.partitioner instanceof LocalPartitioner) && // exclude SSTables from 2i
+                               new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(Collections.singleton(repairingRange));
+                    }
+                });
+
+                logger.debug("Enqueuing response to snapshot request {} to {}", desc.sessionId, message.from);
+                MessagingService.instance().sendReply(new MessageOut(MessagingService.Verb.INTERNAL_RESPONSE), id, message.from);
+                break;
+
             case VALIDATION_REQUEST:
                 ValidationRequest validationRequest = (ValidationRequest) message.payload;
                 // trigger read-only compaction
                 ColumnFamilyStore store = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily);
+
                 Validator validator = new Validator(desc, message.from, validationRequest.gcBefore);
                 CompactionManager.instance.submitValidation(store, validator);
                 break;
@@ -55,6 +107,21 @@
                 task.run();
                 break;
 
+            case ANTICOMPACTION_REQUEST:
+                logger.debug("Got anticompaction request");
+                AnticompactionRequest anticompactionRequest = (AnticompactionRequest) message.payload;
+                try
+                {
+                    List<Future<?>> futures = ActiveRepairService.instance.doAntiCompaction(anticompactionRequest.parentRepairSession);
+                    FBUtilities.waitOnFutures(futures);
+                }
+                catch (Exception e)
+                {
+                    throw new RuntimeException(e);
+                }
+
+                break;
+
             default:
                 ActiveRepairService.instance.handleMessage(message.from, message.payload);
                 break;
diff --git a/src/java/org/apache/cassandra/repair/RepairSession.java b/src/java/org/apache/cassandra/repair/RepairSession.java
index c9a9671..346f3f4 100644
--- a/src/java/org/apache/cassandra/repair/RepairSession.java
+++ b/src/java/org/apache/cassandra/repair/RepairSession.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.gms.*;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 /**
  * Coordinates the (active) repair of a token range.
@@ -90,9 +91,11 @@
 
     private volatile Exception exception;
     private final AtomicBoolean isFailed = new AtomicBoolean(false);
+    private final AtomicBoolean fdUnregistered = new AtomicBoolean(false);
 
     // First, all RepairJobs are added to this queue,
     final Queue<RepairJob> jobs = new ConcurrentLinkedQueue<>();
+
     // and after receiving all validation, the job is moved to
     // this map, keyed by CF name.
     final Map<String, RepairJob> syncingJobs = new ConcurrentHashMap<>();
@@ -102,6 +105,7 @@
 
     private final SimpleCondition completed = new SimpleCondition();
     public final Condition differencingDone = new SimpleCondition();
+    public final UUID parentRepairSession;
 
     private volatile boolean terminated = false;
 
@@ -111,23 +115,24 @@
      * @param range range to repair
      * @param keyspace name of keyspace
      * @param isSequential true if performing repair on snapshots sequentially
-     * @param dataCenters the data centers that should be part of the repair; null for all DCs
+     * @param endpoints the data centers that should be part of the repair; null for all DCs
      * @param cfnames names of columnfamilies
      */
-    public RepairSession(Range<Token> range, String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, String... cfnames)
+    public RepairSession(UUID parentRepairSession, Range<Token> range, String keyspace, boolean isSequential, Set<InetAddress> endpoints, String... cfnames)
     {
-        this(UUIDGen.getTimeUUID(), range, keyspace, isSequential, dataCenters, hosts, cfnames);
+        this(parentRepairSession, UUIDGen.getTimeUUID(), range, keyspace, isSequential, endpoints, cfnames);
     }
 
-    public RepairSession(UUID id, Range<Token> range, String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, String[] cfnames)
+    public RepairSession(UUID parentRepairSession, UUID id, Range<Token> range, String keyspace, boolean isSequential, Set<InetAddress> endpoints, String[] cfnames)
     {
+        this.parentRepairSession = parentRepairSession;
         this.id = id;
         this.isSequential = isSequential;
         this.keyspace = keyspace;
         this.cfnames = cfnames;
         assert cfnames.length > 0 : "Repairing no column families seems pointless, doesn't it";
         this.range = range;
-        this.endpoints = ActiveRepairService.getNeighbors(keyspace, range, dataCenters, hosts);
+        this.endpoints = endpoints;
     }
 
     public UUID getId()
@@ -168,23 +173,32 @@
         assert job.desc.equals(desc);
         if (job.addTree(endpoint, tree) == 0)
         {
-            logger.debug("All response received for " + getId() + "/" + desc.columnFamily);
+            logger.debug("All responses received for {}/{}", getId(), desc.columnFamily);
             if (!job.isFailed())
             {
                 syncingJobs.put(job.desc.columnFamily, job);
                 job.submitDifferencers();
             }
 
-            // This job is complete, switching to next in line (note that only
-            // one thread will can ever do this)
+            // This job is complete, switching to next in line (note that only one thread will ever do this)
             jobs.poll();
             RepairJob nextJob = jobs.peek();
             if (nextJob == null)
+            {
+                // Unregister from FailureDetector once we've completed synchronizing Merkle trees.
+                // After this point, we rely on tcp_keepalive for individual sockets to notify us when a connection is down.
+                // See CASSANDRA-3569
+                if (fdUnregistered.compareAndSet(false, true))
+                    FailureDetector.instance.unregisterFailureDetectionEventListener(this);
+
                 // We are done with this repair session as far as differencing
                 // is considered. Just inform the session
                 differencingDone.signalAll();
+            }
             else
+            {
                 nextJob.sendTreeRequests(endpoints);
+            }
         }
     }
 
@@ -259,7 +273,7 @@
             {
                 String message = String.format("Cannot proceed on repair because a neighbor (%s) is dead: session failed", endpoint);
                 differencingDone.signalAll();
-                logger.error(String.format("[repair #%s] ", getId()) + message);
+                logger.error("[repair #{}] {}", getId(), message);
                 throw new IOException(message);
             }
         }
@@ -270,15 +284,16 @@
             // Create and queue a RepairJob for each column family
             for (String cfname : cfnames)
             {
-                RepairJob job = new RepairJob(this, id, keyspace, cfname, range, isSequential, taskExecutor);
+                RepairJob job = new RepairJob(this, parentRepairSession, id, keyspace, cfname, range, isSequential, taskExecutor);
                 jobs.offer(job);
             }
-
+            logger.debug("Sending tree requests to endpoints {}", endpoints);
             jobs.peek().sendTreeRequests(endpoints);
 
             // block whatever thread started this session until all requests have been returned:
             // if this thread dies, the session will still complete in the background
             completed.await();
+
             if (exception == null)
             {
                 logger.info(String.format("[repair #%s] session completed successfully", getId()));
@@ -297,7 +312,13 @@
         {
             // mark this session as terminated
             terminate();
+
             ActiveRepairService.instance.removeFromActiveSessions(this);
+
+            // If we've reached here in an exception state without completing Merkle Tree sync, we'll still be registered
+            // with the FailureDetector.
+            if (fdUnregistered.compareAndSet(false, true))
+                FailureDetector.instance.unregisterFailureDetectionEventListener(this);
         }
     }
 
@@ -328,7 +349,7 @@
     {
         String errorMsg = String.format("Endpoint %s died", remote);
         exception = new IOException(errorMsg);
-        // If a node failed, we stop everything (though there could still be some activity in the background)
+        // If a node failed during Merkle creation, we stop everything (though there could still be some activity in the background)
         forceShutdown();
     }
 
@@ -353,8 +374,8 @@
         if (!endpoints.contains(endpoint))
             return;
 
-        // We want a higher confidence in the failure detection than usual because failing a repair wrongly has a high cost (CASSANDRA-7063)
-        if (phi < 100 * DatabaseDescriptor.getPhiConvictThreshold())
+        // We want a higher confidence in the failure detection than usual because failing a repair wrongly has a high cost.
+        if (phi < 2 * DatabaseDescriptor.getPhiConvictThreshold())
             return;
 
         // Though unlikely, it is possible to arrive here multiple time and we
diff --git a/src/java/org/apache/cassandra/repair/SnapshotTask.java b/src/java/org/apache/cassandra/repair/SnapshotTask.java
index 09e8104..6c3afb1 100644
--- a/src/java/org/apache/cassandra/repair/SnapshotTask.java
+++ b/src/java/org/apache/cassandra/repair/SnapshotTask.java
@@ -18,15 +18,14 @@
 package org.apache.cassandra.repair;
 
 import java.net.InetAddress;
-import java.util.List;
 import java.util.concurrent.RunnableFuture;
 
 import com.google.common.util.concurrent.AbstractFuture;
 
-import org.apache.cassandra.db.SnapshotCommand;
 import org.apache.cassandra.net.IAsyncCallbackWithFailure;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.repair.messages.SnapshotMessage;
 
 /**
  * SnapshotTask is a task that sends snapshot request.
@@ -44,12 +43,9 @@
 
     public void run()
     {
-        MessagingService.instance().sendRRWithFailure(new SnapshotCommand(desc.keyspace,
-                                                                          desc.columnFamily,
-                                                                          desc.sessionId.toString(),
-                                                                          false).createMessage(),
-                                                      endpoint,
-                                                      new SnapshotCallback(this));
+        MessagingService.instance().sendRRWithFailure(new SnapshotMessage(desc).createMessage(),
+                endpoint,
+                new SnapshotCallback(this));
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/repair/StreamingRepairTask.java b/src/java/org/apache/cassandra/repair/StreamingRepairTask.java
index f7203a4..9af949d 100644
--- a/src/java/org/apache/cassandra/repair/StreamingRepairTask.java
+++ b/src/java/org/apache/cassandra/repair/StreamingRepairTask.java
@@ -23,6 +23,7 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.messages.SyncComplete;
 import org.apache.cassandra.repair.messages.SyncRequest;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.*;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -56,8 +57,12 @@
 
     private void initiateStreaming()
     {
+        long repairedAt = ActiveRepairService.UNREPAIRED_SSTABLE;
+        if (desc.parentSessionId != null && ActiveRepairService.instance.getParentRepairSession(desc.parentSessionId) != null)
+            repairedAt = ActiveRepairService.instance.getParentRepairSession(desc.parentSessionId).repairedAt;
+
         logger.info(String.format("[streaming task #%s] Performing streaming repair of %d ranges with %s", desc.sessionId, request.ranges.size(), request.dst));
-        StreamResultFuture op = new StreamPlan("Repair")
+        StreamResultFuture op = new StreamPlan("Repair", repairedAt, 1)
                                     .flushBeforeTransfer(true)
                                     // request ranges from the remote node
                                     .requestRanges(request.dst, desc.keyspace, request.ranges, desc.columnFamily)
diff --git a/src/java/org/apache/cassandra/repair/Validator.java b/src/java/org/apache/cassandra/repair/Validator.java
index abf5eac..641717e 100644
--- a/src/java/org/apache/cassandra/repair/Validator.java
+++ b/src/java/org/apache/cassandra/repair/Validator.java
@@ -29,7 +29,6 @@
 
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.compaction.AbstractCompactedRow;
@@ -52,41 +51,32 @@
 
     public final RepairJobDesc desc;
     public final InetAddress initiator;
-    public final MerkleTree tree;
     public final int gcBefore;
 
     // null when all rows with the min token have been consumed
-    private transient long validated;
-    private transient MerkleTree.TreeRange range;
-    private transient MerkleTree.TreeRangeIterator ranges;
-    private transient DecoratedKey lastKey;
+    private long validated;
+    private MerkleTree tree;
+    // current range being updated
+    private MerkleTree.TreeRange range;
+    // iterator for iterating sub ranges (MT's leaves)
+    private MerkleTree.TreeRangeIterator ranges;
+    // last key seen
+    private DecoratedKey lastKey;
 
-    /**
-     * Create Validator with default size of initial Merkle Tree.
-     */
     public Validator(RepairJobDesc desc, InetAddress initiator, int gcBefore)
     {
-        this(desc,
-             initiator,
-             // TODO: memory usage (maxsize) should either be tunable per
-             // CF, globally, or as shared for all CFs in a cluster
-             new MerkleTree(DatabaseDescriptor.getPartitioner(), desc.range, MerkleTree.RECOMMENDED_DEPTH, (int)Math.pow(2, 15)),
-             gcBefore);
-    }
-
-    public Validator(RepairJobDesc desc, InetAddress initiator, MerkleTree tree, int gcBefore)
-    {
         this.desc = desc;
         this.initiator = initiator;
-        this.tree = tree;
         this.gcBefore = gcBefore;
         validated = 0;
         range = null;
         ranges = null;
     }
 
-    public void prepare(ColumnFamilyStore cfs)
+    public void prepare(ColumnFamilyStore cfs, MerkleTree tree)
     {
+        this.tree = tree;
+
         if (!tree.partitioner().preservesOrder())
         {
             // You can't beat an even tree distribution for md5
@@ -97,7 +87,7 @@
             List<DecoratedKey> keys = new ArrayList<>();
             for (DecoratedKey sample : cfs.keySamples(desc.range))
             {
-                assert desc.range.contains(sample.token): "Token " + sample.token + " is not within range " + desc.range;
+                assert desc.range.contains(sample.getToken()): "Token " + sample.getToken() + " is not within range " + desc.range;
                 keys.add(sample);
             }
 
@@ -114,12 +104,12 @@
                 while (true)
                 {
                     DecoratedKey dk = keys.get(random.nextInt(numkeys));
-                    if (!tree.split(dk.token))
+                    if (!tree.split(dk.getToken()))
                         break;
                 }
             }
         }
-        logger.debug("Prepared AEService tree of size " + tree.size() + " for " + desc);
+        logger.debug("Prepared AEService tree of size {} for {}", tree.size(), desc);
         ranges = tree.invalids();
     }
 
@@ -131,7 +121,7 @@
      */
     public void add(AbstractCompactedRow row)
     {
-        assert desc.range.contains(row.key.token) : row.key.token + " is not contained in " + desc.range;
+        assert desc.range.contains(row.key.getToken()) : row.key.getToken() + " is not contained in " + desc.range;
         assert lastKey == null || lastKey.compareTo(row.key) < 0
                : "row " + row.key + " received out of order wrt " + lastKey;
         lastKey = row.key;
@@ -140,7 +130,7 @@
             range = ranges.next();
 
         // generate new ranges as long as case 1 is true
-        while (!range.contains(row.key.token))
+        while (!range.contains(row.key.getToken()))
         {
             // add the empty hash, and move to the next range
             range.ensureHashInitialised();
@@ -196,7 +186,7 @@
         // MerkleTree uses XOR internally, so we want lots of output bits here
         CountingDigest digest = new CountingDigest(FBUtilities.newMessageDigest("SHA-256"));
         row.update(digest);
-        return new MerkleTree.RowHash(row.key.token, digest.digest(), digest.count);
+        return new MerkleTree.RowHash(row.key.getToken(), digest.digest(), digest.count);
     }
 
     /**
@@ -239,7 +229,7 @@
      */
     public void fail()
     {
-        logger.error("Failed creating a merkle tree for " + desc + ", " + initiator + " (see log for details)");
+        logger.error("Failed creating a merkle tree for {}, {} (see log for details)", desc, initiator);
         // send fail message only to nodes >= version 2.0
         MessagingService.instance().sendOneWay(new ValidationComplete(desc).createMessage(), initiator);
     }
diff --git a/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java b/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java
new file mode 100644
index 0000000..34ea5a5
--- /dev/null
+++ b/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.repair.messages;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.UUID;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+public class AnticompactionRequest extends RepairMessage
+{
+    public static MessageSerializer serializer = new AnticompactionRequestSerializer();
+    public final UUID parentRepairSession;
+
+    public AnticompactionRequest(UUID parentRepairSession)
+    {
+        super(Type.ANTICOMPACTION_REQUEST, null);
+        this.parentRepairSession = parentRepairSession;
+    }
+
+    public static class AnticompactionRequestSerializer implements MessageSerializer<AnticompactionRequest>
+    {
+        public void serialize(AnticompactionRequest message, DataOutputPlus out, int version) throws IOException
+        {
+            UUIDSerializer.serializer.serialize(message.parentRepairSession, out, version);
+        }
+
+        public AnticompactionRequest deserialize(DataInput in, int version) throws IOException
+        {
+            UUID parentRepairSession = UUIDSerializer.serializer.deserialize(in, version);
+            return new AnticompactionRequest(parentRepairSession);
+        }
+
+        public long serializedSize(AnticompactionRequest message, int version)
+        {
+            return UUIDSerializer.serializer.serializedSize(message.parentRepairSession, version);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java b/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java
new file mode 100644
index 0000000..5699677
--- /dev/null
+++ b/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.repair.messages;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+
+public class PrepareMessage extends RepairMessage
+{
+    public final static MessageSerializer serializer = new PrepareMessageSerializer();
+    public final List<UUID> cfIds;
+    public final Collection<Range<Token>> ranges;
+
+    public final UUID parentRepairSession;
+
+    public PrepareMessage(UUID parentRepairSession, List<UUID> cfIds, Collection<Range<Token>> ranges)
+    {
+        super(Type.PREPARE_MESSAGE, null);
+        this.parentRepairSession = parentRepairSession;
+        this.cfIds = cfIds;
+        this.ranges = ranges;
+    }
+
+    public static class PrepareMessageSerializer implements MessageSerializer<PrepareMessage>
+    {
+        public void serialize(PrepareMessage message, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeInt(message.cfIds.size());
+            for (UUID cfId : message.cfIds)
+                UUIDSerializer.serializer.serialize(cfId, out, version);
+            UUIDSerializer.serializer.serialize(message.parentRepairSession, out, version);
+            out.writeInt(message.ranges.size());
+            for (Range r : message.ranges)
+                Range.serializer.serialize(r, out, version);
+        }
+
+        public PrepareMessage deserialize(DataInput in, int version) throws IOException
+        {
+            int cfIdCount = in.readInt();
+            List<UUID> cfIds = new ArrayList<>(cfIdCount);
+            for (int i = 0; i < cfIdCount; i++)
+                cfIds.add(UUIDSerializer.serializer.deserialize(in, version));
+            UUID parentRepairSession = UUIDSerializer.serializer.deserialize(in, version);
+            int rangeCount = in.readInt();
+            List<Range<Token>> ranges = new ArrayList<>(rangeCount);
+            for (int i = 0; i < rangeCount; i++)
+                ranges.add((Range<Token>) Range.serializer.deserialize(in, version).toTokenBounds());
+            return new PrepareMessage(parentRepairSession, cfIds, ranges);
+        }
+
+        public long serializedSize(PrepareMessage message, int version)
+        {
+            long size;
+            TypeSizes sizes = TypeSizes.NATIVE;
+            size = sizes.sizeof(message.cfIds.size());
+            for (UUID cfId : message.cfIds)
+                size += UUIDSerializer.serializer.serializedSize(cfId, version);
+            size += UUIDSerializer.serializer.serializedSize(message.parentRepairSession, version);
+            size += sizes.sizeof(message.ranges.size());
+            for (Range r : message.ranges)
+                size += Range.serializer.serializedSize(r, version);
+            return size;
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return "PrepareMessage{" +
+                "cfIds='" + cfIds + '\'' +
+                ", ranges=" + ranges +
+                ", parentRepairSession=" + parentRepairSession +
+                '}';
+    }
+}
diff --git a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java
index f546410..d500928 100644
--- a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java
+++ b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.repair.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.RepairJobDesc;
@@ -42,7 +42,10 @@
         VALIDATION_REQUEST(0, ValidationRequest.serializer),
         VALIDATION_COMPLETE(1, ValidationComplete.serializer),
         SYNC_REQUEST(2, SyncRequest.serializer),
-        SYNC_COMPLETE(3, SyncComplete.serializer);
+        SYNC_COMPLETE(3, SyncComplete.serializer),
+        ANTICOMPACTION_REQUEST(4, AnticompactionRequest.serializer),
+        PREPARE_MESSAGE(5, PrepareMessage.serializer),
+        SNAPSHOT(6, SnapshotMessage.serializer);
 
         private final byte type;
         private final MessageSerializer<RepairMessage> serializer;
@@ -80,7 +83,7 @@
 
     public static class RepairMessageSerializer implements IVersionedSerializer<RepairMessage>
     {
-        public void serialize(RepairMessage message, DataOutput out, int version) throws IOException
+        public void serialize(RepairMessage message, DataOutputPlus out, int version) throws IOException
         {
             out.write(message.messageType.type);
             message.messageType.serializer.serialize(message, out, version);
diff --git a/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java b/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java
new file mode 100644
index 0000000..caccc82
--- /dev/null
+++ b/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.repair.messages;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.repair.RepairJobDesc;
+
+public class SnapshotMessage extends RepairMessage
+{
+    public final static MessageSerializer serializer = new SnapshotMessageSerializer();
+
+    public SnapshotMessage(RepairJobDesc desc)
+    {
+        super(Type.SNAPSHOT, desc);
+    }
+
+    public static class SnapshotMessageSerializer implements MessageSerializer<SnapshotMessage>
+    {
+        public void serialize(SnapshotMessage message, DataOutputPlus out, int version) throws IOException
+        {
+            RepairJobDesc.serializer.serialize(message.desc, out, version);
+        }
+
+        public SnapshotMessage deserialize(DataInput in, int version) throws IOException
+        {
+            RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version);
+            return new SnapshotMessage(desc);
+        }
+
+        public long serializedSize(SnapshotMessage message, int version)
+        {
+            return RepairJobDesc.serializer.serializedSize(message.desc, version);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/repair/messages/SyncComplete.java b/src/java/org/apache/cassandra/repair/messages/SyncComplete.java
index b54492e..c9548ca 100644
--- a/src/java/org/apache/cassandra/repair/messages/SyncComplete.java
+++ b/src/java/org/apache/cassandra/repair/messages/SyncComplete.java
@@ -18,11 +18,11 @@
 package org.apache.cassandra.repair.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.NodePair;
 import org.apache.cassandra.repair.RepairJobDesc;
 
@@ -55,7 +55,7 @@
 
     private static class SyncCompleteSerializer implements MessageSerializer<SyncComplete>
     {
-        public void serialize(SyncComplete message, DataOutput out, int version) throws IOException
+        public void serialize(SyncComplete message, DataOutputPlus out, int version) throws IOException
         {
             RepairJobDesc.serializer.serialize(message.desc, out, version);
             NodePair.serializer.serialize(message.nodes, out, version);
diff --git a/src/java/org/apache/cassandra/repair/messages/SyncRequest.java b/src/java/org/apache/cassandra/repair/messages/SyncRequest.java
index 042e35d..e677cd8 100644
--- a/src/java/org/apache/cassandra/repair/messages/SyncRequest.java
+++ b/src/java/org/apache/cassandra/repair/messages/SyncRequest.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.repair.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.ArrayList;
@@ -29,6 +28,7 @@
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 import org.apache.cassandra.repair.RepairJobDesc;
 
@@ -58,7 +58,7 @@
 
     public static class SyncRequestSerializer implements MessageSerializer<SyncRequest>
     {
-        public void serialize(SyncRequest message, DataOutput out, int version) throws IOException
+        public void serialize(SyncRequest message, DataOutputPlus out, int version) throws IOException
         {
             RepairJobDesc.serializer.serialize(message.desc, out, version);
             CompactEndpointSerializationHelper.serialize(message.initiator, out);
diff --git a/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java b/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java
index 4ddbc2e..8328979 100644
--- a/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java
+++ b/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.repair.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.RepairJobDesc;
 import org.apache.cassandra.utils.MerkleTree;
 
@@ -56,7 +56,7 @@
 
     private static class ValidationCompleteSerializer implements MessageSerializer<ValidationComplete>
     {
-        public void serialize(ValidationComplete message, DataOutput out, int version) throws IOException
+        public void serialize(ValidationComplete message, DataOutputPlus out, int version) throws IOException
         {
             RepairJobDesc.serializer.serialize(message.desc, out, version);
             out.writeBoolean(message.success);
diff --git a/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java b/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java
index 1ebbb1a..c73b708 100644
--- a/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java
+++ b/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java
@@ -18,10 +18,10 @@
 package org.apache.cassandra.repair.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.RepairJobDesc;
 
 /**
@@ -59,7 +59,7 @@
 
     public static class ValidationRequestSerializer implements MessageSerializer<ValidationRequest>
     {
-        public void serialize(ValidationRequest message, DataOutput out, int version) throws IOException
+        public void serialize(ValidationRequest message, DataOutputPlus out, int version) throws IOException
         {
             RepairJobDesc.serializer.serialize(message.desc, out, version);
             out.writeInt(message.gcBefore);
diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
index 83a391d..2a5e809 100644
--- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
@@ -21,31 +21,122 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
+import org.apache.cassandra.utils.ByteBufferUtil;
+
 public abstract class CollectionSerializer<T> implements TypeSerializer<T>
 {
+    protected abstract List<ByteBuffer> serializeValues(T value);
+    protected abstract int getElementCount(T value);
+
+    public abstract T deserializeForNativeProtocol(ByteBuffer buffer, int version);
+    public abstract void validateForNativeProtocol(ByteBuffer buffer, int version);
+
+    public ByteBuffer serialize(T value)
+    {
+        List<ByteBuffer> values = serializeValues(value);
+        // See deserialize() for why using the protocol v3 variant is the right thing to do.
+        return pack(values, getElementCount(value), 3);
+    }
+
+    public T deserialize(ByteBuffer bytes)
+    {
+        // The only cases where we serialize/deserialize collections internally (i.e. not for the protocol's sake)
+        // are:
+        //  1) when collections are in UDT values
+        //  2) for internal calls.
+        // In both cases, using the protocol v3 variant is the right thing to do.
+        return deserializeForNativeProtocol(bytes, 3);
+    }
+
+    public ByteBuffer reserializeToV3(ByteBuffer bytes)
+    {
+        return serialize(deserializeForNativeProtocol(bytes, 2));
+    }
+
     public void validate(ByteBuffer bytes) throws MarshalException
     {
-        // The collection is not currently being properly validated.
+        // Same as above: use the protocol v3 variant for internal validation
+        validateForNativeProtocol(bytes, 3);
     }
 
-    // Utilitary method
-    protected static ByteBuffer pack(List<ByteBuffer> buffers, int elements, int size)
-    {
-        ByteBuffer result = ByteBuffer.allocate(2 + size);
-        result.putShort((short)elements);
-        for (ByteBuffer bb : buffers)
-        {
-            result.putShort((short)bb.remaining());
-            result.put(bb.duplicate());
-        }
-        return (ByteBuffer)result.flip();
-    }
-
-    public static ByteBuffer pack(List<ByteBuffer> buffers, int elements)
+    public static ByteBuffer pack(List<ByteBuffer> buffers, int elements, int version)
     {
         int size = 0;
         for (ByteBuffer bb : buffers)
-            size += 2 + bb.remaining();
-        return pack(buffers, elements, size);
+            size += sizeOfValue(bb, version);
+
+        ByteBuffer result = ByteBuffer.allocate(sizeOfCollectionSize(elements, version) + size);
+        writeCollectionSize(result, elements, version);
+        for (ByteBuffer bb : buffers)
+            writeValue(result, bb, version);
+        return (ByteBuffer)result.flip();
+    }
+
+    protected static void writeCollectionSize(ByteBuffer output, int elements, int version)
+    {
+        if (version >= 3)
+            output.putInt(elements);
+        else
+            output.putShort((short)elements);
+    }
+
+    public static int readCollectionSize(ByteBuffer input, int version)
+    {
+        return version >= 3 ? input.getInt() : ByteBufferUtil.readShortLength(input);
+    }
+
+    protected static int sizeOfCollectionSize(int elements, int version)
+    {
+        return version >= 3 ? 4 : 2;
+    }
+
+    protected static void writeValue(ByteBuffer output, ByteBuffer value, int version)
+    {
+        if (version >= 3)
+        {
+            if (value == null)
+            {
+                output.putInt(-1);
+                return;
+            }
+
+            output.putInt(value.remaining());
+            output.put(value.duplicate());
+        }
+        else
+        {
+            assert value != null;
+            output.putShort((short)value.remaining());
+            output.put(value.duplicate());
+        }
+    }
+
+    public static ByteBuffer readValue(ByteBuffer input, int version)
+    {
+        if (version >= 3)
+        {
+            int size = input.getInt();
+            if (size < 0)
+                return null;
+
+            return ByteBufferUtil.readBytes(input, size);
+        }
+        else
+        {
+            return ByteBufferUtil.readBytesWithShortLength(input);
+        }
+    }
+
+    protected static int sizeOfValue(ByteBuffer value, int version)
+    {
+        if (version >= 3)
+        {
+            return value == null ? 4 : 4 + value.remaining();
+        }
+        else
+        {
+            assert value != null;
+            return 2 + value.remaining();
+        }
     }
 }
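
Note: pack() and the read/write helpers above switch layouts by protocol version: v3 uses signed 32-bit counts and element lengths (with -1 marking a null element), while v2 and earlier use unsigned 16-bit values and cannot represent nulls. Below is a small, self-contained sketch of the two layouts for two short string elements; it ignores null handling and is an illustration only, not Cassandra's class.

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import java.util.List;

    public class CollectionPackSketch
    {
        // Pack element buffers behind a count header, using the version-dependent size width.
        static ByteBuffer pack(List<ByteBuffer> values, int version)
        {
            int sizeWidth = version >= 3 ? 4 : 2;
            int total = sizeWidth; // element count header
            for (ByteBuffer v : values)
                total += sizeWidth + v.remaining();

            ByteBuffer out = ByteBuffer.allocate(total);
            if (version >= 3) out.putInt(values.size()); else out.putShort((short) values.size());
            for (ByteBuffer v : values)
            {
                if (version >= 3) out.putInt(v.remaining()); else out.putShort((short) v.remaining());
                out.put(v.duplicate());
            }
            out.flip();
            return out;
        }

        public static void main(String[] args)
        {
            List<ByteBuffer> vals = Arrays.asList(
                ByteBuffer.wrap("a".getBytes(StandardCharsets.UTF_8)),
                ByteBuffer.wrap("bc".getBytes(StandardCharsets.UTF_8)));
            System.out.println("v2 packed size: " + pack(vals, 2).remaining()); // 9 bytes
            System.out.println("v3 packed size: " + pack(vals, 3).remaining()); // 15 bytes
        }
    }
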
diff --git a/src/java/org/apache/cassandra/serializers/ListSerializer.java b/src/java/org/apache/cassandra/serializers/ListSerializer.java
index 74cab7e..7387e1b 100644
--- a/src/java/org/apache/cassandra/serializers/ListSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/ListSerializer.java
@@ -47,22 +47,30 @@
         this.elements = elements;
     }
 
-    public List<T> deserialize(ByteBuffer bytes)
+    public List<ByteBuffer> serializeValues(List<T> values)
+    {
+        List<ByteBuffer> buffers = new ArrayList<>(values.size());
+        for (T value : values)
+            buffers.add(elements.serialize(value));
+        return buffers;
+    }
+
+    public int getElementCount(List<T> value)
+    {
+        return value.size();
+    }
+
+    public void validateForNativeProtocol(ByteBuffer bytes, int version)
     {
         try
         {
             ByteBuffer input = bytes.duplicate();
-            int n = ByteBufferUtil.readShortLength(input);
-            List<T> l = new ArrayList<T>(n);
+            int n = readCollectionSize(input, version);
             for (int i = 0; i < n; i++)
-            {
-                ByteBuffer databb = ByteBufferUtil.readBytesWithShortLength(input);
-                elements.validate(databb);
-                l.add(elements.deserialize(databb));
-            }
+                elements.validate(readValue(input, version));
+
             if (input.hasRemaining())
                 throw new MarshalException("Unexpected extraneous bytes after list value");
-            return l;
         }
         catch (BufferUnderflowException e)
         {
@@ -70,24 +78,37 @@
         }
     }
 
-    /**
-     * Layout is: {@code <n><s_1><b_1>...<s_n><b_n> }
-     * where:
-     *   n is the number of elements
-     *   s_i is the number of bytes composing the ith element
-     *   b_i is the s_i bytes composing the ith element
-     */
-    public ByteBuffer serialize(List<T> value)
+    public List<T> deserializeForNativeProtocol(ByteBuffer bytes, int version)
     {
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(value.size());
-        int size = 0;
-        for (T elt : value)
+        try
         {
-            ByteBuffer bb = elements.serialize(elt);
-            bbs.add(bb);
-            size += 2 + bb.remaining();
+            ByteBuffer input = bytes.duplicate();
+            int n = readCollectionSize(input, version);
+            List<T> l = new ArrayList<T>(n);
+            for (int i = 0; i < n; i++)
+            {
+                // We can have nulls in lists that are used for IN values
+                ByteBuffer databb = readValue(input, version);
+                if (databb != null)
+                {
+                    elements.validate(databb);
+                    l.add(elements.deserialize(databb));
+                }
+                else
+                {
+                    l.add(null);
+                }
+            }
+
+            if (input.hasRemaining())
+                throw new MarshalException("Unexpected extraneous bytes after list value");
+
+            return l;
         }
-        return pack(bbs, value.size(), size);
+        catch (BufferUnderflowException e)
+        {
+            throw new MarshalException("Not enough bytes to read a list");
+        }
     }
 
     public String toString(List<T> value)
diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java
index 47515a1..dadadd0 100644
--- a/src/java/org/apache/cassandra/serializers/MapSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java
@@ -51,19 +51,55 @@
         this.values = values;
     }
 
-    public Map<K, V> deserialize(ByteBuffer bytes)
+    public List<ByteBuffer> serializeValues(Map<K, V> map)
+    {
+        List<ByteBuffer> buffers = new ArrayList<>(map.size() * 2);
+        for (Map.Entry<K, V> entry : map.entrySet())
+        {
+            buffers.add(keys.serialize(entry.getKey()));
+            buffers.add(values.serialize(entry.getValue()));
+        }
+        return buffers;
+    }
+
+    public int getElementCount(Map<K, V> value)
+    {
+        return value.size();
+    }
+
+    public void validateForNativeProtocol(ByteBuffer bytes, int version)
     {
         try
         {
             ByteBuffer input = bytes.duplicate();
-            int n = ByteBufferUtil.readShortLength(input);
+            int n = readCollectionSize(input, version);
+            for (int i = 0; i < n; i++)
+            {
+                keys.validate(readValue(input, version));
+                values.validate(readValue(input, version));
+            }
+            if (input.hasRemaining())
+                throw new MarshalException("Unexpected extraneous bytes after map value");
+        }
+        catch (BufferUnderflowException e)
+        {
+            throw new MarshalException("Not enough bytes to read a set");
+        }
+    }
+
+    public Map<K, V> deserializeForNativeProtocol(ByteBuffer bytes, int version)
+    {
+        try
+        {
+            ByteBuffer input = bytes.duplicate();
+            int n = readCollectionSize(input, version);
             Map<K, V> m = new LinkedHashMap<K, V>(n);
             for (int i = 0; i < n; i++)
             {
-                ByteBuffer kbb = ByteBufferUtil.readBytesWithShortLength(input);
+                ByteBuffer kbb = readValue(input, version);
                 keys.validate(kbb);
 
-                ByteBuffer vbb = ByteBufferUtil.readBytesWithShortLength(input);
+                ByteBuffer vbb = readValue(input, version);
                 values.validate(vbb);
 
                 m.put(keys.deserialize(kbb), values.deserialize(vbb));
@@ -78,30 +114,6 @@
         }
     }
 
-    /**
-     * Layout is: {@code <n><sk_1><k_1><sv_1><v_1>...<sk_n><k_n><sv_n><v_n> }
-     * where:
-     *   n is the number of elements
-     *   sk_i is the number of bytes composing the ith key k_i
-     *   k_i is the sk_i bytes composing the ith key
-     *   sv_i is the number of bytes composing the ith value v_i
-     *   v_i is the sv_i bytes composing the ith value
-     */
-    public ByteBuffer serialize(Map<K, V> value)
-    {
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(2 * value.size());
-        int size = 0;
-        for (Map.Entry<K, V> entry : value.entrySet())
-        {
-            ByteBuffer bbk = keys.serialize(entry.getKey());
-            ByteBuffer bbv = values.serialize(entry.getValue());
-            bbs.add(bbk);
-            bbs.add(bbv);
-            size += 4 + bbk.remaining() + bbv.remaining();
-        }
-        return pack(bbs, value.size(), size);
-    }
-
     public String toString(Map<K, V> value)
     {
         StringBuilder sb = new StringBuilder();
diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java
index a6df281..de05a66 100644
--- a/src/java/org/apache/cassandra/serializers/SetSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java
@@ -47,16 +47,46 @@
         this.elements = elements;
     }
 
-    public Set<T> deserialize(ByteBuffer bytes)
+    public List<ByteBuffer> serializeValues(Set<T> values)
+    {
+        List<ByteBuffer> buffers = new ArrayList<>(values.size());
+        for (T value : values)
+            buffers.add(elements.serialize(value));
+        return buffers;
+    }
+
+    public int getElementCount(Set<T> value)
+    {
+        return value.size();
+    }
+
+    public void validateForNativeProtocol(ByteBuffer bytes, int version)
     {
         try
         {
             ByteBuffer input = bytes.duplicate();
-            int n = ByteBufferUtil.readShortLength(input);
+            int n = readCollectionSize(input, version);
+            for (int i = 0; i < n; i++)
+                elements.validate(readValue(input, version));
+            if (input.hasRemaining())
+                throw new MarshalException("Unexpected extraneous bytes after set value");
+        }
+        catch (BufferUnderflowException e)
+        {
+            throw new MarshalException("Not enough bytes to read a set");
+        }
+    }
+
+    public Set<T> deserializeForNativeProtocol(ByteBuffer bytes, int version)
+    {
+        try
+        {
+            ByteBuffer input = bytes.duplicate();
+            int n = readCollectionSize(input, version);
             Set<T> l = new LinkedHashSet<T>(n);
             for (int i = 0; i < n; i++)
             {
-                ByteBuffer databb = ByteBufferUtil.readBytesWithShortLength(input);
+                ByteBuffer databb = readValue(input, version);
                 elements.validate(databb);
                 l.add(elements.deserialize(databb));
             }
@@ -70,26 +100,6 @@
         }
     }
 
-    /**
-     * Layout is: {@code <n><s_1><b_1>...<s_n><b_n> }
-     * where:
-     *   n is the number of elements
-     *   s_i is the number of bytes composing the ith element
-     *   b_i is the s_i bytes composing the ith element
-     */
-    public ByteBuffer serialize(Set<T> value)
-    {
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(value.size());
-        int size = 0;
-        for (T elt : value)
-        {
-            ByteBuffer bb = elements.serialize(elt);
-            bbs.add(bb);
-            size += 2 + bb.remaining();
-        }
-        return pack(bbs, value.size(), size);
-    }
-
     public String toString(Set<T> value)
     {
         StringBuilder sb = new StringBuilder();
diff --git a/src/java/org/apache/cassandra/service/AbstractReadExecutor.java b/src/java/org/apache/cassandra/service/AbstractReadExecutor.java
index 3f57e73..2c3261f 100644
--- a/src/java/org/apache/cassandra/service/AbstractReadExecutor.java
+++ b/src/java/org/apache/cassandra/service/AbstractReadExecutor.java
@@ -77,12 +77,12 @@
 
     protected void makeDataRequests(Iterable<InetAddress> endpoints)
     {
+        boolean readLocal = false;
         for (InetAddress endpoint : endpoints)
         {
             if (isLocalRequest(endpoint))
             {
-                logger.trace("reading data locally");
-                StageManager.getStage(Stage.READ).execute(new LocalReadRunnable(command, handler));
+                readLocal = true;
             }
             else
             {
@@ -90,6 +90,11 @@
                 MessagingService.instance().sendRR(command.createMessage(), endpoint, handler);
             }
         }
+        if (readLocal)
+        {
+            logger.trace("reading data locally");
+            StageManager.getStage(Stage.READ).maybeExecuteImmediately(new LocalReadRunnable(command, handler));
+        }
     }
 
     protected void makeDigestRequests(Iterable<InetAddress> endpoints)
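
Note: the makeDataRequests() change above dispatches remote data requests before the local read, and submits the local read via maybeExecuteImmediately() instead of an unconditional execute(), deferring local work until the remote messages are on the wire. Below is a rough, generic sketch of an "execute inline when idle, otherwise enqueue" executor in that spirit; it is an assumed simplification, not Cassandra's actual stage executor.

    import java.util.concurrent.LinkedBlockingQueue;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    final class MaybeImmediateExecutor
    {
        private final ThreadPoolExecutor pool =
            new ThreadPoolExecutor(4, 4, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<>());

        // Run on the calling thread only when the pool has no backlog; otherwise hand off.
        void maybeExecuteImmediately(Runnable task)
        {
            if (pool.getQueue().isEmpty() && pool.getActiveCount() < pool.getMaximumPoolSize())
                task.run();
            else
                pool.execute(task);
        }
    }
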
diff --git a/src/java/org/apache/cassandra/service/AbstractRowResolver.java b/src/java/org/apache/cassandra/service/AbstractRowResolver.java
index 47a00da..fbbf473 100644
--- a/src/java/org/apache/cassandra/service/AbstractRowResolver.java
+++ b/src/java/org/apache/cassandra/service/AbstractRowResolver.java
@@ -18,9 +18,9 @@
 package org.apache.cassandra.service;
 
 import java.nio.ByteBuffer;
-import java.util.Set;
+import java.util.Queue;
+import java.util.concurrent.ConcurrentLinkedQueue;
 
-import org.cliffc.high_scale_lib.NonBlockingHashSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -34,7 +34,8 @@
     protected static final Logger logger = LoggerFactory.getLogger(AbstractRowResolver.class);
 
     protected final String keyspaceName;
-    protected final Set<MessageIn<ReadResponse>> replies = new NonBlockingHashSet<MessageIn<ReadResponse>>();
+    // CLQ gives us thread-safety without the overhead of guaranteeing uniqueness like a Set would
+    protected final Queue<MessageIn<ReadResponse>> replies = new ConcurrentLinkedQueue<>();
     protected final DecoratedKey key;
 
     public AbstractRowResolver(ByteBuffer key, String keyspaceName)
diff --git a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
index 295ed51..72e5b9c 100644
--- a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
@@ -30,7 +30,7 @@
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.net.IAsyncCallback;
 import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 public abstract class AbstractWriteResponseHandler implements IAsyncCallback
 {
@@ -44,7 +44,6 @@
     private final WriteType writeType;
 
     /**
-     * @param pendingEndpoints
      * @param callback A callback to be called when the write is successful.
      */
     protected AbstractWriteResponseHandler(Keyspace keyspace,
@@ -65,7 +64,11 @@
 
     public void get() throws WriteTimeoutException
     {
-        long timeout = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getWriteRpcTimeout()) - (System.nanoTime() - start);
+        long requestTimeout = writeType == WriteType.COUNTER
+                            ? DatabaseDescriptor.getCounterWriteRpcTimeout()
+                            : DatabaseDescriptor.getWriteRpcTimeout();
+
+        long timeout = TimeUnit.MILLISECONDS.toNanos(requestTimeout) - (System.nanoTime() - start);
 
         boolean success;
         try
@@ -83,7 +86,7 @@
             int blockedFor = totalBlockFor();
             // It's pretty unlikely, but we can race between exiting await above and here, so
             // that we could now have enough acks. In that case, we "lie" on the acks count to
-            // avoid sending confusing info to the user (see CASSANDRA-).
+            // avoid sending confusing info to the user (see CASSANDRA-6491).
             if (acks >= blockedFor)
                 acks = blockedFor - 1;
             throw new WriteTimeoutException(writeType, consistencyLevel, acks, blockedFor);
diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java
index aac9f9a..35a86f6 100644
--- a/src/java/org/apache/cassandra/service/ActiveRepairService.java
+++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java
@@ -17,27 +17,45 @@
  */
 package org.apache.cassandra.service;
 
+import java.io.File;
+import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.*;
 import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import com.google.common.collect.Multimap;
 import com.google.common.collect.Sets;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.concurrent.JMXConfigurableThreadPoolExecutor;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.net.IAsyncCallbackWithFailure;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.*;
+import org.apache.cassandra.repair.messages.AnticompactionRequest;
+import org.apache.cassandra.repair.messages.PrepareMessage;
 import org.apache.cassandra.repair.messages.RepairMessage;
 import org.apache.cassandra.repair.messages.SyncComplete;
 import org.apache.cassandra.repair.messages.ValidationComplete;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
 
 /**
  * ActiveRepairService is the starting point for manual "active" repairs.
@@ -55,9 +73,12 @@
  */
 public class ActiveRepairService
 {
+    private static final Logger logger = LoggerFactory.getLogger(ActiveRepairService.class);
     // singleton enforcement
     public static final ActiveRepairService instance = new ActiveRepairService();
 
+    public static final long UNREPAIRED_SSTABLE = 0;
+
     private static final ThreadPoolExecutor executor;
     static
     {
@@ -75,16 +96,19 @@
     }
 
     /**
-     * A map of active session.
+     * A map of active coordinator sessions.
      */
     private final ConcurrentMap<UUID, RepairSession> sessions;
 
+    private final ConcurrentMap<UUID, ParentRepairSession> parentRepairSessions;
+
     /**
      * Protected constructor. Use ActiveRepairService.instance.
      */
     protected ActiveRepairService()
     {
         sessions = new ConcurrentHashMap<>();
+        parentRepairSessions = new ConcurrentHashMap<>();
     }
 
     /**
@@ -92,9 +116,9 @@
      *
      * @return Future for asynchronous call or null if there is no need to repair
      */
-    public RepairFuture submitRepairSession(Range<Token> range, String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, String... cfnames)
+    public RepairFuture submitRepairSession(UUID parentRepairSession, Range<Token> range, String keyspace, boolean isSequential, Set<InetAddress> endpoints, String... cfnames)
     {
-        RepairSession session = new RepairSession(range, keyspace, isSequential, dataCenters, hosts, cfnames);
+        RepairSession session = new RepairSession(parentRepairSession, range, keyspace, isSequential, endpoints, cfnames);
         if (session.endpoints.isEmpty())
             return null;
         RepairFuture futureTask = new RepairFuture(session);
@@ -111,7 +135,6 @@
 
     public void removeFromActiveSessions(RepairSession session)
     {
-        FailureDetector.instance.unregisterFailureDetectionEventListener(session);
         Gossiper.instance.unregister(session);
         sessions.remove(session.getId());
     }
@@ -122,13 +145,16 @@
         {
             session.forceShutdown();
         }
+        parentRepairSessions.clear();
     }
 
     // for testing only. Create a session corresponding to a fake request and
     // add it to the sessions (avoid NPE in tests)
     RepairFuture submitArtificialRepairSession(RepairJobDesc desc)
     {
-        RepairSession session = new RepairSession(desc.sessionId, desc.range, desc.keyspace, false, null, null, new String[]{desc.columnFamily});
+        Set<InetAddress> neighbours = new HashSet<>();
+        neighbours.addAll(ActiveRepairService.getNeighbors(desc.keyspace, desc.range, null, null));
+        RepairSession session = new RepairSession(desc.parentSessionId, desc.sessionId, desc.range, desc.keyspace, false, neighbours, new String[]{desc.columnFamily});
         sessions.put(session.getId(), session);
         RepairFuture futureTask = new RepairFuture(session);
         executor.execute(futureTask);
@@ -215,6 +241,136 @@
         return neighbors;
     }
 
+    public UUID prepareForRepair(Set<InetAddress> endpoints, Collection<Range<Token>> ranges, List<ColumnFamilyStore> columnFamilyStores)
+    {
+        UUID parentRepairSession = UUIDGen.getTimeUUID();
+        registerParentRepairSession(parentRepairSession, columnFamilyStores, ranges);
+        final CountDownLatch prepareLatch = new CountDownLatch(endpoints.size());
+        final AtomicBoolean status = new AtomicBoolean(true);
+        IAsyncCallbackWithFailure callback = new IAsyncCallbackWithFailure()
+        {
+            public void response(MessageIn msg)
+            {
+                prepareLatch.countDown();
+            }
+
+            public boolean isLatencyForSnitch()
+            {
+                return false;
+            }
+
+            public void onFailure(InetAddress from)
+            {
+                status.set(false);
+                prepareLatch.countDown();
+            }
+        };
+
+        List<UUID> cfIds = new ArrayList<>(columnFamilyStores.size());
+        for (ColumnFamilyStore cfs : columnFamilyStores)
+            cfIds.add(cfs.metadata.cfId);
+
+        for(InetAddress neighbour : endpoints)
+        {
+            PrepareMessage message = new PrepareMessage(parentRepairSession, cfIds, ranges);
+            MessageOut<RepairMessage> msg = message.createMessage();
+            MessagingService.instance().sendRRWithFailure(msg, neighbour, callback);
+        }
+        try
+        {
+            prepareLatch.await(1, TimeUnit.HOURS);
+        }
+        catch (InterruptedException e)
+        {
+            parentRepairSessions.remove(parentRepairSession);
+            throw new RuntimeException("Did not get replies from all endpoints.", e);
+        }
+
+        if (!status.get())
+        {
+            parentRepairSessions.remove(parentRepairSession);
+            throw new RuntimeException("Did not get positive replies from all endpoints.");
+        }
+
+        return parentRepairSession;
+    }
+
+    public void registerParentRepairSession(UUID parentRepairSession, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges)
+    {
+        Map<UUID, Set<SSTableReader>> sstablesToRepair = new HashMap<>();
+        for (ColumnFamilyStore cfs : columnFamilyStores)
+        {
+            Set<SSTableReader> sstables = new HashSet<>();
+            for (SSTableReader sstable : cfs.getSSTables())
+            {
+                if (new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(ranges))
+                {
+                    if (!sstable.isRepaired())
+                    {
+                        sstables.add(sstable);
+                    }
+                }
+            }
+            sstablesToRepair.put(cfs.metadata.cfId, sstables);
+        }
+        parentRepairSessions.put(parentRepairSession, new ParentRepairSession(columnFamilyStores, ranges, sstablesToRepair, System.currentTimeMillis()));
+    }
+
+    public void finishParentSession(UUID parentSession, Set<InetAddress> neighbors, boolean doAntiCompaction) throws InterruptedException, ExecutionException, IOException
+    {
+        try
+        {
+            if (doAntiCompaction)
+            {
+                for (InetAddress neighbor : neighbors)
+                {
+                    AnticompactionRequest acr = new AnticompactionRequest(parentSession);
+                    MessageOut<RepairMessage> req = acr.createMessage();
+                    MessagingService.instance().sendOneWay(req, neighbor);
+                }
+                List<Future<?>> futures = doAntiCompaction(parentSession);
+                FBUtilities.waitOnFutures(futures);
+            }
+        }
+        finally
+        {
+            parentRepairSessions.remove(parentSession);
+        }
+    }
+
+    public ParentRepairSession getParentRepairSession(UUID parentSessionId)
+    {
+        return parentRepairSessions.get(parentSessionId);
+    }
+
+    public List<Future<?>> doAntiCompaction(UUID parentRepairSession) throws InterruptedException, ExecutionException, IOException
+    {
+        assert parentRepairSession != null;
+        ParentRepairSession prs = getParentRepairSession(parentRepairSession);
+
+        List<Future<?>> futures = new ArrayList<>();
+        for (Map.Entry<UUID, ColumnFamilyStore> columnFamilyStoreEntry : prs.columnFamilyStores.entrySet())
+        {
+
+            Collection<SSTableReader> sstables = new HashSet<>(prs.getAndReferenceSSTables(columnFamilyStoreEntry.getKey()));
+            ColumnFamilyStore cfs = columnFamilyStoreEntry.getValue();
+            boolean success = false;
+            while (!success)
+            {
+                for (SSTableReader compactingSSTable : cfs.getDataTracker().getCompacting())
+                {
+                    if (sstables.remove(compactingSSTable))
+                        SSTableReader.releaseReferences(Arrays.asList(compactingSSTable));
+                }
+                success = sstables.isEmpty() || cfs.getDataTracker().markCompacting(sstables);
+            }
+
+            futures.add(CompactionManager.instance.submitAntiCompaction(cfs, prs.ranges, sstables, prs.repairedAt));
+        }
+
+        return futures;
+    }
+
     public void handleMessage(InetAddress endpoint, RepairMessage message)
     {
         RepairJobDesc desc = message.desc;
@@ -236,4 +392,41 @@
                 break;
         }
     }
+
+    public static class ParentRepairSession
+    {
+        public final Map<UUID, ColumnFamilyStore> columnFamilyStores = new HashMap<>();
+        public final Collection<Range<Token>> ranges;
+        public final Map<UUID, Set<SSTableReader>> sstableMap;
+        public final long repairedAt;
+
+        public ParentRepairSession(List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, Map<UUID, Set<SSTableReader>> sstables, long repairedAt)
+        {
+            for (ColumnFamilyStore cfs : columnFamilyStores)
+                this.columnFamilyStores.put(cfs.metadata.cfId, cfs);
+            this.ranges = ranges;
+            this.sstableMap = sstables;
+            this.repairedAt = repairedAt;
+        }
+
+        public Collection<SSTableReader> getAndReferenceSSTables(UUID cfId)
+        {
+            Set<SSTableReader> sstables = sstableMap.get(cfId);
+            Iterator<SSTableReader> sstableIterator = sstables.iterator();
+            while (sstableIterator.hasNext())
+            {
+                SSTableReader sstable = sstableIterator.next();
+                if (!new File(sstable.descriptor.filenameFor(Component.DATA)).exists())
+                {
+                    sstableIterator.remove();
+                }
+                else
+                {
+                    if (!sstable.acquireReference())
+                        sstableIterator.remove();
+                }
+            }
+            return sstables;
+        }
+    }
 }
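
The prepareForRepair() path added above fans a PrepareMessage out to every neighbour and then blocks on a latch until all endpoints ack, flipping a shared flag if any of them reports failure. Stripped of the messaging details, the pattern is roughly the sketch below; the Sender interface is an illustrative stand-in for MessagingService.sendRRWithFailure(), and imports from java.util.concurrent are assumed:

    // Minimal sketch of the ack-gathering pattern; names are illustrative, not the real API.
    interface Sender { void send(InetAddress to, Runnable onOk, Runnable onFail); }

    static void awaitPrepareAcks(Set<InetAddress> endpoints, Sender sender) throws InterruptedException
    {
        final CountDownLatch latch = new CountDownLatch(endpoints.size());
        final AtomicBoolean allOk = new AtomicBoolean(true);
        for (InetAddress endpoint : endpoints)
            sender.send(endpoint,
                        new Runnable() { public void run() { latch.countDown(); } },
                        new Runnable() { public void run() { allOk.set(false); latch.countDown(); } });
        if (!latch.await(1, TimeUnit.HOURS) || !allOk.get())
            throw new RuntimeException("Did not get positive replies from all endpoints.");
    }
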
diff --git a/src/java/org/apache/cassandra/service/BatchlogEndpointSelector.java b/src/java/org/apache/cassandra/service/BatchlogEndpointSelector.java
deleted file mode 100644
index bf032f5..0000000
--- a/src/java/org/apache/cassandra/service/BatchlogEndpointSelector.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-
-import java.net.InetAddress;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.cassandra.gms.FailureDetector;
-import org.apache.cassandra.utils.FBUtilities;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.ListMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Multimap;
-
-public class BatchlogEndpointSelector
-{
-    private final String localRack;
-    
-    public BatchlogEndpointSelector(String localRack)
-    {
-        this.localRack = localRack;
-    }
-
-    /**
-     * @param endpoints nodes in the local datacenter, grouped by rack name
-     * @return list of candidates for batchlog hosting.  if possible these will be two nodes from different racks.
-     */
-    public Collection<InetAddress> chooseEndpoints(Multimap<String, InetAddress> endpoints)
-    {
-        // strip out dead endpoints and localhost
-        ListMultimap<String, InetAddress> validated = ArrayListMultimap.create();
-        for (Map.Entry<String, InetAddress> entry : endpoints.entries())
-        {
-            if (isValid(entry.getValue()))
-                validated.put(entry.getKey(), entry.getValue());
-        }
-        if (validated.size() <= 2)
-            return validated.values();
-
-        if ((validated.size() - validated.get(localRack).size()) >= 2)
-        {
-            // we have enough endpoints in other racks
-            validated.removeAll(localRack);
-        }
-
-        if (validated.keySet().size() == 1)
-        {
-            // we have only 1 `other` rack
-            Collection<InetAddress> otherRack = Iterables.getOnlyElement(validated.asMap().values());
-            return Lists.newArrayList(Iterables.limit(otherRack, 2));
-        }
-
-        // randomize which racks we pick from if more than 2 remaining
-        Collection<String> racks;
-        if (validated.keySet().size() == 2)
-        {
-            racks = validated.keySet();
-        }
-        else
-        {
-            racks = Lists.newArrayList(validated.keySet());
-            Collections.shuffle((List) racks);
-        }
-
-        // grab a random member of up to two racks
-        List<InetAddress> result = new ArrayList<>(2);
-        for (String rack : Iterables.limit(racks, 2))
-        {
-            List<InetAddress> rackMembers = validated.get(rack);
-            result.add(rackMembers.get(getRandomInt(rackMembers.size())));
-        }
-
-        return result;
-    }
-    
-    @VisibleForTesting
-    protected boolean isValid(InetAddress input)
-    {
-        return !input.equals(FBUtilities.getBroadcastAddress()) && FailureDetector.instance.isAlive(input);
-    }
-    
-    @VisibleForTesting
-    protected int getRandomInt(int bound)
-    {
-        return FBUtilities.threadLocalRandom().nextInt(bound);
-    }
-}
diff --git a/src/java/org/apache/cassandra/service/CASConditions.java b/src/java/org/apache/cassandra/service/CASRequest.java
similarity index 81%
rename from src/java/org/apache/cassandra/service/CASConditions.java
rename to src/java/org/apache/cassandra/service/CASRequest.java
index c0a2111..3d86637 100644
--- a/src/java/org/apache/cassandra/service/CASConditions.java
+++ b/src/java/org/apache/cassandra/service/CASRequest.java
@@ -22,9 +22,9 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
- * Abstract the conditions to be fulfilled by a CAS operation.
+ * Abstract the conditions and updates for a CAS operation.
  */
-public interface CASConditions
+public interface CASRequest
 {
     /**
      * The filter to use to fetch the value to compare for the CAS.
@@ -36,4 +36,10 @@
      * readFilter(), match the CAS conditions this object stands for.
      */
     public boolean appliesTo(ColumnFamily current) throws InvalidRequestException;
+
+    /**
+     * The updates to perform on a CAS success. The value fetched using readFilter()
+     * is passed as the argument.
+     */
+    public ColumnFamily makeUpdates(ColumnFamily current) throws InvalidRequestException;
 }
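
For illustration only, a hypothetical CASRequest implementation for "set this cell to newValue only if it currently equals expected" could look roughly like the sketch below; filterFor() and emptyUpdateFor() are assumed helpers, and this is not the real CQL3 implementation:

    // Hypothetical sketch of a CASRequest implementation; helper methods are assumptions.
    class CompareAndSetCell implements CASRequest
    {
        private final CellName name;
        private final ByteBuffer expected;
        private final ByteBuffer newValue;

        CompareAndSetCell(CellName name, ByteBuffer expected, ByteBuffer newValue)
        {
            this.name = name;
            this.expected = expected;
            this.newValue = newValue;
        }

        public IDiskAtomFilter readFilter()
        {
            return filterFor(name);                          // assumed helper building a names filter
        }

        public boolean appliesTo(ColumnFamily current)
        {
            Cell cell = current == null ? null : current.getColumn(name);
            return cell != null && cell.value().equals(expected);
        }

        public ColumnFamily makeUpdates(ColumnFamily current)
        {
            ColumnFamily updates = emptyUpdateFor(current);  // assumed helper creating an empty update CF
            updates.addColumn(name, newValue, System.currentTimeMillis() * 1000);
            return updates;
        }
    }
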
diff --git a/src/java/org/apache/cassandra/service/CacheService.java b/src/java/org/apache/cassandra/service/CacheService.java
index f626e17..1b93c2c 100644
--- a/src/java/org/apache/cassandra/service/CacheService.java
+++ b/src/java/org/apache/cassandra/service/CacheService.java
@@ -18,13 +18,14 @@
 package org.apache.cassandra.service;
 
 import java.io.DataInputStream;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
+import java.util.UUID;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
@@ -39,14 +40,16 @@
 import org.apache.cassandra.cache.AutoSavingCache.CacheSerializer;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
@@ -60,7 +63,8 @@
     public static enum CacheType
     {
         KEY_CACHE("KeyCache"),
-        ROW_CACHE("RowCache");
+        ROW_CACHE("RowCache"),
+        COUNTER_CACHE("CounterCache");
 
         private final String name;
 
@@ -79,6 +83,7 @@
 
     public final AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache;
     public final AutoSavingCache<RowCacheKey, IRowCacheEntry> rowCache;
+    public final AutoSavingCache<CounterCacheKey, ClockAndCount> counterCache;
 
     private CacheService()
     {
@@ -95,6 +100,7 @@
 
         keyCache = initKeyCache();
         rowCache = initRowCache();
+        counterCache = initCounterCache();
     }
 
     /**
@@ -110,14 +116,10 @@
         // where 48 = 40 bytes (average size of the key) + 8 bytes (size of value)
         ICache<KeyCacheKey, RowIndexEntry> kc;
         kc = ConcurrentLinkedHashCache.create(keyCacheInMemoryCapacity);
-        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = new AutoSavingCache<KeyCacheKey, RowIndexEntry>(kc, CacheType.KEY_CACHE, new KeyCacheSerializer());
+        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = new AutoSavingCache<>(kc, CacheType.KEY_CACHE, new KeyCacheSerializer());
 
         int keyCacheKeysToSave = DatabaseDescriptor.getKeyCacheKeysToSave();
 
-        logger.info("Scheduling key cache save to each {} seconds (going to save {} keys).",
-                DatabaseDescriptor.getKeyCacheSavePeriod(),
-                    keyCacheKeysToSave == Integer.MAX_VALUE ? "all" : keyCacheKeysToSave);
-
         keyCache.scheduleSaving(DatabaseDescriptor.getKeyCacheSavePeriod(), keyCacheKeysToSave);
 
         return keyCache;
@@ -134,19 +136,37 @@
 
         // cache object
         ICache<RowCacheKey, IRowCacheEntry> rc = new SerializingCacheProvider().create(rowCacheInMemoryCapacity);
-        AutoSavingCache<RowCacheKey, IRowCacheEntry> rowCache = new AutoSavingCache<RowCacheKey, IRowCacheEntry>(rc, CacheType.ROW_CACHE, new RowCacheSerializer());
+        AutoSavingCache<RowCacheKey, IRowCacheEntry> rowCache = new AutoSavingCache<>(rc, CacheType.ROW_CACHE, new RowCacheSerializer());
 
         int rowCacheKeysToSave = DatabaseDescriptor.getRowCacheKeysToSave();
 
-        logger.info("Scheduling row cache save to each {} seconds (going to save {} keys).",
-                DatabaseDescriptor.getRowCacheSavePeriod(),
-                    rowCacheKeysToSave == Integer.MAX_VALUE ? "all" : rowCacheKeysToSave);
-
         rowCache.scheduleSaving(DatabaseDescriptor.getRowCacheSavePeriod(), rowCacheKeysToSave);
 
         return rowCache;
     }
 
+    private AutoSavingCache<CounterCacheKey, ClockAndCount> initCounterCache()
+    {
+        logger.info("Initializing counter cache with capacity of {} MBs", DatabaseDescriptor.getCounterCacheSizeInMB());
+
+        long capacity = DatabaseDescriptor.getCounterCacheSizeInMB() * 1024 * 1024;
+
+        AutoSavingCache<CounterCacheKey, ClockAndCount> cache =
+            new AutoSavingCache<>(ConcurrentLinkedHashCache.<CounterCacheKey, ClockAndCount>create(capacity),
+                                  CacheType.COUNTER_CACHE,
+                                  new CounterCacheSerializer());
+
+        int keysToSave = DatabaseDescriptor.getCounterCacheKeysToSave();
+
+        logger.info("Scheduling counter cache save to every {} seconds (going to save {} keys).",
+                    DatabaseDescriptor.getCounterCacheSavePeriod(),
+                    keysToSave == Integer.MAX_VALUE ? "all" : keysToSave);
+
+        cache.scheduleSaving(DatabaseDescriptor.getCounterCacheSavePeriod(), keysToSave);
+
+        return cache;
+    }
+
     public long getKeyCacheHits()
     {
         return keyCache.getMetrics().hits.count();
@@ -205,6 +225,20 @@
         keyCache.scheduleSaving(seconds, DatabaseDescriptor.getKeyCacheKeysToSave());
     }
 
+    public int getCounterCacheSavePeriodInSeconds()
+    {
+        return DatabaseDescriptor.getCounterCacheSavePeriod();
+    }
+
+    public void setCounterCacheSavePeriodInSeconds(int seconds)
+    {
+        if (seconds < 0)
+            throw new RuntimeException("CounterCacheSavePeriodInSeconds must be non-negative.");
+
+        DatabaseDescriptor.setCounterCacheSavePeriod(seconds);
+        counterCache.scheduleSaving(seconds, DatabaseDescriptor.getCounterCacheKeysToSave());
+    }
+
     public int getRowCacheKeysToSave()
     {
         return DatabaseDescriptor.getRowCacheKeysToSave();
@@ -231,16 +265,67 @@
         keyCache.scheduleSaving(getKeyCacheSavePeriodInSeconds(), count);
     }
 
+    public int getCounterCacheKeysToSave()
+    {
+        return DatabaseDescriptor.getCounterCacheKeysToSave();
+    }
+
+    public void setCounterCacheKeysToSave(int count)
+    {
+        if (count < 0)
+            throw new RuntimeException("CounterCacheKeysToSave must be non-negative.");
+        DatabaseDescriptor.setCounterCacheKeysToSave(count);
+        counterCache.scheduleSaving(getCounterCacheSavePeriodInSeconds(), count);
+    }
+
     public void invalidateKeyCache()
     {
         keyCache.clear();
     }
 
+    public void invalidateKeyCacheForCf(UUID cfId)
+    {
+        Iterator<KeyCacheKey> keyCacheIterator = keyCache.getKeySet().iterator();
+        while (keyCacheIterator.hasNext())
+        {
+            KeyCacheKey key = keyCacheIterator.next();
+            if (key.cfId.equals(cfId))
+                keyCacheIterator.remove();
+        }
+    }
+
     public void invalidateRowCache()
     {
         rowCache.clear();
     }
 
+    public void invalidateRowCacheForCf(UUID cfId)
+    {
+        Iterator<RowCacheKey> rowCacheIterator = rowCache.getKeySet().iterator();
+        while (rowCacheIterator.hasNext())
+        {
+            RowCacheKey rowCacheKey = rowCacheIterator.next();
+            if (rowCacheKey.cfId.equals(cfId))
+                rowCacheIterator.remove();
+        }
+    }
+
+    public void invalidateCounterCacheForCf(UUID cfId)
+    {
+        Iterator<CounterCacheKey> counterCacheIterator = counterCache.getKeySet().iterator();
+        while (counterCacheIterator.hasNext())
+        {
+            CounterCacheKey counterCacheKey = counterCacheIterator.next();
+            if (counterCacheKey.cfId.equals(cfId))
+                counterCacheIterator.remove();
+        }
+    }
+
+    public void invalidateCounterCache()
+    {
+        counterCache.clear();
+    }
+
     public long getRowCacheCapacityInBytes()
     {
         return rowCache.getMetrics().capacity.value();
@@ -277,6 +362,14 @@
         keyCache.setCapacity(capacity * 1024 * 1024);
     }
 
+    public void setCounterCacheCapacityInMB(long capacity)
+    {
+        if (capacity < 0)
+            throw new RuntimeException("capacity should not be negative.");
+
+        counterCache.setCapacity(capacity * 1024 * 1024);
+    }
+
     public long getRowCacheSize()
     {
         return rowCache.getMetrics().size.value();
@@ -299,19 +392,54 @@
 
     public void saveCaches() throws ExecutionException, InterruptedException
     {
-        List<Future<?>> futures = new ArrayList<Future<?>>(2);
+        List<Future<?>> futures = new ArrayList<>(3);
         logger.debug("submitting cache saves");
 
         futures.add(keyCache.submitWrite(DatabaseDescriptor.getKeyCacheKeysToSave()));
         futures.add(rowCache.submitWrite(DatabaseDescriptor.getRowCacheKeysToSave()));
+        futures.add(counterCache.submitWrite(DatabaseDescriptor.getCounterCacheKeysToSave()));
 
         FBUtilities.waitOnFutures(futures);
         logger.debug("cache saves completed");
     }
 
-    public class RowCacheSerializer implements CacheSerializer<RowCacheKey, IRowCacheEntry>
+    public static class CounterCacheSerializer implements CacheSerializer<CounterCacheKey, ClockAndCount>
     {
-        public void serialize(RowCacheKey key, DataOutput out) throws IOException
+        public void serialize(CounterCacheKey key, DataOutputPlus out) throws IOException
+        {
+            ByteBufferUtil.writeWithLength(key.partitionKey, out);
+            ByteBufferUtil.writeWithLength(key.cellName, out);
+        }
+
+        public Future<Pair<CounterCacheKey, ClockAndCount>> deserialize(DataInputStream in, final ColumnFamilyStore cfs) throws IOException
+        {
+            final ByteBuffer partitionKey = ByteBufferUtil.readWithLength(in);
+            final CellName cellName = cfs.metadata.comparator.cellFromByteBuffer(ByteBufferUtil.readWithLength(in));
+            return StageManager.getStage(Stage.READ).submit(new Callable<Pair<CounterCacheKey, ClockAndCount>>()
+            {
+                public Pair<CounterCacheKey, ClockAndCount> call() throws Exception
+                {
+                    DecoratedKey key = cfs.partitioner.decorateKey(partitionKey);
+                    QueryFilter filter = QueryFilter.getNamesFilter(key,
+                                                                    cfs.metadata.cfName,
+                                                                    FBUtilities.singleton(cellName, cfs.metadata.comparator),
+                                                                    Long.MIN_VALUE);
+                    ColumnFamily cf = cfs.getTopLevelColumns(filter, Integer.MIN_VALUE);
+                    if (cf == null)
+                        return null;
+                    Cell cell = cf.getColumn(cellName);
+                    if (cell == null || !cell.isLive(Long.MIN_VALUE))
+                        return null;
+                    ClockAndCount clockAndCount = CounterContext.instance().getLocalClockAndCount(cell.value());
+                    return Pair.create(CounterCacheKey.create(cfs.metadata.cfId, partitionKey, cellName), clockAndCount);
+                }
+            });
+        }
+    }
+
+    public static class RowCacheSerializer implements CacheSerializer<RowCacheKey, IRowCacheEntry>
+    {
+        public void serialize(RowCacheKey key, DataOutputPlus out) throws IOException
         {
             ByteBufferUtil.writeWithLength(key.key, out);
         }
@@ -324,16 +452,17 @@
                 public Pair<RowCacheKey, IRowCacheEntry> call() throws Exception
                 {
                     DecoratedKey key = cfs.partitioner.decorateKey(buffer);
-                    ColumnFamily data = cfs.getTopLevelColumns(QueryFilter.getIdentityFilter(key, cfs.name, Long.MIN_VALUE), Integer.MIN_VALUE);
+                    QueryFilter cacheFilter = new QueryFilter(key, cfs.getColumnFamilyName(), cfs.readFilterForCache(), Integer.MIN_VALUE);
+                    ColumnFamily data = cfs.getTopLevelColumns(cacheFilter, Integer.MIN_VALUE);
                     return Pair.create(new RowCacheKey(cfs.metadata.cfId, key), (IRowCacheEntry) data);
                 }
             });
         }
     }
 
-    public class KeyCacheSerializer implements CacheSerializer<KeyCacheKey, RowIndexEntry>
+    public static class KeyCacheSerializer implements CacheSerializer<KeyCacheKey, RowIndexEntry>
     {
-        public void serialize(KeyCacheKey key, DataOutput out) throws IOException
+        public void serialize(KeyCacheKey key, DataOutputPlus out) throws IOException
         {
             RowIndexEntry entry = CacheService.instance.keyCache.get(key);
             if (entry == null)
@@ -342,7 +471,8 @@
             Descriptor desc = key.desc;
             out.writeInt(desc.generation);
             out.writeBoolean(true);
-            RowIndexEntry.serializer.serialize(entry, out);
+            CFMetaData cfm = Schema.instance.getCFMetaData(key.desc.ksname, key.desc.cfname);
+            cfm.comparator.rowIndexEntrySerializer().serialize(entry, out);
         }
 
         public Future<Pair<KeyCacheKey, RowIndexEntry>> deserialize(DataInputStream input, ColumnFamilyStore cfs) throws IOException
@@ -359,11 +489,11 @@
             input.readBoolean(); // backwards compatibility for "promoted indexes" boolean
             if (reader == null)
             {
-                RowIndexEntry.serializer.skipPromotedIndex(input);
+                RowIndexEntry.Serializer.skipPromotedIndex(input);
                 return null;
             }
-            RowIndexEntry entry = RowIndexEntry.serializer.deserialize(input, reader.descriptor.version);
-            return Futures.immediateFuture(Pair.create(new KeyCacheKey(reader.descriptor, key), entry));
+            RowIndexEntry entry = reader.metadata.comparator.rowIndexEntrySerializer().deserialize(input, reader.descriptor.version);
+            return Futures.immediateFuture(Pair.create(new KeyCacheKey(cfs.metadata.cfId, reader.descriptor, key), entry));
         }
 
         private SSTableReader findDesc(int generation, Collection<SSTableReader> collection)
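
The counter cache introduced above keeps, per counter cell, the latest locally written clock and count so that a counter update can often skip the read-before-write. A rough usage sketch, stated as an assumption about the write path (applyDelta() and readFromDisk() are hypothetical placeholders):

    // Sketch (assumption): consulting the counter cache before applying a counter delta.
    CounterCacheKey key = CounterCacheKey.create(cfs.metadata.cfId, partitionKey, cellName);
    ClockAndCount cached = CacheService.instance.counterCache.get(key);
    if (cached != null)
        applyDelta(cached.clock + 1L, cached.count + delta);   // hypothetical helper
    else
        readFromDisk(partitionKey, cellName);                  // hypothetical fallback to an sstable read
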
diff --git a/src/java/org/apache/cassandra/service/CacheServiceMBean.java b/src/java/org/apache/cassandra/service/CacheServiceMBean.java
index 6d93f95..28e9d3b 100644
--- a/src/java/org/apache/cassandra/service/CacheServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/CacheServiceMBean.java
@@ -27,12 +27,18 @@
     public int getKeyCacheSavePeriodInSeconds();
     public void setKeyCacheSavePeriodInSeconds(int kcspis);
 
+    public int getCounterCacheSavePeriodInSeconds();
+    public void setCounterCacheSavePeriodInSeconds(int ccspis);
+
     public int getRowCacheKeysToSave();
     public void setRowCacheKeysToSave(int rckts);
 
     public int getKeyCacheKeysToSave();
     public void setKeyCacheKeysToSave(int kckts);
 
+    public int getCounterCacheKeysToSave();
+    public void setCounterCacheKeysToSave(int cckts);
+
     /**
      * invalidate the key cache; for use after invalidating row cache
      */
@@ -43,10 +49,14 @@
      */
     public void invalidateRowCache();
 
+    public void invalidateCounterCache();
+
     public void setRowCacheCapacityInMB(long capacity);
 
     public void setKeyCacheCapacityInMB(long capacity);
 
+    public void setCounterCacheCapacityInMB(long capacity);
+
     /**
      * save row and key caches
      *
diff --git a/src/java/org/apache/cassandra/service/CassandraDaemon.java b/src/java/org/apache/cassandra/service/CassandraDaemon.java
index 89d2bb0..e68dc26 100644
--- a/src/java/org/apache/cassandra/service/CassandraDaemon.java
+++ b/src/java/org/apache/cassandra/service/CassandraDaemon.java
@@ -22,8 +22,7 @@
 import java.lang.management.ManagementFactory;
 import java.lang.management.MemoryPoolMXBean;
 import java.net.InetAddress;
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.UnknownHostException;
 import java.util.Arrays;
 import java.util.Map;
 import java.util.UUID;
@@ -32,16 +31,12 @@
 import javax.management.ObjectName;
 import javax.management.StandardMBean;
 
-import com.addthis.metrics.reporter.config.ReporterConfig;
-
 import com.google.common.collect.Iterables;
 import com.google.common.util.concurrent.Uninterruptibles;
-
-import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.log4j.PropertyConfigurator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.addthis.metrics.reporter.config.ReporterConfig;
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
@@ -51,21 +46,17 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.MeteredFlusher;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.metrics.StorageMetrics;
 import org.apache.cassandra.thrift.ThriftServer;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.CLibrary;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Mx4jTool;
-import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.*;
 
 /**
  * The <code>CassandraDaemon</code> is an abstraction for a Cassandra daemon
@@ -76,65 +67,6 @@
 public class CassandraDaemon
 {
     public static final String MBEAN_NAME = "org.apache.cassandra.db:type=NativeAccess";
-    
-    static
-    {
-        initLog4j();
-    }
-
-    // Have a dedicated thread to call exit to avoid deadlock in the case where the thread that wants to invoke exit
-    // belongs to an executor that our shutdown hook wants to wait to exit gracefully. See CASSANDRA-5273.
-    private static final Thread exitThread = new Thread(new Runnable()
-    {
-        public void run()
-        {
-            System.exit(100);
-        }
-    }, "Exit invoker");
-
-    /**
-     * Initialize logging in such a way that it checks for config changes every 10 seconds.
-     */
-    public static void initLog4j()
-    {
-        if (System.getProperty("log4j.defaultInitOverride","false").equalsIgnoreCase("true"))
-        {
-            String config = System.getProperty("log4j.configuration", "log4j-server.properties");
-            URL configLocation = null;
-            try
-            {
-                // try loading from a physical location first.
-                configLocation = new URL(config);
-            }
-            catch (MalformedURLException ex)
-            {
-                // then try loading from the classpath.
-                configLocation = CassandraDaemon.class.getClassLoader().getResource(config);
-            }
-
-            if (configLocation == null)
-                throw new RuntimeException("Couldn't figure out log4j configuration: "+config);
-
-            // Now convert URL to a filename
-            String configFileName = null;
-            try
-            {
-                // first try URL.getFile() which works for opaque URLs (file:foo) and paths without spaces
-                configFileName = configLocation.getFile();
-                File configFile = new File(configFileName);
-                // then try alternative approach which works for all hierarchical URLs with or without spaces
-                if (!configFile.exists())
-                    configFileName = new File(configLocation.toURI()).getPath();
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException("Couldn't convert log4j configuration location to a valid file", e);
-            }
-
-            PropertyConfigurator.configureAndWatch(configFileName, 10000);
-            org.apache.log4j.Logger.getLogger(CassandraDaemon.class).info("Logging initialized");
-        }
-    }
 
     private static final Logger logger = LoggerFactory.getLogger(CassandraDaemon.class);
 
@@ -152,6 +84,14 @@
      */
     protected void setup()
     {
+        try 
+        {
+            logger.info("Hostname: {}", InetAddress.getLocalHost().getHostName());
+        }
+        catch (UnknownHostException e1)
+        {
+            logger.info("Could not resolve local host");
+        }
         // log warnings for different kinds of sub-optimal JVMs.  tldr use 64-bit Oracle >= 1.6u32
         if (!DatabaseDescriptor.hasLargeAddressSpace())
             logger.info("32bit JVM detected.  It is recommended to run Cassandra on a 64bit JVM for better performance.");
@@ -189,6 +129,20 @@
         for(MemoryPoolMXBean pool: ManagementFactory.getMemoryPoolMXBeans())
             logger.info("{} {}: {}", pool.getName(), pool.getType(), pool.getPeakUsage());
         logger.info("Classpath: {}", System.getProperty("java.class.path"));
+
+        // Fail-fast if JNA is not available or failing to initialize properly
+        // except with -Dcassandra.boot_without_jna=true. See CASSANDRA-6575.
+        if (!CLibrary.jnaAvailable())
+        {
+            boolean jnaRequired = !Boolean.getBoolean("cassandra.boot_without_jna");
+
+            if (jnaRequired)
+            {
+                logger.error("JNA failing to initialize properly. Use -Dcassandra.boot_without_jna=true to bootstrap even so.");
+                System.exit(3);
+            }
+        }
+
         CLibrary.tryMlockall();
 
         Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler()
@@ -196,18 +150,16 @@
             public void uncaughtException(Thread t, Throwable e)
             {
                 StorageMetrics.exceptions.inc();
-                logger.error("Exception in thread " + t, e);
-                Tracing.trace("Exception in thread " + t, e);
+                logger.error("Exception in thread {}", t, e);
+                Tracing.trace("Exception in thread {}", t, e);
                 for (Throwable e2 = e; e2 != null; e2 = e2.getCause())
                 {
-                    // some code, like FileChannel.map, will wrap an OutOfMemoryError in another exception
-                    if (e2 instanceof OutOfMemoryError)
-                        exitThread.start();
+                    JVMStabilityInspector.inspectThrowable(e2);
 
                     if (e2 instanceof FSError)
                     {
                         if (e2 != e) // make sure FSError gets logged exactly once.
-                            logger.error("Exception in thread " + t, e2);
+                            logger.error("Exception in thread {}", t, e2);
                         FileUtils.handleFSError((FSError) e2);
                     }
 
@@ -229,6 +181,7 @@
         {
             logger.debug("Checking directory {}", dataDir);
             File dir = new File(dataDir);
+
             // check that directories exist.
             if (!dir.exists())
             {
@@ -241,7 +194,7 @@
                 }
             }
             // if directories exist verify their permissions
-            if (!Directories.hasFullPermissions(dir, dataDir))
+            if (!Directories.verifyFullPermissions(dir, dataDir))
             {
                 // if permissions aren't sufficient, stop cassandra.
                 System.exit(3);
@@ -255,7 +208,7 @@
         // we do a one-off scrub of the system keyspace first; we can't load the list of the rest of the keyspaces,
         // until system keyspace is opened.
         for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(Keyspace.SYSTEM_KS).values())
-            ColumnFamilyStore.scrubDataDirectories(Keyspace.SYSTEM_KS, cfm.cfName);
+            ColumnFamilyStore.scrubDataDirectories(cfm);
         try
         {
             SystemKeyspace.checkHealth();
@@ -269,20 +222,15 @@
         // load keyspace descriptions.
         DatabaseDescriptor.loadSchemas();
 
-        try
-        {
-            LeveledManifest.maybeMigrateManifests();
-        }
-        catch(IOException e)
-        {
-            logger.error("Could not migrate old leveled manifest. Move away the .json file in the data directory", e);
-            System.exit(100);
-        }
-
         // clean up compaction leftovers
         Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = SystemKeyspace.getUnfinishedCompactions();
         for (Pair<String, String> kscf : unfinishedCompactions.keySet())
-            ColumnFamilyStore.removeUnfinishedCompactionLeftovers(kscf.left, kscf.right, unfinishedCompactions.get(kscf));
+        {
+            CFMetaData cfm = Schema.instance.getCFMetaData(kscf.left, kscf.right);
+            // CFMetaData can be null if CF is already dropped
+            if (cfm != null)
+                ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfm, unfinishedCompactions.get(kscf));
+        }
         SystemKeyspace.discardCompactionsInProgress();
 
         // clean up debris in the rest of the keyspaces
@@ -293,14 +241,15 @@
                 continue;
 
             for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(keyspaceName).values())
-                ColumnFamilyStore.scrubDataDirectories(keyspaceName, cfm.cfName);
+                ColumnFamilyStore.scrubDataDirectories(cfm);
         }
 
+        Keyspace.setInitialized();
         // initialize keyspaces
         for (String keyspaceName : Schema.instance.getKeyspaces())
         {
             if (logger.isDebugEnabled())
-                logger.debug("opening keyspace " + keyspaceName);
+                logger.debug("opening keyspace {}", keyspaceName);
             // disable auto compaction until commit log replay ends
             for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
             {
@@ -319,17 +268,14 @@
 
         try
         {
-            GCInspector.instance.start();
+            GCInspector.register();
         }
         catch (Throwable t)
         {
+            JVMStabilityInspector.inspectThrowable(t);
             logger.warn("Unable to start GCInspector (currently only supported on the Sun JVM)");
         }
 
-        // MeteredFlusher can block if flush queue fills up, so don't put on scheduledTasks
-        // Start it before commit log, so memtables can flush during commit log replay
-        StorageService.optionalTasks.scheduleWithFixedDelay(new MeteredFlusher(), 1000, 1000, TimeUnit.MILLISECONDS);
-
         // replay the log if necessary
         try
         {
@@ -408,10 +354,11 @@
         // Thrift
         InetAddress rpcAddr = DatabaseDescriptor.getRpcAddress();
         int rpcPort = DatabaseDescriptor.getRpcPort();
-        thriftServer = new ThriftServer(rpcAddr, rpcPort);
+        int listenBacklog = DatabaseDescriptor.getRpcListenBacklog();
+        thriftServer = new ThriftServer(rpcAddr, rpcPort, listenBacklog);
 
         // Native transport
-        InetAddress nativeAddr = DatabaseDescriptor.getNativeTransportAddress();
+        InetAddress nativeAddr = DatabaseDescriptor.getRpcAddress();
         int nativePort = DatabaseDescriptor.getNativeTransportPort();
         nativeServer = new org.apache.cassandra.transport.Server(nativeAddr, nativePort);
     }
@@ -489,7 +436,7 @@
             }
             catch (Exception e)
             {
-                logger.error("error registering MBean " + MBEAN_NAME, e);
+                logger.error("error registering MBean {}", MBEAN_NAME, e);
                 //Allow the server to start even if the bean can't be registered
             }
             
diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java
index 7611a14..492a07a 100644
--- a/src/java/org/apache/cassandra/service/ClientState.java
+++ b/src/java/org/apache/cassandra/service/ClientState.java
@@ -92,7 +92,7 @@
 
     // isInternal is used to mark ClientState as used by some internal component
     // that should have an ability to modify system keyspace.
-    private final boolean isInternal;
+    public final boolean isInternal;
 
     // The remote address of the client - null for internal clients.
     private final SocketAddress remoteAddress;
diff --git a/src/java/org/apache/cassandra/service/DatacenterWriteResponseHandler.java b/src/java/org/apache/cassandra/service/DatacenterWriteResponseHandler.java
index 74dd488..fb8f992 100644
--- a/src/java/org/apache/cassandra/service/DatacenterWriteResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/DatacenterWriteResponseHandler.java
@@ -45,8 +45,7 @@
     public void response(MessageIn message)
     {
         if (message == null || consistencyLevel.isLocal(message.from))
-            if (responses.decrementAndGet() == 0)
-                signal();
+            super.response(message);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/service/EmbeddedCassandraService.java b/src/java/org/apache/cassandra/service/EmbeddedCassandraService.java
index 8993900..659d851 100644
--- a/src/java/org/apache/cassandra/service/EmbeddedCassandraService.java
+++ b/src/java/org/apache/cassandra/service/EmbeddedCassandraService.java
@@ -52,9 +52,4 @@
         cassandraDaemon.init(null);
         cassandraDaemon.start();
     }
-
-    public void stop() throws IOException
-    {
-        cassandraDaemon.deactivate();
-    }
 }
diff --git a/src/java/org/apache/cassandra/service/FileCacheService.java b/src/java/org/apache/cassandra/service/FileCacheService.java
index 59b5548..250e625 100644
--- a/src/java/org/apache/cassandra/service/FileCacheService.java
+++ b/src/java/org/apache/cassandra/service/FileCacheService.java
@@ -23,6 +23,7 @@
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 
 import com.google.common.cache.*;
 import org.slf4j.Logger;
@@ -41,36 +42,66 @@
 
     public static FileCacheService instance = new FileCacheService();
 
-    private static final Callable<Queue<RandomAccessReader>> cacheForPathCreator = new Callable<Queue<RandomAccessReader>>()
+    private static final AtomicLong cacheKeyIdCounter = new AtomicLong();
+    public static final class CacheKey
+    {
+        final long id;
+        public CacheKey()
+        {
+            this.id = cacheKeyIdCounter.incrementAndGet();
+        }
+        public boolean equals(Object that)
+        {
+            return that instanceof CacheKey && ((CacheKey) that).id == this.id;
+        }
+        public int hashCode()
+        {
+            return (int) id;
+        }
+    }
+
+    private static final Callable<CacheBucket> cacheForPathCreator = new Callable<CacheBucket>()
     {
         @Override
-        public Queue<RandomAccessReader> call()
+        public CacheBucket call()
         {
-            return new ConcurrentLinkedQueue<RandomAccessReader>();
+            return new CacheBucket();
         }
     };
 
     private static final AtomicInteger memoryUsage = new AtomicInteger();
 
-    private final Cache<String, Queue<RandomAccessReader>> cache;
+    private final Cache<CacheKey, CacheBucket> cache;
     private final FileCacheMetrics metrics = new FileCacheMetrics();
 
+    private static final class CacheBucket
+    {
+        final ConcurrentLinkedQueue<RandomAccessReader> queue = new ConcurrentLinkedQueue<>();
+        volatile boolean discarded = false;
+    }
+
     protected FileCacheService()
     {
-        RemovalListener<String, Queue<RandomAccessReader>> onRemove = new RemovalListener<String, Queue<RandomAccessReader>>()
+        RemovalListener<CacheKey, CacheBucket> onRemove = new RemovalListener<CacheKey, CacheBucket>()
         {
             @Override
-            public void onRemoval(RemovalNotification<String, Queue<RandomAccessReader>> notification)
+            public void onRemoval(RemovalNotification<CacheKey, CacheBucket> notification)
             {
-                Queue<RandomAccessReader> cachedInstances = notification.getValue();
-                if (cachedInstances == null)
+                CacheBucket bucket = notification.getValue();
+                if (bucket == null)
                     return;
 
-                if (cachedInstances.size() > 0)
-                    logger.debug("Evicting cold readers for {}", cachedInstances.peek().getPath());
-
-                for (RandomAccessReader reader = cachedInstances.poll(); reader != null; reader = cachedInstances.poll())
+                // set discarded before deallocating the readers, to ensure we don't leak any
+                bucket.discarded = true;
+                Queue<RandomAccessReader> q = bucket.queue;
+                boolean first = true;
+                for (RandomAccessReader reader = q.poll() ; reader != null ; reader = q.poll())
                 {
+                    if (logger.isDebugEnabled() && first)
+                    {
+                        logger.debug("Evicting cold readers for {}", reader.getPath());
+                        first = false;
+                    }
                     memoryUsage.addAndGet(-1 * reader.getTotalBufferSize());
                     reader.deallocate();
                 }
@@ -81,15 +112,16 @@
                 .expireAfterAccess(AFTER_ACCESS_EXPIRATION, TimeUnit.MILLISECONDS)
                 .concurrencyLevel(DatabaseDescriptor.getConcurrentReaders())
                 .removalListener(onRemove)
+                .initialCapacity(16 << 10)
                 .build();
     }
 
-    public RandomAccessReader get(String path)
+    public RandomAccessReader get(CacheKey key)
     {
         metrics.requests.mark();
 
-        Queue<RandomAccessReader> instances = getCacheFor(path);
-        RandomAccessReader result = instances.poll();
+        CacheBucket bucket = getCacheFor(key);
+        RandomAccessReader result = bucket.queue.poll();
         if (result != null)
         {
             metrics.hits.mark();
@@ -99,11 +131,11 @@
         return result;
     }
 
-    private Queue<RandomAccessReader> getCacheFor(String path)
+    private CacheBucket getCacheFor(CacheKey key)
     {
         try
         {
-            return cache.get(path, cacheForPathCreator);
+            return cache.get(key, cacheForPathCreator);
         }
         catch (ExecutionException e)
         {
@@ -111,34 +143,46 @@
         }
     }
 
-    public void put(RandomAccessReader instance)
+    public void put(CacheKey cacheKey, RandomAccessReader instance)
     {
         int memoryUsed = memoryUsage.get();
         if (logger.isDebugEnabled())
             logger.debug("Estimated memory usage is {} compared to actual usage {}", memoryUsed, sizeInBytes());
 
-        if (memoryUsed >= MEMORY_USAGE_THRESHOLD)
+        CacheBucket bucket = cache.getIfPresent(cacheKey);
+        if (memoryUsed >= MEMORY_USAGE_THRESHOLD || bucket == null)
         {
             instance.deallocate();
         }
         else
         {
             memoryUsage.addAndGet(instance.getTotalBufferSize());
-            getCacheFor(instance.getPath()).add(instance);
+            bucket.queue.add(instance);
+            if (bucket.discarded)
+            {
+                RandomAccessReader reader = bucket.queue.poll();
+                if (reader != null)
+                {
+                    memoryUsage.addAndGet(-1 * reader.getTotalBufferSize());
+                    reader.deallocate();
+                }
+            }
         }
     }
 
-    public void invalidate(String path)
+    public void invalidate(CacheKey cacheKey, String path)
     {
-        logger.debug("Invalidating cache for {}", path);
-        cache.invalidate(path);
+        if (logger.isDebugEnabled())
+            logger.debug("Invalidating cache for {}", path);
+        cache.invalidate(cacheKey);
     }
 
+    // TODO: this method is unsafe, as it calls getTotalBufferSize() on items that may already have been discarded
     public long sizeInBytes()
     {
         long n = 0;
-        for (Queue<RandomAccessReader> queue : cache.asMap().values())
-            for (RandomAccessReader reader : queue)
+        for (CacheBucket bucket : cache.asMap().values())
+            for (RandomAccessReader reader : bucket.queue)
                 n += reader.getTotalBufferSize();
         return n;
     }
diff --git a/src/java/org/apache/cassandra/service/GCInspector.java b/src/java/org/apache/cassandra/service/GCInspector.java
index 9961bf9..c4bffac 100644
--- a/src/java/org/apache/cassandra/service/GCInspector.java
+++ b/src/java/org/apache/cassandra/service/GCInspector.java
@@ -17,112 +17,156 @@
  */
 package org.apache.cassandra.service;
 
-import java.lang.management.GarbageCollectorMXBean;
 import java.lang.management.ManagementFactory;
-import java.lang.management.MemoryMXBean;
 import java.lang.management.MemoryUsage;
 import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 import javax.management.MBeanServer;
+import javax.management.Notification;
+import javax.management.NotificationListener;
 import javax.management.ObjectName;
+import javax.management.openmbean.CompositeData;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.sun.management.GarbageCollectionNotificationInfo;
 import org.apache.cassandra.io.sstable.SSTableDeletingTask;
 import org.apache.cassandra.utils.StatusLogger;
 
-public class GCInspector
+public class GCInspector implements NotificationListener, GCInspectorMXBean
 {
+    public static final String MBEAN_NAME = "org.apache.cassandra.service:type=GCInspector";
     private static final Logger logger = LoggerFactory.getLogger(GCInspector.class);
-    final static long INTERVAL_IN_MS = 1000;
-    final static long MIN_DURATION = 200;
-    final static long MIN_DURATION_TPSTATS = 1000;
+    final static long MIN_LOG_DURATION = 200;
+    final static long MIN_LOG_DURATION_TPSTATS = 1000;
 
-    public static final GCInspector instance = new GCInspector();
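+    // immutable snapshot of accumulated GC activity; a new instance replaces the previous one
+    // atomically instead of mutating shared state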
+    static final class State
+    {
+        final double maxRealTimeElapsed;
+        final double totalRealTimeElapsed;
+        final double sumSquaresRealTimeElapsed;
+        final double totalBytesReclaimed;
+        final double count;
+        final long startNanos;
 
-    private final HashMap<String, Long> gctimes = new HashMap<String, Long>();
-    private final HashMap<String, Long> gccounts = new HashMap<String, Long>();
+        State(double extraElapsed, double extraBytes, State prev)
+        {
+            this.totalRealTimeElapsed = prev.totalRealTimeElapsed + extraElapsed;
+            this.totalBytesReclaimed = prev.totalBytesReclaimed + extraBytes;
+            this.sumSquaresRealTimeElapsed = prev.sumSquaresRealTimeElapsed + (extraElapsed * extraElapsed);
+            this.startNanos = prev.startNanos;
+            this.count = prev.count + 1;
+            this.maxRealTimeElapsed = Math.max(prev.maxRealTimeElapsed, extraElapsed);
+        }
 
-    final List<GarbageCollectorMXBean> beans = new ArrayList<GarbageCollectorMXBean>();
-    final MemoryMXBean membean = ManagementFactory.getMemoryMXBean();
+        State()
+        {
+            count = maxRealTimeElapsed = sumSquaresRealTimeElapsed = totalRealTimeElapsed = totalBytesReclaimed = 0;
+            startNanos = System.nanoTime();
+        }
+    }
+
+    final AtomicReference<State> state = new AtomicReference<>(new State());
 
     public GCInspector()
     {
-        MBeanServer server = ManagementFactory.getPlatformMBeanServer();
+        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
+
         try
         {
-            ObjectName gcName = new ObjectName(ManagementFactory.GARBAGE_COLLECTOR_MXBEAN_DOMAIN_TYPE + ",*");
-            for (ObjectName name : server.queryNames(gcName, null))
-            {
-                GarbageCollectorMXBean gc = ManagementFactory.newPlatformMXBeanProxy(server, name.getCanonicalName(), GarbageCollectorMXBean.class);
-                beans.add(gc);
-            }
+            mbs.registerMBean(this, new ObjectName(MBEAN_NAME));
         }
         catch (Exception e)
         {
             throw new RuntimeException(e);
         }
+
     }
 
-    public void start()
+    public static void register() throws Exception
     {
-        // don't bother starting a thread that will do nothing.
-        if (beans.size() == 0)
-            return;
-        Runnable t = new Runnable()
+        GCInspector inspector = new GCInspector();
+        MBeanServer server = ManagementFactory.getPlatformMBeanServer();
+        ObjectName gcName = new ObjectName(ManagementFactory.GARBAGE_COLLECTOR_MXBEAN_DOMAIN_TYPE + ",*");
+        for (ObjectName name : server.queryNames(gcName, null))
         {
-            public void run()
+            server.addNotificationListener(name, inspector, null, null);
+        }
+    }
+
+    public void handleNotification(Notification notification, Object handback)
+    {
+        String type = notification.getType();
+        if (type.equals(GarbageCollectionNotificationInfo.GARBAGE_COLLECTION_NOTIFICATION))
+        {
+            // retrieve the garbage collection notification information
+            CompositeData cd = (CompositeData) notification.getUserData();
+            GarbageCollectionNotificationInfo info = GarbageCollectionNotificationInfo.from(cd);
+
+            long duration = info.getGcInfo().getDuration();
+
+            StringBuilder sb = new StringBuilder();
+            sb.append(info.getGcName()).append(" GC in ").append(duration).append("ms.  ");
+
+            long bytes = 0;
+            List<String> keys = new ArrayList<>(info.getGcInfo().getMemoryUsageBeforeGc().keySet());
+            Collections.sort(keys);
+            for (String key : keys)
             {
-                logGCResults();
+                MemoryUsage before = info.getGcInfo().getMemoryUsageBeforeGc().get(key);
+                MemoryUsage after = info.getGcInfo().getMemoryUsageAfterGc().get(key);
+                if (after != null && after.getUsed() != before.getUsed())
+                {
+                    sb.append(key).append(": ").append(before.getUsed());
+                    sb.append(" -> ");
+                    sb.append(after.getUsed());
+                    if (!key.equals(keys.get(keys.size() - 1)))
+                        sb.append("; ");
+                    bytes += before.getUsed() - after.getUsed();
+                }
             }
-        };
-        StorageService.scheduledTasks.scheduleWithFixedDelay(t, INTERVAL_IN_MS, INTERVAL_IN_MS, TimeUnit.MILLISECONDS);
-    }
 
-    private void logGCResults()
-    {
-        for (GarbageCollectorMXBean gc : beans)
-        {
-            Long previousTotal = gctimes.get(gc.getName());
-            Long total = gc.getCollectionTime();
-            if (previousTotal == null)
-                previousTotal = 0L;
-            if (previousTotal.equals(total))
-                continue;
-            gctimes.put(gc.getName(), total);
-            Long duration = total - previousTotal; // may be zero for a really fast collection
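+            // fold this collection into the running totals via a lock-free compare-and-set loop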
+            while (true)
+            {
+                State prev = state.get();
+                if (state.compareAndSet(prev, new State(duration, bytes, prev)))
+                    break;
+            }
 
-            Long previousCount = gccounts.get(gc.getName());
-            Long count = gc.getCollectionCount();
-
-            if (previousCount == null)
-                previousCount = 0L;
-            if (count.equals(previousCount))
-                continue;
-
-            gccounts.put(gc.getName(), count);
-
-            MemoryUsage mu = membean.getHeapMemoryUsage();
-            long memoryUsed = mu.getUsed();
-            long memoryMax = mu.getMax();
-
-            String st = String.format("GC for %s: %s ms for %s collections, %s used; max is %s",
-                                      gc.getName(), duration, count - previousCount, memoryUsed, memoryMax);
-            long durationPerCollection = duration / (count - previousCount);
-            if (durationPerCollection > MIN_DURATION)
+            String st = sb.toString();
+            if (duration > MIN_LOG_DURATION)
                 logger.info(st);
             else if (logger.isDebugEnabled())
                 logger.debug(st);
 
-            if (durationPerCollection > MIN_DURATION_TPSTATS)
+            if (duration > MIN_LOG_DURATION_TPSTATS)
                 StatusLogger.log();
 
             // if we just finished a full collection and we're still using a lot of memory, try to reduce the pressure
-            if (gc.getName().equals("ConcurrentMarkSweep"))
+            if (info.getGcName().equals("ConcurrentMarkSweep"))
                 SSTableDeletingTask.rescheduleFailedTasks();
         }
     }
+
+    public State getTotalSinceLastCheck()
+    {
+        return state.getAndSet(new State());
+    }
+
+    public double[] getAndResetStats()
+    {
+        State state = getTotalSinceLastCheck();
+        double[] r = new double[6];
+        r[0] = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - state.startNanos);
+        r[1] = state.maxRealTimeElapsed;
+        r[2] = state.totalRealTimeElapsed;
+        r[3] = state.sumSquaresRealTimeElapsed;
+        r[4] = state.totalBytesReclaimed;
+        r[5] = state.count;
+        return r;
+    }
 }
diff --git a/src/java/org/apache/cassandra/service/GCInspectorMXBean.java b/src/java/org/apache/cassandra/service/GCInspectorMXBean.java
new file mode 100644
index 0000000..c26a67c
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/GCInspectorMXBean.java
@@ -0,0 +1,25 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.service;
+
+public interface GCInspectorMXBean
+{
+    // returns { interval (ms), max(gc real time (ms)), sum(gc real time (ms)), sum((gc real time (ms))^2), sum(gc bytes), count(gc) }
+    public double[] getAndResetStats();
+}
diff --git a/src/java/org/apache/cassandra/service/IMigrationListener.java b/src/java/org/apache/cassandra/service/IMigrationListener.java
index e16ac62..4d142bd 100644
--- a/src/java/org/apache/cassandra/service/IMigrationListener.java
+++ b/src/java/org/apache/cassandra/service/IMigrationListener.java
@@ -21,10 +21,13 @@
 {
     public void onCreateKeyspace(String ksName);
     public void onCreateColumnFamily(String ksName, String cfName);
+    public void onCreateUserType(String ksName, String typeName);
 
     public void onUpdateKeyspace(String ksName);
     public void onUpdateColumnFamily(String ksName, String cfName);
+    public void onUpdateUserType(String ksName, String typeName);
 
     public void onDropKeyspace(String ksName);
     public void onDropColumnFamily(String ksName, String cfName);
+    public void onDropUserType(String ksName, String typeName);
 }
diff --git a/src/java/org/apache/cassandra/service/MigrationManager.java b/src/java/org/apache/cassandra/service/MigrationManager.java
index b474bdc..bdae208 100644
--- a/src/java/org/apache/cassandra/service/MigrationManager.java
+++ b/src/java/org/apache/cassandra/service/MigrationManager.java
@@ -18,10 +18,8 @@
 package org.apache.cassandra.service;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
-import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.Future;
@@ -38,15 +36,17 @@
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.UTMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.AlreadyExistsException;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.*;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.WrappedRunnable;
 
@@ -54,8 +54,6 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(MigrationManager.class);
 
-    private static final ByteBuffer LAST_MIGRATION_KEY = ByteBufferUtil.bytes("Last Migration");
-
     public static final MigrationManager instance = new MigrationManager();
 
     private static final RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean();
@@ -169,6 +167,12 @@
             listener.onCreateColumnFamily(cfm.ksName, cfm.cfName);
     }
 
+    public void notifyCreateUserType(UserType ut)
+    {
+        for (IMigrationListener listener : listeners)
+            listener.onCreateUserType(ut.keyspace, ut.getNameAsString());
+    }
+
     public void notifyUpdateKeyspace(KSMetaData ksm)
     {
         for (IMigrationListener listener : listeners)
@@ -181,6 +185,12 @@
             listener.onUpdateColumnFamily(cfm.ksName, cfm.cfName);
     }
 
+    public void notifyUpdateUserType(UserType ut)
+    {
+        for (IMigrationListener listener : listeners)
+            listener.onUpdateUserType(ut.keyspace, ut.getNameAsString());
+    }
+
     public void notifyDropKeyspace(KSMetaData ksm)
     {
         for (IMigrationListener listener : listeners)
@@ -193,12 +203,23 @@
             listener.onDropColumnFamily(cfm.ksName, cfm.cfName);
     }
 
-    public static void announceNewKeyspace(KSMetaData ksm) throws ConfigurationException
+    public void notifyDropUserType(UserType ut)
     {
-        announceNewKeyspace(ksm, FBUtilities.timestampMicros());
+        for (IMigrationListener listener : listeners)
+            listener.onDropUserType(ut.keyspace, ut.getNameAsString());
     }
 
-    public static void announceNewKeyspace(KSMetaData ksm, long timestamp) throws ConfigurationException
+    public static void announceNewKeyspace(KSMetaData ksm) throws ConfigurationException
+    {
+        announceNewKeyspace(ksm, false);
+    }
+
+    public static void announceNewKeyspace(KSMetaData ksm, boolean announceLocally) throws ConfigurationException
+    {
+        announceNewKeyspace(ksm, FBUtilities.timestampMicros(), announceLocally);
+    }
+
+    public static void announceNewKeyspace(KSMetaData ksm, long timestamp, boolean announceLocally) throws ConfigurationException
     {
         ksm.validate();
 
@@ -206,11 +227,16 @@
             throw new AlreadyExistsException(ksm.name);
 
         logger.info(String.format("Create new Keyspace: %s", ksm));
-        announce(ksm.toSchema(timestamp));
+        announce(ksm.toSchema(timestamp), announceLocally);
     }
 
     public static void announceNewColumnFamily(CFMetaData cfm) throws ConfigurationException
     {
+        announceNewColumnFamily(cfm, false);
+    }
+
+    public static void announceNewColumnFamily(CFMetaData cfm, boolean announceLocally) throws ConfigurationException
+    {
         cfm.validate();
 
         KSMetaData ksm = Schema.instance.getKSMetaData(cfm.ksName);
@@ -220,11 +246,26 @@
             throw new AlreadyExistsException(cfm.ksName, cfm.cfName);
 
         logger.info(String.format("Create new ColumnFamily: %s", cfm));
-        announce(addSerializedKeyspace(cfm.toSchema(FBUtilities.timestampMicros()), cfm.ksName));
+        announce(addSerializedKeyspace(cfm.toSchema(FBUtilities.timestampMicros()), cfm.ksName), announceLocally);
+    }
+
+    public static void announceNewType(UserType newType)
+    {
+        announceNewType(newType, false);
+    }
+
+    public static void announceNewType(UserType newType, boolean announceLocally)
+    {
+        announce(addSerializedKeyspace(UTMetaData.toSchema(newType, FBUtilities.timestampMicros()), newType.keyspace), announceLocally);
     }
 
     public static void announceKeyspaceUpdate(KSMetaData ksm) throws ConfigurationException
     {
+        announceKeyspaceUpdate(ksm, false);
+    }
+
+    public static void announceKeyspaceUpdate(KSMetaData ksm, boolean announceLocally) throws ConfigurationException
+    {
         ksm.validate();
 
         KSMetaData oldKsm = Schema.instance.getKSMetaData(ksm.name);
@@ -232,11 +273,16 @@
             throw new ConfigurationException(String.format("Cannot update non existing keyspace '%s'.", ksm.name));
 
         logger.info(String.format("Update Keyspace '%s' From %s To %s", ksm.name, oldKsm, ksm));
-        announce(oldKsm.toSchemaUpdate(ksm, FBUtilities.timestampMicros()));
+        announce(oldKsm.toSchemaUpdate(ksm, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceColumnFamilyUpdate(CFMetaData cfm, boolean fromThrift) throws ConfigurationException
     {
+        announceColumnFamilyUpdate(cfm, fromThrift, false);
+    }
+
+    public static void announceColumnFamilyUpdate(CFMetaData cfm, boolean fromThrift, boolean announceLocally) throws ConfigurationException
+    {
         cfm.validate();
 
         CFMetaData oldCfm = Schema.instance.getCFMetaData(cfm.ksName, cfm.cfName);
@@ -246,55 +292,99 @@
         oldCfm.validateCompatility(cfm);
 
         logger.info(String.format("Update ColumnFamily '%s/%s' From %s To %s", cfm.ksName, cfm.cfName, oldCfm, cfm));
-        announce(addSerializedKeyspace(oldCfm.toSchemaUpdate(cfm, FBUtilities.timestampMicros(), fromThrift), cfm.ksName));
+        announce(addSerializedKeyspace(oldCfm.toSchemaUpdate(cfm, FBUtilities.timestampMicros(), fromThrift), cfm.ksName), announceLocally);
+    }
+
+    public static void announceTypeUpdate(UserType updatedType)
+    {
+        announceTypeUpdate(updatedType, false);
+    }
+
+    public static void announceTypeUpdate(UserType updatedType, boolean announceLocally)
+    {
+        announceNewType(updatedType, announceLocally);
     }
 
     public static void announceKeyspaceDrop(String ksName) throws ConfigurationException
     {
+        announceKeyspaceDrop(ksName, false);
+    }
+
+    public static void announceKeyspaceDrop(String ksName, boolean announceLocally) throws ConfigurationException
+    {
         KSMetaData oldKsm = Schema.instance.getKSMetaData(ksName);
         if (oldKsm == null)
             throw new ConfigurationException(String.format("Cannot drop non existing keyspace '%s'.", ksName));
 
         logger.info(String.format("Drop Keyspace '%s'", oldKsm.name));
-        announce(oldKsm.dropFromSchema(FBUtilities.timestampMicros()));
+        announce(oldKsm.dropFromSchema(FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceColumnFamilyDrop(String ksName, String cfName) throws ConfigurationException
     {
+        announceColumnFamilyDrop(ksName, cfName, false);
+    }
+
+    public static void announceColumnFamilyDrop(String ksName, String cfName, boolean announceLocally) throws ConfigurationException
+    {
         CFMetaData oldCfm = Schema.instance.getCFMetaData(ksName, cfName);
         if (oldCfm == null)
             throw new ConfigurationException(String.format("Cannot drop non existing column family '%s' in keyspace '%s'.", cfName, ksName));
 
         logger.info(String.format("Drop ColumnFamily '%s/%s'", oldCfm.ksName, oldCfm.cfName));
-        announce(addSerializedKeyspace(oldCfm.dropFromSchema(FBUtilities.timestampMicros()), ksName));
+        announce(addSerializedKeyspace(oldCfm.dropFromSchema(FBUtilities.timestampMicros()), ksName), announceLocally);
     }
 
     // Include the serialized keyspace for when a target node missed the CREATE KEYSPACE migration (see #5631).
-    private static RowMutation addSerializedKeyspace(RowMutation migration, String ksName)
+    private static Mutation addSerializedKeyspace(Mutation migration, String ksName)
     {
-        migration.add(SystemKeyspace.readSchemaRow(ksName).cf);
+        migration.add(SystemKeyspace.readSchemaRow(SystemKeyspace.SCHEMA_KEYSPACES_CF, ksName).cf);
         return migration;
     }
 
+    public static void announceTypeDrop(UserType droppedType)
+    {
+        announceTypeDrop(droppedType, false);
+    }
+
+    public static void announceTypeDrop(UserType droppedType, boolean announceLocally)
+    {
+        announce(addSerializedKeyspace(UTMetaData.dropFromSchema(droppedType, FBUtilities.timestampMicros()), droppedType.keyspace), announceLocally);
+    }
+
     /**
      * actively announce a new version to active hosts via rpc
      * @param schema The schema mutation to be applied
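+     * @param announceLocally if true, apply the schema mutation only to the local node instead of announcing it to the cluster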
      */
-    private static void announce(RowMutation schema)
+    private static void announce(Mutation schema, boolean announceLocally)
     {
-        FBUtilities.waitOnFuture(announce(Collections.singletonList(schema)));
+        if (announceLocally)
+        {
+            try
+            {
+                DefsTables.mergeSchemaInternal(Collections.singletonList(schema), false);
+            }
+            catch (ConfigurationException | IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        else
+        {
+            FBUtilities.waitOnFuture(announce(Collections.singletonList(schema)));
+        }
     }
 
-    private static void pushSchemaMutation(InetAddress endpoint, Collection<RowMutation> schema)
+    private static void pushSchemaMutation(InetAddress endpoint, Collection<Mutation> schema)
     {
-        MessageOut<Collection<RowMutation>> msg = new MessageOut<>(MessagingService.Verb.DEFINITIONS_UPDATE,
-                                                                   schema,
-                                                                   MigrationsSerializer.instance);
+        MessageOut<Collection<Mutation>> msg = new MessageOut<>(MessagingService.Verb.DEFINITIONS_UPDATE,
+                                                                schema,
+                                                                MigrationsSerializer.instance);
         MessagingService.instance().sendOneWay(msg, endpoint);
     }
 
     // Returns a future on the local application of the schema
-    private static Future<?> announce(final Collection<RowMutation> schema)
+    private static Future<?> announce(final Collection<Mutation> schema)
     {
         Future<?> f = StageManager.getStage(Stage.MIGRATION).submit(new WrappedRunnable()
         {
@@ -325,7 +415,7 @@
     public static void passiveAnnounce(UUID version)
     {
         Gossiper.instance.addLocalApplicationState(ApplicationState.SCHEMA, StorageService.instance.valueFactory.schema(version));
-        logger.debug("Gossiping my schema version " + version);
+        logger.debug("Gossiping my schema version {}", version);
     }
 
     /**
@@ -341,10 +431,8 @@
         logger.debug("Truncating schema tables...");
 
         // truncate schema tables
-        SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_KEYSPACES_CF).truncateBlocking();
-        SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF).truncateBlocking();
-        SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_COLUMNS_CF).truncateBlocking();
-        SystemKeyspace.schemaCFS(SystemKeyspace.SCHEMA_TRIGGERS_CF).truncateBlocking();
+        for (String cf : SystemKeyspace.allSchemaCfs)
+            SystemKeyspace.schemaCFS(cf).truncateBlocking();
 
         logger.debug("Clearing local schema keyspace definitions...");
 
@@ -367,33 +455,33 @@
         logger.info("Local schema reset is complete.");
     }
 
-    public static class MigrationsSerializer implements IVersionedSerializer<Collection<RowMutation>>
+    public static class MigrationsSerializer implements IVersionedSerializer<Collection<Mutation>>
     {
         public static MigrationsSerializer instance = new MigrationsSerializer();
 
-        public void serialize(Collection<RowMutation> schema, DataOutput out, int version) throws IOException
+        public void serialize(Collection<Mutation> schema, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(schema.size());
-            for (RowMutation rm : schema)
-                RowMutation.serializer.serialize(rm, out, version);
+            for (Mutation mutation : schema)
+                Mutation.serializer.serialize(mutation, out, version);
         }
 
-        public Collection<RowMutation> deserialize(DataInput in, int version) throws IOException
+        public Collection<Mutation> deserialize(DataInput in, int version) throws IOException
         {
             int count = in.readInt();
-            Collection<RowMutation> schema = new ArrayList<RowMutation>(count);
+            Collection<Mutation> schema = new ArrayList<Mutation>(count);
 
             for (int i = 0; i < count; i++)
-                schema.add(RowMutation.serializer.deserialize(in, version));
+                schema.add(Mutation.serializer.deserialize(in, version));
 
             return schema;
         }
 
-        public long serializedSize(Collection<RowMutation> schema, int version)
+        public long serializedSize(Collection<Mutation> schema, int version)
         {
             int size = TypeSizes.NATIVE.sizeof(schema.size());
-            for (RowMutation rm : schema)
-                size += RowMutation.serializer.serializedSize(rm, version);
+            for (Mutation mutation : schema)
+                size += Mutation.serializer.serializedSize(mutation, version);
             return size;
         }
     }
diff --git a/src/java/org/apache/cassandra/service/MigrationTask.java b/src/java/org/apache/cassandra/service/MigrationTask.java
index 93572f0..9fdbff4 100644
--- a/src/java/org/apache/cassandra/service/MigrationTask.java
+++ b/src/java/org/apache/cassandra/service/MigrationTask.java
@@ -24,9 +24,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.DefsTables;
-import org.apache.cassandra.db.RowMutation;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.net.IAsyncCallback;
 import org.apache.cassandra.net.MessageIn;
@@ -56,10 +56,10 @@
             return;
         }
 
-        IAsyncCallback<Collection<RowMutation>> cb = new IAsyncCallback<Collection<RowMutation>>()
+        IAsyncCallback<Collection<Mutation>> cb = new IAsyncCallback<Collection<Mutation>>()
         {
             @Override
-            public void response(MessageIn<Collection<RowMutation>> message)
+            public void response(MessageIn<Collection<Mutation>> message)
             {
                 try
                 {
diff --git a/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java b/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java
index 2a88e7d..0ff8a92 100644
--- a/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java
+++ b/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java
@@ -43,7 +43,7 @@
 import java.util.Collection;
 import java.util.concurrent.*;
 
-public class PendingRangeCalculatorService extends PendingRangeCalculatorServiceMBean
+public class PendingRangeCalculatorService
 {
     public static final PendingRangeCalculatorService instance = new PendingRangeCalculatorService();
 
diff --git a/src/java/org/apache/cassandra/service/PendingRangeCalculatorServiceMBean.java b/src/java/org/apache/cassandra/service/PendingRangeCalculatorServiceMBean.java
deleted file mode 100644
index c9b04f0..0000000
--- a/src/java/org/apache/cassandra/service/PendingRangeCalculatorServiceMBean.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.service;
-
-public class PendingRangeCalculatorServiceMBean
-{
-}
diff --git a/src/java/org/apache/cassandra/service/QueryState.java b/src/java/org/apache/cassandra/service/QueryState.java
index 12fc392..0179a3e 100644
--- a/src/java/org/apache/cassandra/service/QueryState.java
+++ b/src/java/org/apache/cassandra/service/QueryState.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.service;
 
 import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
 
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
@@ -68,7 +69,7 @@
         }
 
         double tracingProbability = StorageService.instance.getTracingProbability();
-        return tracingProbability != 0 && FBUtilities.threadLocalRandom().nextDouble() < tracingProbability;
+        return tracingProbability != 0 && ThreadLocalRandom.current().nextDouble() < tracingProbability;
     }
 
     public void prepareTracingSession(UUID sessionId)
diff --git a/src/java/org/apache/cassandra/service/ReadCallback.java b/src/java/org/apache/cassandra/service/ReadCallback.java
index 150fabe..29eaadf 100644
--- a/src/java/org/apache/cassandra/service/ReadCallback.java
+++ b/src/java/org/apache/cassandra/service/ReadCallback.java
@@ -21,7 +21,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -41,7 +41,7 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 public class ReadCallback<TMessage, TResolved> implements IAsyncCallback<TMessage>
 {
@@ -54,7 +54,9 @@
     final List<InetAddress> endpoints;
     private final IReadCommand command;
     private final ConsistencyLevel consistencyLevel;
-    private final AtomicInteger received = new AtomicInteger(0);
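+    // a plain volatile int updated through a static AtomicIntegerFieldUpdater avoids allocating a separate AtomicInteger per callback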
+    private static final AtomicIntegerFieldUpdater<ReadCallback> receivedUpdater
+            = AtomicIntegerFieldUpdater.newUpdater(ReadCallback.class, "received");
+    private volatile int received = 0;
     private final Keyspace keyspace; // TODO push this into ConsistencyLevel?
 
     /**
@@ -98,7 +100,8 @@
         if (!await(command.getTimeout(), TimeUnit.MILLISECONDS))
         {
             // Same as for writes, see AbstractWriteResponseHandler
-            ReadTimeoutException ex = new ReadTimeoutException(consistencyLevel, received.get(), blockfor, resolver.isDataPresent());
+            ReadTimeoutException ex = new ReadTimeoutException(consistencyLevel, received, blockfor, resolver.isDataPresent());
+
             if (logger.isDebugEnabled())
                 logger.debug("Read timeout: {}", ex.toString());
             throw ex;
@@ -111,8 +114,8 @@
     {
         resolver.preprocess(message);
         int n = waitingFor(message)
-              ? received.incrementAndGet()
-              : received.get();
+              ? receivedUpdater.incrementAndGet(this)
+              : received;
         if (n >= blockfor && resolver.isDataPresent())
         {
             condition.signalAll();
@@ -138,7 +141,7 @@
      */
     public int getReceivedCount()
     {
-        return received.get();
+        return received;
     }
 
     public void response(TMessage result)
diff --git a/src/java/org/apache/cassandra/service/RowDataResolver.java b/src/java/org/apache/cassandra/service/RowDataResolver.java
index 00f8753..e92dad7 100644
--- a/src/java/org/apache/cassandra/service/RowDataResolver.java
+++ b/src/java/org/apache/cassandra/service/RowDataResolver.java
@@ -57,15 +57,16 @@
     */
     public Row resolve() throws DigestMismatchException
     {
+        int replyCount = replies.size();
         if (logger.isDebugEnabled())
-            logger.debug("resolving " + replies.size() + " responses");
+            logger.debug("resolving {} responses", replyCount);
         long start = System.nanoTime();
 
         ColumnFamily resolved;
-        if (replies.size() > 1)
+        if (replyCount > 1)
         {
-            List<ColumnFamily> versions = new ArrayList<ColumnFamily>(replies.size());
-            List<InetAddress> endpoints = new ArrayList<InetAddress>(replies.size());
+            List<ColumnFamily> versions = new ArrayList<>(replyCount);
+            List<InetAddress> endpoints = new ArrayList<>(replyCount);
 
             for (MessageIn<ReadResponse> message : replies)
             {
@@ -115,13 +116,12 @@
             if (diffCf == null) // no repair needs to happen
                 continue;
 
-            // create and send the row mutation message based on the diff
-            RowMutation rowMutation = new RowMutation(keyspaceName, key.key, diffCf);
-            MessageOut repairMessage;
+            // create and send the mutation message based on the diff
+            Mutation mutation = new Mutation(keyspaceName, key.getKey(), diffCf);
             // use a separate verb here because we don't want these to get the white glove hint-
             // on-timeout behavior that a "real" mutation gets
-            repairMessage = rowMutation.createMessage(MessagingService.Verb.READ_REPAIR);
-            results.add(MessagingService.instance().sendRR(repairMessage, endpoints.get(i)));
+            results.add(MessagingService.instance().sendRR(mutation.createMessage(MessagingService.Verb.READ_REPAIR),
+                                                           endpoints.get(i)));
         }
 
         return results;
@@ -146,23 +146,21 @@
             return null;
 
         // mimic the collectCollatedColumn + removeDeleted path that getColumnFamily takes.
-        // this will handle removing columns and subcolumns that are supressed by a row or
+        // this will handle removing columns and subcolumns that are suppressed by a row or
         // supercolumn tombstone.
         QueryFilter filter = new QueryFilter(null, resolved.metadata().cfName, new IdentityQueryFilter(), now);
-        List<CloseableIterator<Column>> iters = new ArrayList<CloseableIterator<Column>>();
+        List<CloseableIterator<Cell>> iters = new ArrayList<>(Iterables.size(versions));
         for (ColumnFamily version : versions)
-        {
-            if (version == null)
-                continue;
-            iters.add(FBUtilities.closeableIterator(version.iterator()));
-        }
+            if (version != null)
+                iters.add(FBUtilities.closeableIterator(version.iterator()));
         filter.collateColumns(resolved, iters, Integer.MIN_VALUE);
         return ColumnFamilyStore.removeDeleted(resolved, Integer.MIN_VALUE);
     }
 
     public Row getData()
     {
-        return replies.iterator().next().payload.row();
+        assert !replies.isEmpty();
+        return replies.peek().payload.row();
     }
 
     public boolean isDataPresent()
diff --git a/src/java/org/apache/cassandra/service/RowDigestResolver.java b/src/java/org/apache/cassandra/service/RowDigestResolver.java
index ec9f0d3..21b16bf 100644
--- a/src/java/org/apache/cassandra/service/RowDigestResolver.java
+++ b/src/java/org/apache/cassandra/service/RowDigestResolver.java
@@ -59,7 +59,7 @@
     public Row resolve() throws DigestMismatchException
     {
         if (logger.isDebugEnabled())
-            logger.debug("resolving " + replies.size() + " responses");
+            logger.debug("resolving {} responses", replies.size());
 
         long start = System.nanoTime();
 
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index d8b6619..f30862b 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -43,6 +43,8 @@
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.index.SecondaryIndexSearcher;
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Bounds;
@@ -91,6 +93,8 @@
     private static final CASClientRequestMetrics casWriteMetrics = new CASClientRequestMetrics("CASWrite");
     private static final CASClientRequestMetrics casReadMetrics = new CASClientRequestMetrics("CASRead");
 
+    private static final double CONCURRENT_SUBREQUESTS_MARGIN = 0.10;
+
     private StorageProxy() {}
 
     static
@@ -98,7 +102,7 @@
         MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
         try
         {
-            mbs.registerMBean(new StorageProxy(), new ObjectName(MBEAN_NAME));
+            mbs.registerMBean(instance, new ObjectName(MBEAN_NAME));
         }
         catch (Exception e)
         {
@@ -114,15 +118,15 @@
                               ConsistencyLevel consistency_level)
             throws OverloadedException
             {
-                assert mutation instanceof RowMutation;
-                sendToHintedEndpoints((RowMutation) mutation, targets, responseHandler, localDataCenter);
+                assert mutation instanceof Mutation;
+                sendToHintedEndpoints((Mutation) mutation, targets, responseHandler, localDataCenter);
             }
         };
 
         /*
          * We execute counter writes in 2 places: either directly in the coordinator node if it is a replica, or
-         * in CounterMutationVerbHandler on a replica othewise. The write must be executed on the MUTATION stage
-         * but on the latter case, the verb handler already run on the MUTATION stage, so we must not execute the
+         * in CounterMutationVerbHandler on a replica otherwise. The write must be executed on the COUNTER_MUTATION stage,
+         * but in the latter case the verb handler already runs on the COUNTER_MUTATION stage, so we must not execute the
          * underlying write on the stage, otherwise we risk a deadlock. Hence the two different performers.
          */
         counterWritePerformer = new WritePerformer()
@@ -131,10 +135,9 @@
                               Iterable<InetAddress> targets,
                               AbstractWriteResponseHandler responseHandler,
                               String localDataCenter,
-                              ConsistencyLevel consistency_level)
+                              ConsistencyLevel consistencyLevel)
             {
-                Runnable runnable = counterWriteTask(mutation, targets, responseHandler, localDataCenter, consistency_level);
-                runnable.run();
+                counterWriteTask(mutation, targets, responseHandler, localDataCenter).run();
             }
         };
 
@@ -144,10 +147,10 @@
                               Iterable<InetAddress> targets,
                               AbstractWriteResponseHandler responseHandler,
                               String localDataCenter,
-                              ConsistencyLevel consistency_level)
+                              ConsistencyLevel consistencyLevel)
             {
-                Runnable runnable = counterWriteTask(mutation, targets, responseHandler, localDataCenter, consistency_level);
-                StageManager.getStage(Stage.MUTATION).execute(runnable);
+                StageManager.getStage(Stage.COUNTER_MUTATION)
+                            .execute(counterWriteTask(mutation, targets, responseHandler, localDataCenter));
             }
         };
     }
@@ -186,8 +189,7 @@
      * @param keyspaceName the keyspace for the CAS
      * @param cfName the column family for the CAS
      * @param key the row key for the row to CAS
-     * @param conditions the conditions for the CAS to apply.
-     * @param updates the value to insert if {@code condtions} matches the current values.
+     * @param request the conditions for the CAS to apply as well as the update to perform if the conditions hold.
      * @param consistencyForPaxos the consistency for the paxos prepare and propose round. This can only be either SERIAL or LOCAL_SERIAL.
      * @param consistencyForCommit the consistency for write done during the commit phase. This can be anything, except SERIAL or LOCAL_SERIAL.
      *
@@ -197,13 +199,12 @@
     public static ColumnFamily cas(String keyspaceName,
                                    String cfName,
                                    ByteBuffer key,
-                                   CASConditions conditions,
-                                   ColumnFamily updates,
+                                   CASRequest request,
                                    ConsistencyLevel consistencyForPaxos,
                                    ConsistencyLevel consistencyForCommit)
     throws UnavailableException, IsBootstrappingException, ReadTimeoutException, WriteTimeoutException, InvalidRequestException
     {
-        long start = System.nanoTime();
+        final long start = System.nanoTime();
         int contentions = 0;
         try
         {
@@ -223,26 +224,26 @@
                 final Pair<UUID, Integer> pair = beginAndRepairPaxos(start, key, metadata, liveEndpoints, requiredParticipants, consistencyForPaxos, consistencyForCommit, true);
                 final UUID ballot = pair.left;
                 contentions += pair.right;
-
                 // read the current values and check they validate the conditions
                 Tracing.trace("Reading existing values for CAS precondition");
                 long timestamp = System.currentTimeMillis();
-                ReadCommand readCommand = ReadCommand.create(keyspaceName, key, cfName, timestamp, conditions.readFilter());
+                ReadCommand readCommand = ReadCommand.create(keyspaceName, key, cfName, timestamp, request.readFilter());
                 List<Row> rows = read(Arrays.asList(readCommand), consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM);
                 ColumnFamily current = rows.get(0).cf;
-                if (!conditions.appliesTo(current))
+                if (!request.appliesTo(current))
                 {
-                    Tracing.trace("CAS precondition {} does not match current values {}", conditions, current);
+                    Tracing.trace("CAS precondition does not match current values {}", current);
                     // We should not return null as this means success
                     casWriteMetrics.conditionNotMet.inc();
-                    return current == null ? EmptyColumns.factory.create(metadata) : current;
+                    return current == null ? ArrayBackedSortedColumns.factory.create(metadata) : current;
                 }
 
                 // finish the paxos round w/ the desired updates
                 // TODO turn null updates into delete?
+                ColumnFamily updates = request.makeUpdates(current);
 
                 // Apply triggers to cas updates. A consideration here is that
-                // triggers emit RowMutations, and so a given trigger implementation
+                // triggers emit Mutations, and so a given trigger implementation
                 // may generate mutations for partitions other than the one this
                 // paxos round is scoped for. In this case, TriggerExecutor will
                 // validate that the generated mutations are targeted at the same
@@ -261,7 +262,7 @@
 
                 Tracing.trace("Paxos proposal not accepted (pre-empted by a higher ballot)");
                 contentions++;
-                Uninterruptibles.sleepUninterruptibly(FBUtilities.threadLocalRandom().nextInt(100), TimeUnit.MILLISECONDS);
+                Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
                 // continue to retry
             }
 
@@ -281,10 +282,8 @@
         {
             if(contentions > 0)
                 casWriteMetrics.contention.update(contentions);
-
             casWriteMetrics.addNano(System.nanoTime() - start);
         }
-
     }
 
     private static Predicate<InetAddress> sameDCPredicateFor(final String dc)
@@ -348,7 +347,7 @@
                 Tracing.trace("Some replicas have already promised a higher ballot than ours; aborting");
                 contentions++;
                 // sleep a random amount to give the other proposer a chance to finish
-                Uninterruptibles.sleepUninterruptibly(FBUtilities.threadLocalRandom().nextInt(100), TimeUnit.MILLISECONDS);
+                Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
                 continue;
             }
 
@@ -374,7 +373,7 @@
                     Tracing.trace("Some replicas have already promised a higher ballot than ours; aborting");
                     // sleep a random amount to give the other proposer a chance to finish
                     contentions++;
-                    Uninterruptibles.sleepUninterruptibly(FBUtilities.threadLocalRandom().nextInt(100), TimeUnit.MILLISECONDS);
+                    Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
                 }
                 continue;
             }
@@ -489,7 +488,7 @@
         final String localDataCenter = DatabaseDescriptor.getEndpointSnitch().getDatacenter(FBUtilities.getBroadcastAddress());
 
         long startTime = System.nanoTime();
-        List<AbstractWriteResponseHandler> responseHandlers = new ArrayList<AbstractWriteResponseHandler>(mutations.size());
+        List<AbstractWriteResponseHandler> responseHandlers = new ArrayList<>(mutations.size());
 
         try
         {
@@ -532,7 +531,7 @@
                         // local writes can timeout, but cannot be dropped (see LocalMutationRunnable and
                         // CASSANDRA-6510), so there is no need to hint or retry
                         if (!target.equals(FBUtilities.getBroadcastAddress()) && shouldHint(target))
-                            submitHint((RowMutation) mutation, target, null);
+                            submitHint((Mutation) mutation, target, null);
                     }
                 }
                 Tracing.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
@@ -570,12 +569,12 @@
                                           boolean mutateAtomically)
     throws WriteTimeoutException, UnavailableException, OverloadedException, InvalidRequestException
     {
-        Collection<RowMutation> augmented = TriggerExecutor.instance.execute(mutations);
+        Collection<Mutation> augmented = TriggerExecutor.instance.execute(mutations);
 
         if (augmented != null)
             mutateAtomically(augmented, consistencyLevel);
         else if (mutateAtomically)
-            mutateAtomically((Collection<RowMutation>) mutations, consistencyLevel);
+            mutateAtomically((Collection<Mutation>) mutations, consistencyLevel);
         else
             mutate(mutations, consistencyLevel);
     }
@@ -586,10 +585,10 @@
      *      write the entire batch to a batchlog elsewhere in the cluster.
      * After: remove the batchlog entry (after writing hints for the batch rows, if necessary).
      *
-     * @param mutations the RowMutations to be applied across the replicas
+     * @param mutations the Mutations to be applied across the replicas
      * @param consistency_level the consistency level for the operation
      */
-    public static void mutateAtomically(Collection<RowMutation> mutations, ConsistencyLevel consistency_level)
+    public static void mutateAtomically(Collection<Mutation> mutations, ConsistencyLevel consistency_level)
     throws UnavailableException, OverloadedException, WriteTimeoutException
     {
         Tracing.trace("Determining replicas for atomic batch");
@@ -601,7 +600,7 @@
         try
         {
             // add a handler for each mutation - includes checking availability, but doesn't initiate any writes, yet
-            for (RowMutation mutation : mutations)
+            for (Mutation mutation : mutations)
             {
                 WriteResponseHandlerWrapper wrapper = wrapResponseHandler(mutation, consistency_level, WriteType.BATCH);
                 // exit early if we can't fulfill the CL at this time.
@@ -615,7 +614,7 @@
             syncWriteToBatchlog(mutations, batchlogEndpoints, batchUUID);
 
             // now actually perform the writes and wait for them to complete
-            syncWriteBatchedMutations(wrappers, localDataCenter, consistency_level);
+            syncWriteBatchedMutations(wrappers, localDataCenter);
 
             // remove the batchlog entries asynchronously
             asyncRemoveFromBatchlog(batchlogEndpoints, batchUUID);
@@ -640,52 +639,63 @@
         }
     }
 
-    private static void syncWriteToBatchlog(Collection<RowMutation> mutations, Collection<InetAddress> endpoints, UUID uuid)
+    private static void syncWriteToBatchlog(Collection<Mutation> mutations, Collection<InetAddress> endpoints, UUID uuid)
     throws WriteTimeoutException
     {
-        RowMutation rm = BatchlogManager.getBatchlogMutationFor(mutations, uuid);
         AbstractWriteResponseHandler handler = new WriteResponseHandler(endpoints,
                                                                         Collections.<InetAddress>emptyList(),
                                                                         ConsistencyLevel.ONE,
                                                                         Keyspace.open(Keyspace.SYSTEM_KS),
                                                                         null,
                                                                         WriteType.BATCH_LOG);
-        updateBatchlog(rm, endpoints, handler);
+
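+        // build the batchlog mutation once for the current messaging version; targets on older
+        // versions get a message re-serialized for their version below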
+        MessageOut<Mutation> message = BatchlogManager.getBatchlogMutationFor(mutations, uuid, MessagingService.current_version)
+                                                      .createMessage();
+        for (InetAddress target : endpoints)
+        {
+            int targetVersion = MessagingService.instance().getVersion(target);
+            if (target.equals(FBUtilities.getBroadcastAddress()) && OPTIMIZE_LOCAL_REQUESTS)
+            {
+                insertLocal(message.payload, handler);
+            }
+            else if (targetVersion == MessagingService.current_version)
+            {
+                MessagingService.instance().sendRR(message, target, handler, false);
+            }
+            else
+            {
+                MessagingService.instance().sendRR(BatchlogManager.getBatchlogMutationFor(mutations, uuid, targetVersion)
+                                                                  .createMessage(),
+                                                   target,
+                                                   handler,
+                                                   false);
+            }
+        }
+
         handler.get();
     }
 
     private static void asyncRemoveFromBatchlog(Collection<InetAddress> endpoints, UUID uuid)
     {
-        ColumnFamily cf = EmptyColumns.factory.create(Schema.instance.getCFMetaData(Keyspace.SYSTEM_KS, SystemKeyspace.BATCHLOG_CF));
-        cf.delete(new DeletionInfo(FBUtilities.timestampMicros(), (int) (System.currentTimeMillis() / 1000)));
         AbstractWriteResponseHandler handler = new WriteResponseHandler(endpoints,
                                                                         Collections.<InetAddress>emptyList(),
                                                                         ConsistencyLevel.ANY,
                                                                         Keyspace.open(Keyspace.SYSTEM_KS),
                                                                         null,
                                                                         WriteType.SIMPLE);
-        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(uuid), cf);
-        updateBatchlog(rm, endpoints, handler);
-    }
-
-    private static void updateBatchlog(RowMutation rm, Collection<InetAddress> endpoints, AbstractWriteResponseHandler handler)
-    {
-        if (endpoints.contains(FBUtilities.getBroadcastAddress()))
+        Mutation mutation = new Mutation(Keyspace.SYSTEM_KS, UUIDType.instance.decompose(uuid));
+        mutation.delete(SystemKeyspace.BATCHLOG_CF, FBUtilities.timestampMicros());
+        MessageOut<Mutation> message = mutation.createMessage();
+        for (InetAddress target : endpoints)
         {
-            assert endpoints.size() == 1;
-            insertLocal(rm, handler);
-        }
-        else
-        {
-            MessageOut<RowMutation> message = rm.createMessage();
-            for (InetAddress target : endpoints)
+            if (target.equals(FBUtilities.getBroadcastAddress()) && OPTIMIZE_LOCAL_REQUESTS)
+                insertLocal(message.payload, handler);
+            else
                 MessagingService.instance().sendRR(message, target, handler, false);
         }
     }
 
-    private static void syncWriteBatchedMutations(List<WriteResponseHandlerWrapper> wrappers,
-                                                  String localDataCenter,
-                                                  ConsistencyLevel consistencyLevel)
+    private static void syncWriteBatchedMutations(List<WriteResponseHandlerWrapper> wrappers, String localDataCenter)
     throws WriteTimeoutException, OverloadedException
     {
         for (WriteResponseHandlerWrapper wrapper : wrappers)
@@ -695,9 +705,7 @@
         }
 
         for (WriteResponseHandlerWrapper wrapper : wrappers)
-        {
             wrapper.handler.get();
-        }
     }
 
     /**
@@ -739,7 +747,7 @@
     }
 
     // same as above except does not initiate writes (but does perform availability checks).
-    private static WriteResponseHandlerWrapper wrapResponseHandler(RowMutation mutation, ConsistencyLevel consistency_level, WriteType writeType)
+    private static WriteResponseHandlerWrapper wrapResponseHandler(Mutation mutation, ConsistencyLevel consistency_level, WriteType writeType)
     {
         AbstractReplicationStrategy rs = Keyspace.open(mutation.getKeyspaceName()).getReplicationStrategy();
         String keyspaceName = mutation.getKeyspaceName();
@@ -754,9 +762,9 @@
     private static class WriteResponseHandlerWrapper
     {
         final AbstractWriteResponseHandler handler;
-        final RowMutation mutation;
+        final Mutation mutation;
 
-        WriteResponseHandlerWrapper(AbstractWriteResponseHandler handler, RowMutation mutation)
+        WriteResponseHandlerWrapper(AbstractWriteResponseHandler handler, Mutation mutation)
         {
             this.handler = handler;
             this.mutation = mutation;
@@ -775,14 +783,9 @@
     {
         TokenMetadata.Topology topology = StorageService.instance.getTokenMetadata().cachedOnlyTokenMap().getTopology();
         Multimap<String, InetAddress> localEndpoints = HashMultimap.create(topology.getDatacenterRacks().get(localDataCenter));
-        
-        // special case for single-node datacenters
-        if (localEndpoints.size() == 1)
-            return localEndpoints.values();
-
         String localRack = DatabaseDescriptor.getEndpointSnitch().getRack(FBUtilities.getBroadcastAddress());
-        Collection<InetAddress> chosenEndpoints = new BatchlogEndpointSelector(localRack).chooseEndpoints(localEndpoints);
 
+        Collection<InetAddress> chosenEndpoints = new BatchlogManager.EndpointFilter(localRack, localEndpoints).filter();
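+        // EndpointFilter presumably narrows the local DC's endpoints down to a small set of batchlog
+        // replicas, preferring racks other than the coordinator's (assumption based on its usage here)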
         if (chosenEndpoints.isEmpty())
         {
             if (consistencyLevel == ConsistencyLevel.ANY)
@@ -808,7 +811,7 @@
      *
      * @throws OverloadedException if the hints cannot be written/enqueued
      */
-    public static void sendToHintedEndpoints(final RowMutation rm,
+    public static void sendToHintedEndpoints(final Mutation mutation,
                                              Iterable<InetAddress> targets,
                                              AbstractWriteResponseHandler responseHandler,
                                              String localDataCenter)
@@ -817,7 +820,10 @@
         // extra-datacenter replicas, grouped by dc
         Map<String, Collection<InetAddress>> dcGroups = null;
         // only need to create a Message for non-local writes
-        MessageOut<RowMutation> message = null;
+        MessageOut<Mutation> message = null;
+
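+        // remember whether the coordinator itself is one of the targets; the local write is
+        // performed after the loop (see the insertLocal call below) rather than inline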
+        boolean insertLocal = false;
         for (InetAddress destination : targets)
         {
@@ -827,7 +833,7 @@
             // a small number of nodes causing problems, so we should avoid shutting down writes completely to
             // healthy nodes.  Any node with no hintsInProgress is considered healthy.
             if (StorageMetrics.totalHintsInProgress.count() > maxHintsInProgress
-                && (getHintsInProgressFor(destination).get() > 0 && shouldHint(destination)))
+                    && (getHintsInProgressFor(destination).get() > 0 && shouldHint(destination)))
             {
                 throw new OverloadedException("Too many in flight hints: " + StorageMetrics.totalHintsInProgress.count());
             }
@@ -836,21 +842,19 @@
             {
                 if (destination.equals(FBUtilities.getBroadcastAddress()) && OPTIMIZE_LOCAL_REQUESTS)
                 {
-                    insertLocal(rm, responseHandler);
-                }
-                else
+                    insertLocal = true;
+                } else
                 {
                     // belongs on a different server
                     if (message == null)
-                        message = rm.createMessage();
+                        message = mutation.createMessage();
                     String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(destination);
                     // direct writes to local DC or old Cassandra versions
                     // (1.1 knows how to forward old-style String message IDs; updated to int in 2.0)
-                    if (localDataCenter.equals(dc) || MessagingService.instance().getVersion(destination) < MessagingService.VERSION_20)
+                    if (localDataCenter.equals(dc))
                     {
                         MessagingService.instance().sendRR(message, destination, responseHandler, true);
-                    }
-                    else
+                    } else
                     {
                         Collection<InetAddress> messages = (dcGroups != null) ? dcGroups.get(dc) : null;
                         if (messages == null)
@@ -863,22 +867,24 @@
                         messages.add(destination);
                     }
                 }
-            }
-            else
+            } else
             {
                 if (!shouldHint(destination))
                     continue;
 
                 // Schedule a local hint
-                submitHint(rm, destination, responseHandler);
+                submitHint(mutation, destination, responseHandler);
             }
         }
 
+        if (insertLocal)
+            insertLocal(mutation, responseHandler);
+
         if (dcGroups != null)
         {
             // for each datacenter, send the message to one node to relay the write to other replicas
             if (message == null)
-                message = rm.createMessage();
+                message = mutation.createMessage();
 
             for (Collection<InetAddress> dcTargets : dcGroups.values())
                 sendMessagesToNonlocalDC(message, dcTargets, responseHandler);
@@ -897,7 +903,7 @@
         }
     }
 
-    public static Future<Void> submitHint(final RowMutation mutation,
+    public static Future<Void> submitHint(final Mutation mutation,
                                           final InetAddress target,
                                           final AbstractWriteResponseHandler responseHandler)
     {
@@ -916,8 +922,7 @@
                     // Notify the handler only for CL == ANY
                     if (responseHandler != null && responseHandler.consistencyLevel == ConsistencyLevel.ANY)
                         responseHandler.response(null);
-                }
-                else
+                } else
                 {
                     logger.debug("Skipped writing hint for {} (ttl {})", target, ttl);
                 }
@@ -937,7 +942,7 @@
     /**
      * @param now current time in milliseconds - relevant for hint replay handling of truncated CFs
      */
-    public static void writeHintForMutation(RowMutation mutation, long now, int ttl, InetAddress target)
+    public static void writeHintForMutation(Mutation mutation, long now, int ttl, InetAddress target)
     {
         assert ttl > 0;
         UUID hostId = StorageService.instance.getTokenMetadata().getHostId(target);
@@ -969,7 +974,7 @@
                 out.writeInt(id);
                 logger.trace("Adding FWD message to {}@{}", id, destination);
             }
-            message = message.withParameter(RowMutation.FORWARD_TO, out.getData());
+            message = message.withParameter(Mutation.FORWARD_TO, out.getData());
             // send the combined message + forward headers
             int id = MessagingService.instance().sendRR(message, target, handler, true);
             logger.trace("Sending message to {}@{}", id, target);
@@ -981,16 +986,17 @@
         }
     }
 
-    private static void insertLocal(final RowMutation rm, final AbstractWriteResponseHandler responseHandler)
+    private static void insertLocal(final Mutation mutation, final AbstractWriteResponseHandler responseHandler)
     {
-        StageManager.getStage(Stage.MUTATION).execute(new LocalMutationRunnable()
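+        // maybeExecuteImmediately presumably runs the task inline on the calling thread when the
+        // MUTATION stage has spare capacity, rather than always going through the stage's queue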
+        StageManager.getStage(Stage.MUTATION).maybeExecuteImmediately(new LocalMutationRunnable()
         {
             public void runMayThrow()
             {
-                IMutation processed = SinkManager.processWriteRequest(rm);
+                IMutation processed = SinkManager.processWriteRequest(mutation);
                 if (processed != null)
                 {
-                    processed.apply();
+                    ((Mutation) processed).apply();
                     responseHandler.response(null);
                 }
             }
@@ -1072,7 +1078,7 @@
         }
         else
         {
-            return localEndpoints.get(FBUtilities.threadLocalRandom().nextInt(localEndpoints.size()));
+            return localEndpoints.get(ThreadLocalRandom.current().nextInt(localEndpoints.size()));
         }
     }
 
@@ -1095,49 +1101,27 @@
     private static Runnable counterWriteTask(final IMutation mutation,
                                              final Iterable<InetAddress> targets,
                                              final AbstractWriteResponseHandler responseHandler,
-                                             final String localDataCenter,
-                                             final ConsistencyLevel consistency_level)
+                                             final String localDataCenter)
     {
         return new DroppableRunnable(MessagingService.Verb.COUNTER_MUTATION)
         {
-            public void runMayThrow()
+            @Override
+            public void runMayThrow() throws OverloadedException, WriteTimeoutException
             {
                 IMutation processed = SinkManager.processWriteRequest(mutation);
                 if (processed == null)
                     return;
 
                 assert processed instanceof CounterMutation;
-                final CounterMutation cm = (CounterMutation) processed;
+                CounterMutation cm = (CounterMutation) processed;
 
-                // apply mutation
-                cm.apply();
+                Mutation result = cm.apply();
                 responseHandler.response(null);
 
-                // then send to replicas, if any
-                final Set<InetAddress> remotes = Sets.difference(ImmutableSet.copyOf(targets), ImmutableSet.of(FBUtilities.getBroadcastAddress()));
-                if (cm.shouldReplicateOnWrite() && !remotes.isEmpty())
-                {
-                    // We do the replication on another stage because it involves a read (see CM.makeReplicationMutation)
-                    // and we want to avoid blocking too much the MUTATION stage
-                    StageManager.getStage(Stage.REPLICATE_ON_WRITE).execute(new DroppableRunnable(MessagingService.Verb.READ)
-                    {
-                        public void runMayThrow() throws OverloadedException
-                        {
-                            // send the mutation to other replicas, if not null (see CASSANDRA-7144 for details)
-                            RowMutation replicationMutation = cm.makeReplicationMutation();
-                            if (replicationMutation != null)
-                            {
-                                sendToHintedEndpoints(replicationMutation, remotes, responseHandler, localDataCenter);
-                            }
-                            else
-                            {
-                                // simulate the rest of the responses to avoid the timeout
-                                for (int i = 0; i < remotes.size(); i++)
-                                    responseHandler.response(null);
-                            }
-                        }
-                    });
-                }
+                Set<InetAddress> remotes = Sets.difference(ImmutableSet.copyOf(targets),
+                            ImmutableSet.of(FBUtilities.getBroadcastAddress()));
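+                // the counter was already applied on this (leader) replica by cm.apply() above; the returned
+                // Mutation is what gets forwarded to the remaining replicas, like a regular write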
+                if (!remotes.isEmpty())
+                    sendToHintedEndpoints(result, remotes, responseHandler, localDataCenter);
             }
         };
     }
@@ -1425,7 +1409,7 @@
 
         LocalRangeSliceRunnable(AbstractRangeCommand command, ReadCallback<RangeSliceReply, Iterable<Row>> handler)
         {
-            super(MessagingService.Verb.READ);
+            super(MessagingService.Verb.RANGE_SLICE);
             this.command = command;
             this.handler = handler;
         }
@@ -1462,10 +1446,65 @@
         return inter;
     }
 
+    /**
+     * Estimate the number of result rows (either cql3 rows or storage rows, as called for by the command) per
+     * range in the ring based on our local data.  This assumes that ranges are uniformly distributed across the cluster
+     * and that the queried data is also uniformly distributed.
+     */
+    private static float estimateResultRowsPerRange(AbstractRangeCommand command, Keyspace keyspace)
+    {
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.columnFamily);
+        float resultRowsPerRange = Float.POSITIVE_INFINITY;
+        if (command.rowFilter != null && !command.rowFilter.isEmpty())
+        {
+            List<SecondaryIndexSearcher> searchers = cfs.indexManager.getIndexSearchersForQuery(command.rowFilter);
+            if (searchers.isEmpty())
+            {
+                resultRowsPerRange = calculateResultRowsUsingEstimatedKeys(cfs);
+            }
+            else
+            {
+                // Secondary index query (cql3 or otherwise).  Estimate result rows based on most selective 2ary index.
+                for (SecondaryIndexSearcher searcher : searchers)
+                {
+                    // use our own mean column count as our estimate for how many matching rows each node will have
+                    SecondaryIndex highestSelectivityIndex = searcher.highestSelectivityIndex(command.rowFilter);
+                    resultRowsPerRange = Math.min(resultRowsPerRange, highestSelectivityIndex.estimateResultRows());
+                }
+            }
+        }
+        else if (!command.countCQL3Rows())
+        {
+            // non-cql3 query
+            resultRowsPerRange = cfs.estimateKeys();
+        }
+        else
+        {
+            resultRowsPerRange = calculateResultRowsUsingEstimatedKeys(cfs);
+        }
+
+        // adjust resultRowsPerRange by the number of tokens this node has and the replication factor for this ks
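+        // e.g. with 256 vnodes and RF = 3, a local estimate of 7680 rows works out to
+        // 7680 / 256 / 3 = 10 expected result rows per range (illustrative numbers)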
+        return (resultRowsPerRange / DatabaseDescriptor.getNumTokens()) / keyspace.getReplicationStrategy().getReplicationFactor();
+    }
+
+    private static float calculateResultRowsUsingEstimatedKeys(ColumnFamilyStore cfs)
+    {
+        if (cfs.metadata.comparator.isDense())
+        {
+            // one storage row per result row, so use key estimate directly
+            return cfs.estimateKeys();
+        }
+        else
+        {
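+            // sparse (non-dense) layout: each CQL row stores roughly one cell per regular column, so mean
+            // cells per partition divided by the regular column count approximates CQL rows per partition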
+            float resultRowsPerStorageRow = ((float) cfs.getMeanColumns()) / cfs.metadata.regularColumns().size();
+            return resultRowsPerStorageRow * (cfs.estimateKeys());
+        }
+    }
+
     public static List<Row> getRangeSlice(AbstractRangeCommand command, ConsistencyLevel consistency_level)
     throws UnavailableException, ReadTimeoutException
     {
-        Tracing.trace("Determining replicas to query");
+        Tracing.trace("Computing ranges to query");
         long startTime = System.nanoTime();
 
         Keyspace keyspace = Keyspace.open(command.keyspace);
@@ -1484,115 +1523,160 @@
             else
                 ranges = getRestrictedRanges(command.keyRange);
 
+            // our estimate of how many result rows there will be per-range
+            float resultRowsPerRange = estimateResultRowsPerRange(command, keyspace);
+            // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
+            // fetch enough rows in the first round
+            resultRowsPerRange -= resultRowsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
+            int concurrencyFactor = resultRowsPerRange == 0.0
+                                  ? 1
+                                  : Math.max(1, Math.min(ranges.size(), (int) Math.ceil(command.limit() / resultRowsPerRange)));
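+            // e.g. limit() = 100 with ~10 expected rows per range gives ceil(100 / 10) = 10 concurrent
+            // sub-range requests, capped at ranges.size() (illustrative numbers)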
+            logger.debug("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
+                         resultRowsPerRange, command.limit(), ranges.size(), concurrencyFactor);
+            Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)", new Object[]{ ranges.size(), concurrencyFactor, resultRowsPerRange});
+
+            boolean haveSufficientRows = false;
             int i = 0;
             AbstractBounds<RowPosition> nextRange = null;
             List<InetAddress> nextEndpoints = null;
             List<InetAddress> nextFilteredEndpoints = null;
             while (i < ranges.size())
             {
-                AbstractBounds<RowPosition> range = nextRange == null
-                                                  ? ranges.get(i)
-                                                  : nextRange;
-                List<InetAddress> liveEndpoints = nextEndpoints == null
-                                                ? getLiveSortedEndpoints(keyspace, range.right)
-                                                : nextEndpoints;
-                List<InetAddress> filteredEndpoints = nextFilteredEndpoints == null
-                                                    ? consistency_level.filterForQuery(keyspace, liveEndpoints)
-                                                    : nextFilteredEndpoints;
-                ++i;
-
-                // getRestrictedRange has broken the queried range into per-[vnode] token ranges, but this doesn't take
-                // the replication factor into account. If the intersection of live endpoints for 2 consecutive ranges
-                // still meets the CL requirements, then we can merge both ranges into the same RangeSliceCommand.
-                while (i < ranges.size())
+                List<Pair<AbstractRangeCommand, ReadCallback<RangeSliceReply, Iterable<Row>>>> scanHandlers = new ArrayList<>(concurrencyFactor);
+                int concurrentFetchStartingIndex = i;
+                int concurrentRequests = 0;
+                while ((i - concurrentFetchStartingIndex) < concurrencyFactor)
                 {
-                    nextRange = ranges.get(i);
-                    nextEndpoints = getLiveSortedEndpoints(keyspace, nextRange.right);
-                    nextFilteredEndpoints = consistency_level.filterForQuery(keyspace, nextEndpoints);
-
-                    /*
-                     * If the current range right is the min token, we should stop merging because CFS.getRangeSlice
-                     * don't know how to deal with a wrapping range.
-                     * Note: it would be slightly more efficient to have CFS.getRangeSlice on the destination nodes unwraps
-                     * the range if necessary and deal with it. However, we can't start sending wrapped range without breaking
-                     * wire compatibility, so It's likely easier not to bother;
-                     */
-                    if (range.right.isMinimum())
-                        break;
-
-                    List<InetAddress> merged = intersection(liveEndpoints, nextEndpoints);
-
-                    // Check if there is enough endpoint for the merge to be possible.
-                    if (!consistency_level.isSufficientLiveNodes(keyspace, merged))
-                        break;
-
-                    List<InetAddress> filteredMerged = consistency_level.filterForQuery(keyspace, merged);
-
-                    // Estimate whether merging will be a win or not
-                    if (!DatabaseDescriptor.getEndpointSnitch().isWorthMergingForRangeQuery(filteredMerged, filteredEndpoints, nextFilteredEndpoints))
-                        break;
-
-                    // If we get there, merge this range and the next one
-                    range = range.withNewRight(nextRange.right);
-                    liveEndpoints = merged;
-                    filteredEndpoints = filteredMerged;
+                    AbstractBounds<RowPosition> range = nextRange == null
+                                                      ? ranges.get(i)
+                                                      : nextRange;
+                    List<InetAddress> liveEndpoints = nextEndpoints == null
+                                                    ? getLiveSortedEndpoints(keyspace, range.right)
+                                                    : nextEndpoints;
+                    List<InetAddress> filteredEndpoints = nextFilteredEndpoints == null
+                                                        ? consistency_level.filterForQuery(keyspace, liveEndpoints)
+                                                        : nextFilteredEndpoints;
                     ++i;
-                }
+                    ++concurrentRequests;
 
-                AbstractRangeCommand nodeCmd = command.forSubRange(range);
-
-                // collect replies and resolve according to consistency level
-                RangeSliceResponseResolver resolver = new RangeSliceResponseResolver(nodeCmd.keyspace, command.timestamp);
-                List<InetAddress> minimalEndpoints = filteredEndpoints.subList(0, Math.min(filteredEndpoints.size(), consistency_level.blockFor(keyspace)));
-                ReadCallback<RangeSliceReply, Iterable<Row>> handler = new ReadCallback<>(resolver, consistency_level, nodeCmd, minimalEndpoints);
-                handler.assureSufficientLiveNodes();
-                resolver.setSources(filteredEndpoints);
-                if (filteredEndpoints.size() == 1
-                    && filteredEndpoints.get(0).equals(FBUtilities.getBroadcastAddress())
-                    && OPTIMIZE_LOCAL_REQUESTS)
-                {
-                    StageManager.getStage(Stage.READ).execute(new LocalRangeSliceRunnable(nodeCmd, handler));
-                }
-                else
-                {
-                    MessageOut<? extends AbstractRangeCommand> message = nodeCmd.createMessage();
-                    for (InetAddress endpoint : filteredEndpoints)
+                    // getRestrictedRanges has broken the queried range into per-[vnode] token ranges, but this doesn't take
+                    // the replication factor into account. If the intersection of live endpoints for 2 consecutive ranges
+                    // still meets the CL requirements, then we can merge both ranges into the same RangeSliceCommand.
+                    while (i < ranges.size())
                     {
-                        Tracing.trace("Enqueuing request to {}", endpoint);
-                        MessagingService.instance().sendRR(message, endpoint, handler);
+                        nextRange = ranges.get(i);
+                        nextEndpoints = getLiveSortedEndpoints(keyspace, nextRange.right);
+                        nextFilteredEndpoints = consistency_level.filterForQuery(keyspace, nextEndpoints);
+
+                        // If the current range right is the min token, we should stop merging because CFS.getRangeSlice
+                        // doesn't know how to deal with a wrapping range.
+                        // Note: it would be slightly more efficient to have CFS.getRangeSlice on the destination nodes unwrap
+                        // the range if necessary and deal with it. However, we can't start sending wrapped ranges without breaking
+                        // wire compatibility, so it's likely easier not to bother.
+                        if (range.right.isMinimum())
+                            break;
+
+                        List<InetAddress> merged = intersection(liveEndpoints, nextEndpoints);
+
+                        // Check if there are enough endpoints for the merge to be possible.
+                        if (!consistency_level.isSufficientLiveNodes(keyspace, merged))
+                            break;
+
+                        List<InetAddress> filteredMerged = consistency_level.filterForQuery(keyspace, merged);
+
+                        // Estimate whether merging will be a win or not
+                        if (!DatabaseDescriptor.getEndpointSnitch().isWorthMergingForRangeQuery(filteredMerged, filteredEndpoints, nextFilteredEndpoints))
+                            break;
+
+                        // If we get here, merge this range and the next one
+                        range = range.withNewRight(nextRange.right);
+                        liveEndpoints = merged;
+                        filteredEndpoints = filteredMerged;
+                        ++i;
+                    }
+
+                    AbstractRangeCommand nodeCmd = command.forSubRange(range);
+
+                    // collect replies and resolve according to consistency level
+                    RangeSliceResponseResolver resolver = new RangeSliceResponseResolver(nodeCmd.keyspace, command.timestamp);
+                    List<InetAddress> minimalEndpoints = filteredEndpoints.subList(0, Math.min(filteredEndpoints.size(), consistency_level.blockFor(keyspace)));
+                    ReadCallback<RangeSliceReply, Iterable<Row>> handler = new ReadCallback<>(resolver, consistency_level, nodeCmd, minimalEndpoints);
+                    handler.assureSufficientLiveNodes();
+                    resolver.setSources(filteredEndpoints);
+                    if (filteredEndpoints.size() == 1
+                        && filteredEndpoints.get(0).equals(FBUtilities.getBroadcastAddress())
+                        && OPTIMIZE_LOCAL_REQUESTS)
+                    {
+                        StageManager.getStage(Stage.READ).execute(new LocalRangeSliceRunnable(nodeCmd, handler), Tracing.instance.get());
+                    }
+                    else
+                    {
+                        MessageOut<? extends AbstractRangeCommand> message = nodeCmd.createMessage();
+                        for (InetAddress endpoint : filteredEndpoints)
+                        {
+                            Tracing.trace("Enqueuing request to {}", endpoint);
+                            MessagingService.instance().sendRR(message, endpoint, handler);
+                        }
+                    }
+                    scanHandlers.add(Pair.create(nodeCmd, handler));
+                }
+                Tracing.trace("Submitted {} concurrent range requests covering {} ranges", concurrentRequests, i - concurrentFetchStartingIndex);
+
+                List<AsyncOneResponse> repairResponses = new ArrayList<>();
+                for (Pair<AbstractRangeCommand, ReadCallback<RangeSliceReply, Iterable<Row>>> cmdPairHandler : scanHandlers)
+                {
+                    AbstractRangeCommand nodeCmd = cmdPairHandler.left;
+                    ReadCallback<RangeSliceReply, Iterable<Row>> handler = cmdPairHandler.right;
+                    RangeSliceResponseResolver resolver = (RangeSliceResponseResolver)handler.resolver;
+
+                    try
+                    {
+                        for (Row row : handler.get())
+                        {
+                            rows.add(row);
+                            if (nodeCmd.countCQL3Rows())
+                                cql3RowCount += row.getLiveCount(command.predicate, command.timestamp);
+                        }
+                        repairResponses.addAll(resolver.repairResults);
+                    }
+                    catch (ReadTimeoutException ex)
+                    {
+                        // we timed out waiting for responses
+                        int blockFor = consistency_level.blockFor(keyspace);
+                        int responseCount = resolver.responses.size();
+                        String gotData = responseCount > 0
+                                         ? resolver.isDataPresent() ? " (including data)" : " (only digests)"
+                                         : "";
+
+                        if (Tracing.isTracing())
+                        {
+                            Tracing.trace("Timed out; received {} of {} responses{} for range {} of {}",
+                                          new Object[]{ responseCount, blockFor, gotData, i, ranges.size() });
+                        }
+                        else if (logger.isDebugEnabled())
+                        {
+                            logger.debug("Range slice timeout; received {} of {} responses{} for range {} of {}",
+                                         responseCount, blockFor, gotData, i, ranges.size());
+                        }
+                        throw ex;
+                    }
+                    catch (DigestMismatchException e)
+                    {
+                        throw new AssertionError(e); // no digests in range slices yet
+                    }
+
+                    // if we're done, great, otherwise, move to the next range
+                    int count = nodeCmd.countCQL3Rows() ? cql3RowCount : rows.size();
+                    if (count >= nodeCmd.limit())
+                    {
+                        haveSufficientRows = true;
+                        break;
                     }
                 }
 
                 try
                 {
-                    for (Row row : handler.get())
-                    {
-                        rows.add(row);
-                        if (nodeCmd.countCQL3Rows())
-                            cql3RowCount += row.getLiveCount(command.predicate, command.timestamp);
-                    }
-                    FBUtilities.waitOnFutures(resolver.repairResults, DatabaseDescriptor.getWriteRpcTimeout());
-                }
-                catch (ReadTimeoutException ex)
-                {
-                    // we timed out waiting for responses
-                    int blockFor = consistency_level.blockFor(keyspace);
-                    int responseCount = resolver.responses.size();
-                    String gotData = responseCount > 0
-                                     ? resolver.isDataPresent() ? " (including data)" : " (only digests)"
-                                     : "";
-
-                    if (Tracing.isTracing())
-                    {
-                        Tracing.trace("Timed out; received {} of {} responses{} for range {} of {}",
-                                new Object[]{ responseCount, blockFor, gotData, i, ranges.size() });
-                    }
-                    else if (logger.isDebugEnabled())
-                    {
-                        logger.debug("Range slice timeout; received {} of {} responses{} for range {} of {}",
-                                responseCount, blockFor, gotData, i, ranges.size());
-                    }
-                    throw ex;
+                    FBUtilities.waitOnFutures(repairResponses, DatabaseDescriptor.getWriteRpcTimeout());
                 }
                 catch (TimeoutException ex)
                 {
@@ -1604,15 +1688,31 @@
                         logger.debug("Range slice timeout while read-repairing after receiving all {} data and digest responses", blockFor);
                     throw new ReadTimeoutException(consistency_level, blockFor-1, blockFor, true);
                 }
-                catch (DigestMismatchException e)
-                {
-                    throw new AssertionError(e); // no digests in range slices yet
-                }
 
-                // if we're done, great, otherwise, move to the next range
-                int count = nodeCmd.countCQL3Rows() ? cql3RowCount : rows.size();
-                if (count >= nodeCmd.limit())
-                    break;
+                if (haveSufficientRows)
+                    return trim(command, rows);
+
+                // we didn't get enough rows in our concurrent fetch; recalculate our concurrency factor
+                // based on the results we've seen so far (as long as we still have ranges left to query)
+                if (i < ranges.size())
+                {
+                    float fetchedRows = command.countCQL3Rows() ? cql3RowCount : rows.size();
+                    float remainingRows = command.limit() - fetchedRows;
+                    float actualRowsPerRange;
+                    if (fetchedRows == 0.0)
+                    {
+                        // we haven't actually gotten any results, so query all remaining ranges at once
+                        actualRowsPerRange = 0.0f;
+                        concurrencyFactor = ranges.size() - i;
+                    }
+                    else
+                    {
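+                        // observed rate: fetchedRows rows came from the i ranges queried so far, so the
+                        // number of additional ranges to query is roughly remainingRows / actualRowsPerRange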
+                        actualRowsPerRange = fetchedRows / i;
+                        concurrencyFactor = Math.max(1, Math.min(ranges.size() - i, Math.round(remainingRows / actualRowsPerRange)));
+                    }
+                    logger.debug("Didn't get enough response rows; actual rows per range: {}; remaining rows: {}, new concurrent requests: {}",
+                                 actualRowsPerRange, (int) remainingRows, concurrencyFactor);
+                }
             }
         }
         finally
@@ -1869,7 +1969,7 @@
         if (DatabaseDescriptor.shouldHintByDC())
         {
             final String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(ep);
             // Disable DC specific hints
             if(!DatabaseDescriptor.hintedHandoffEnabled(dc))
             {
                 HintedHandOffManager.instance.metrics.incrPastWindow(ep);
@@ -1948,7 +2048,11 @@
     
     public interface WritePerformer
     {
-        public void apply(IMutation mutation, Iterable<InetAddress> targets, AbstractWriteResponseHandler responseHandler, String localDataCenter, ConsistencyLevel consistency_level) throws OverloadedException;
+        public void apply(IMutation mutation,
+                          Iterable<InetAddress> targets,
+                          AbstractWriteResponseHandler responseHandler,
+                          String localDataCenter,
+                          ConsistencyLevel consistencyLevel) throws OverloadedException;
     }
 
     /**
@@ -1966,17 +2070,16 @@
 
         public final void run()
         {
+
             if (TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - constructionTime) > DatabaseDescriptor.getTimeout(verb))
             {
                 MessagingService.instance().incrementDroppedMessages(verb);
                 return;
             }
-
             try
             {
                 runMayThrow();
-            }
-            catch (Exception e)
+            } catch (Exception e)
             {
                 throw new RuntimeException(e);
             }
@@ -2090,6 +2193,9 @@
     public Long getWriteRpcTimeout() { return DatabaseDescriptor.getWriteRpcTimeout(); }
     public void setWriteRpcTimeout(Long timeoutInMillis) { DatabaseDescriptor.setWriteRpcTimeout(timeoutInMillis); }
 
+    public Long getCounterWriteRpcTimeout() { return DatabaseDescriptor.getCounterWriteRpcTimeout(); }
+    public void setCounterWriteRpcTimeout(Long timeoutInMillis) { DatabaseDescriptor.setCounterWriteRpcTimeout(timeoutInMillis); }
+
     public Long getCasContentionTimeout() { return DatabaseDescriptor.getCasContentionTimeout(); }
     public void setCasContentionTimeout(Long timeoutInMillis) { DatabaseDescriptor.setCasContentionTimeout(timeoutInMillis); }
 
diff --git a/src/java/org/apache/cassandra/service/StorageProxyMBean.java b/src/java/org/apache/cassandra/service/StorageProxyMBean.java
index 203cabe..a04b660 100644
--- a/src/java/org/apache/cassandra/service/StorageProxyMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageProxyMBean.java
@@ -88,6 +88,8 @@
     public void setReadRpcTimeout(Long timeoutInMillis);
     public Long getWriteRpcTimeout();
     public void setWriteRpcTimeout(Long timeoutInMillis);
+    public Long getCounterWriteRpcTimeout();
+    public void setCounterWriteRpcTimeout(Long timeoutInMillis);
     public Long getCasContentionTimeout();
     public void setCasContentionTimeout(Long timeoutInMillis);
     public Long getRangeRpcTimeout();
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 56056ab..4fb0435 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -29,26 +29,30 @@
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
+
+import javax.management.JMX;
 import javax.management.MBeanServer;
 import javax.management.Notification;
 import javax.management.NotificationBroadcasterSupport;
 import javax.management.ObjectName;
+import javax.management.openmbean.TabularData;
+import javax.management.openmbean.TabularDataSupport;
+
+import ch.qos.logback.classic.LoggerContext;
+import ch.qos.logback.classic.jmx.JMXConfiguratorMBean;
+import ch.qos.logback.classic.spi.ILoggingEvent;
+import ch.qos.logback.core.Appender;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
-import com.google.common.util.concurrent.AtomicDouble;
 import com.google.common.util.concurrent.FutureCallback;
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.Uninterruptibles;
 
-import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.log4j.Level;
-import org.apache.log4j.LogManager;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.auth.Auth;
 import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
 import org.apache.cassandra.concurrent.Stage;
@@ -59,6 +63,7 @@
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
@@ -139,6 +144,8 @@
 
     public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
 
+    private Thread drainOnShutdown = null;
+
     public static final StorageService instance = new StorageService();
 
     public static IPartitioner getPartitioner()
@@ -151,11 +158,16 @@
         return getRangesForEndpoint(keyspaceName, FBUtilities.getBroadcastAddress());
     }
 
-    public Collection<Range<Token>> getLocalPrimaryRanges(String keyspace)
+    public Collection<Range<Token>> getPrimaryRanges(String keyspace)
     {
         return getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddress());
     }
 
+    public Collection<Range<Token>> getPrimaryRangesWithinDC(String keyspace)
+    {
+        return getPrimaryRangeForEndpointWithinDC(keyspace, FBUtilities.getBroadcastAddress());
+    }
+
     private final Set<InetAddress> replicatingNodes = Collections.synchronizedSet(new HashSet<InetAddress>());
     private CassandraDaemon daemon;
 
@@ -183,7 +195,7 @@
 
     private static final AtomicInteger nextRepairCommand = new AtomicInteger();
 
-    private final List<IEndpointLifecycleSubscriber> lifecycleSubscribers = new CopyOnWriteArrayList<IEndpointLifecycleSubscriber>();
+    private final List<IEndpointLifecycleSubscriber> lifecycleSubscribers = new CopyOnWriteArrayList<>();
 
     private static final BackgroundActivityMonitor bgMonitor = new BackgroundActivityMonitor();
 
@@ -226,7 +238,7 @@
         }
 
         /* register the verb handlers */
-        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.MUTATION, new RowMutationVerbHandler());
+        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.MUTATION, new MutationVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ_REPAIR, new ReadRepairVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ, new ReadVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.RANGE_SLICE, new RangeSliceVerbHandler());
@@ -328,7 +340,7 @@
         {
             throw new IllegalStateException("No configured daemon");
         }
-        
+
         try
         {
             daemon.nativeServer.start();
@@ -426,10 +438,10 @@
             if (Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()).getApplicationState(ApplicationState.TOKENS) == null)
                 throw new RuntimeException("Could not find tokens for " + DatabaseDescriptor.getReplaceAddress() + " to replace");
             Collection<Token> tokens = TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(getApplicationStateValue(DatabaseDescriptor.getReplaceAddress(), ApplicationState.TOKENS))));
-            
+
             SystemKeyspace.setLocalHostId(hostId); // use the replacee's host Id as our own so we receive hints, etc
             Gossiper.instance.resetEndpointStateMap(); // clean up since we have what we need
-            return tokens;        
+            return tokens;
         }
         catch (IOException e)
         {
@@ -450,6 +462,20 @@
                                                      "Use cassandra.replace_address if you want to replace this node.",
                                                      FBUtilities.getBroadcastAddress()));
         }
+        if (RangeStreamer.useStrictConsistency)
+        {
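+            // with cassandra.consistent.rangemovement enabled, refuse to join while any other node
+            // is bootstrapping, leaving, or moving, since concurrent range movements would not be consistent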
+            for (Map.Entry<InetAddress, EndpointState> entry : Gossiper.instance.getEndpointStates())
+            {
+                if (entry.getValue().getApplicationState(ApplicationState.STATUS) == null)
+                    continue;
+                String[] pieces = entry.getValue().getApplicationState(ApplicationState.STATUS).value.split(VersionedValue.DELIMITER_STR, -1);
+                assert (pieces.length > 0);
+                String state = pieces[0];
+                if (state.equals(VersionedValue.STATUS_BOOTSTRAPPING) || state.equals(VersionedValue.STATUS_LEAVING) || state.equals(VersionedValue.STATUS_MOVING))
+                    throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
+            }
+        }
         Gossiper.instance.resetEndpointStateMap();
     }
 
@@ -505,9 +531,9 @@
 
     public synchronized void initServer(int delay) throws ConfigurationException
     {
-        logger.info("Cassandra version: " + FBUtilities.getReleaseVersionString());
-        logger.info("Thrift API version: " + cassandraConstants.VERSION);
-        logger.info("CQL supported versions: " + StringUtils.join(ClientState.getCQLSupportedVersion(), ",") + " (default: " + ClientState.DEFAULT_CQL_VERSION + ")");
+        logger.info("Cassandra version: {}", FBUtilities.getReleaseVersionString());
+        logger.info("Thrift API version: {}", cassandraConstants.VERSION);
+        logger.info("CQL supported versions: {} (default: {})", StringUtils.join(ClientState.getCQLSupportedVersion(), ","), ClientState.DEFAULT_CQL_VERSION);
 
         if (initialized)
         {
@@ -518,10 +544,12 @@
         initialized = true;
         isClientMode = false;
 
-        // Ensure StorageProxy is initialized on start-up; see CASSANDRA-3797.
         try
         {
+            // Ensure StorageProxy is initialized on start-up; see CASSANDRA-3797.
             Class.forName("org.apache.cassandra.service.StorageProxy");
+            // also IndexSummaryManager, which is otherwise unreferenced
+            Class.forName("org.apache.cassandra.io.sstable.IndexSummaryManager");
         }
         catch (ClassNotFoundException e)
         {
@@ -550,20 +578,15 @@
             }
         }
 
-        if (Boolean.parseBoolean(System.getProperty("cassandra.renew_counter_id", "false")))
-        {
-            logger.info("Renewing local node id (as requested)");
-            CounterId.renewLocalId();
-        }
-
         // daemon threads, like our executors', continue to run while shutdown hooks are invoked
-        Thread drainOnShutdown = new Thread(new WrappedRunnable()
+        drainOnShutdown = new Thread(new WrappedRunnable()
         {
             @Override
-            public void runMayThrow() throws ExecutionException, InterruptedException, IOException
+            public void runMayThrow() throws InterruptedException
             {
+                ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
                 ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
-                if (mutationStage.isShutdown())
+                if (mutationStage.isShutdown() && counterMutationStage.isShutdown())
                     return; // drained already
 
                 if (daemon != null)
@@ -574,11 +597,13 @@
                 // In-progress writes originating here could generate hints to be written, so shut down MessagingService
                 // before mutation stage, so we can get all the hints saved before shutting down
                 MessagingService.instance().shutdown();
+                counterMutationStage.shutdown();
                 mutationStage.shutdown();
+                counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
                 mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
                 StorageProxy.instance.verifyNoHintsInProgress();
 
-                List<Future<?>> flushes = new ArrayList<Future<?>>();
+                List<Future<?>> flushes = new ArrayList<>();
                 for (Keyspace keyspace : Keyspace.all())
                 {
                     KSMetaData ksm = Schema.instance.getKSMetaData(keyspace.getName());
@@ -592,10 +617,11 @@
                 {
                     FBUtilities.waitOnFutures(flushes);
                 }
-                catch (Throwable e)
+                catch (Throwable t)
                 {
+                    JVMStabilityInspector.inspectThrowable(t);
                     // don't let this stop us from shutting down the commitlog and other thread pools
-                    logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", e);
+                    logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", t);
                 }
 
                 CommitLog.instance.shutdownBlocking();
@@ -609,6 +635,12 @@
         Runtime.getRuntime().addShutdownHook(drainOnShutdown);
 
         prepareToJoin();
+
+        // Has to be called after the host id has potentially changed in prepareToJoin().
+        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
+            if (cfs.metadata.isCounter())
+                cfs.initCounterCache();
+
         if (Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true")))
         {
             joinTokenRing(delay);
@@ -629,6 +661,15 @@
         }
     }
 
+    /**
+     * In the event of forceful termination (for instance on OOM) we need to remove the shutdown hook to prevent it from hanging the process
+     */
+    public void removeShutdownHook()
+    {
+        if (drainOnShutdown != null)
+            Runtime.getRuntime().removeShutdownHook(drainOnShutdown);
+    }
+
     private boolean shouldBootstrap()
     {
         return DatabaseDescriptor.isAutoBootstrap() && !SystemKeyspace.bootstrapComplete() && !DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress());
@@ -638,7 +679,7 @@
     {
         if (!joined)
         {
-            Map<ApplicationState, VersionedValue> appStates = new HashMap<ApplicationState, VersionedValue>();
+            Map<ApplicationState, VersionedValue> appStates = new HashMap<>();
 
             if (DatabaseDescriptor.isReplacing() && !(Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true"))))
                 throw new ConfigurationException("Cannot set both join_ring=false and attempt to replace a node");
@@ -658,6 +699,7 @@
             {
                 checkForEndpointCollision();
             }
+
             // have to start the gossip service before we can see any info on other nodes.  this is necessary
             // for bootstrap to get the load info it needs.
             // (we won't be part of the storage ring though until we add a counterId to our state, below.)
@@ -666,7 +708,7 @@
             getTokenMetadata().updateHostId(localHostId, FBUtilities.getBroadcastAddress());
             appStates.put(ApplicationState.NET_VERSION, valueFactory.networkVersion());
             appStates.put(ApplicationState.HOST_ID, valueFactory.hostId(localHostId));
-            appStates.put(ApplicationState.RPC_ADDRESS, valueFactory.rpcaddress(DatabaseDescriptor.getRpcAddress()));
+            appStates.put(ApplicationState.RPC_ADDRESS, valueFactory.rpcaddress(DatabaseDescriptor.getBroadcastRpcAddress()));
             appStates.put(ApplicationState.RELEASE_VERSION, valueFactory.releaseVersion());
             logger.info("Starting up server gossip");
             Gossiper.instance.register(this);
@@ -676,7 +718,6 @@
             // gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
             Schema.instance.updateVersionAndAnnounce(); // Ensure we know our own actual Schema UUID in preparation for updates
 
-
             if (!MessagingService.instance().isListening())
                 MessagingService.instance().listen(FBUtilities.getLocalAddress());
             LoadBroadcaster.instance.startBroadcasting();
@@ -699,7 +740,7 @@
         //
         // We attempted to replace this with a schema-presence check, but you need a meaningful sleep
         // to get schema info from gossip which defeats the purpose.  See CASSANDRA-4427 for the gory details.
-        Set<InetAddress> current = new HashSet<InetAddress>();
+        Set<InetAddress> current = new HashSet<>();
         logger.debug("Bootstrap variables: {} {} {} {}",
                      DatabaseDescriptor.isAutoBootstrap(),
                      SystemKeyspace.bootstrapInProgress(),
@@ -741,6 +782,14 @@
             if (logger.isDebugEnabled())
                 logger.debug("... got ring + schema info");
 
+            if (Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement", "true")) &&
+                    (
+                        tokenMetadata.getBootstrapTokens().valueSet().size() > 0 ||
+                        tokenMetadata.getLeavingEndpoints().size() > 0 ||
+                        tokenMetadata.getMovingEndpoints().size() > 0
+                    ))
+                throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
+
             if (!DatabaseDescriptor.isReplacing())
             {
                 if (tokenMetadata.isMember(FBUtilities.getBroadcastAddress()))
@@ -836,7 +885,7 @@
         if (Schema.instance.getKSMetaData(Tracing.TRACE_KS) == null)
         {
             KSMetaData tracingKeyspace = KSMetaData.traceKeyspace();
-            MigrationManager.announceNewKeyspace(tracingKeyspace, 0);
+            MigrationManager.announceNewKeyspace(tracingKeyspace, 0, false);
         }
 
         if (!isSurveyMode)
@@ -1035,7 +1084,7 @@
     public Map<List<String>, List<String>> getRangeToEndpointMap(String keyspace)
     {
         /* All the ranges for the tokens */
-        Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
+        Map<List<String>, List<String>> map = new HashMap<>();
         for (Map.Entry<Range<Token>,List<InetAddress>> entry : getRangeToAddressMap(keyspace).entrySet())
         {
             map.put(entry.getKey().asList(), stringify(entry.getValue()));
@@ -1051,7 +1100,7 @@
     public String getRpcaddress(InetAddress endpoint)
     {
         if (endpoint.equals(FBUtilities.getBroadcastAddress()))
-            return DatabaseDescriptor.getRpcAddress().getHostAddress();
+            return DatabaseDescriptor.getBroadcastRpcAddress().getHostAddress();
         else if (Gossiper.instance.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.RPC_ADDRESS) == null)
             return endpoint.getHostAddress();
         else
@@ -1066,10 +1115,10 @@
     public Map<List<String>, List<String>> getRangeToRpcaddressMap(String keyspace)
     {
         /* All the ranges for the tokens */
-        Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
+        Map<List<String>, List<String>> map = new HashMap<>();
         for (Map.Entry<Range<Token>, List<InetAddress>> entry : getRangeToAddressMap(keyspace).entrySet())
         {
-            List<String> rpcaddrs = new ArrayList<String>(entry.getValue().size());
+            List<String> rpcaddrs = new ArrayList<>(entry.getValue().size());
             for (InetAddress endpoint: entry.getValue())
             {
                 rpcaddrs.add(getRpcaddress(endpoint));
@@ -1086,10 +1135,10 @@
         if (keyspace == null)
             keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
 
-        Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
+        Map<List<String>, List<String>> map = new HashMap<>();
         for (Map.Entry<Range<Token>, Collection<InetAddress>> entry : tokenMetadata.getPendingRanges(keyspace).entrySet())
         {
-            List<InetAddress> l = new ArrayList<InetAddress>(entry.getValue());
+            List<InetAddress> l = new ArrayList<>(entry.getValue());
             map.put(entry.getKey().asList(), stringify(l));
         }
         return map;
@@ -1170,7 +1219,7 @@
         {
             throw new IOException(e.getMessage());
         }
-        List<String> result = new ArrayList<String>(tokenRanges.size());
+        List<String> result = new ArrayList<>(tokenRanges.size());
 
         for (TokenRange tokenRange : tokenRanges)
             result.add(tokenRange.toString());
@@ -1208,7 +1257,7 @@
         if (keyspace == null || Keyspace.open(keyspace).getReplicationStrategy() instanceof LocalStrategy)
             throw new InvalidRequestException("There is no ring for the keyspace: " + keyspace);
 
-        List<TokenRange> ranges = new ArrayList<TokenRange>();
+        List<TokenRange> ranges = new ArrayList<>();
         Token.TokenFactory tf = getPartitioner().getTokenFactory();
 
         Map<Range<Token>, List<InetAddress>> rangeToAddressMap =
@@ -1220,9 +1269,9 @@
         {
             Range range = entry.getKey();
             List<InetAddress> addresses = entry.getValue();
-            List<String> endpoints = new ArrayList<String>(addresses.size());
-            List<String> rpc_endpoints = new ArrayList<String>(addresses.size());
-            List<EndpointDetails> epDetails = new ArrayList<EndpointDetails>(addresses.size());
+            List<String> endpoints = new ArrayList<>(addresses.size());
+            List<String> rpc_endpoints = new ArrayList<>(addresses.size());
+            List<EndpointDetails> epDetails = new ArrayList<>(addresses.size());
 
             for (InetAddress endpoint : addresses)
             {
@@ -1251,8 +1300,8 @@
     {
         Map<Token, InetAddress> mapInetAddress = tokenMetadata.getNormalAndBootstrappingTokenToEndpointMap();
         // in order to preserve tokens in ascending order, we use LinkedHashMap here
-        Map<String, String> mapString = new LinkedHashMap<String, String>(mapInetAddress.size());
-        List<Token> tokens = new ArrayList<Token>(mapInetAddress.keySet());
+        Map<String, String> mapString = new LinkedHashMap<>(mapInetAddress.size());
+        List<Token> tokens = new ArrayList<>(mapInetAddress.keySet());
         Collections.sort(tokens);
         for (Token token : tokens)
         {
@@ -1268,7 +1317,7 @@
 
     public Map<String, String> getHostIdMap()
     {
-        Map<String, String> mapOut = new HashMap<String, String>();
+        Map<String, String> mapOut = new HashMap<>();
         for (Map.Entry<InetAddress, UUID> entry : getTokenMetadata().getEndpointToHostIdMapForReading().entrySet())
             mapOut.put(entry.getKey().getHostAddress(), entry.getValue().toString());
         return mapOut;
@@ -1282,7 +1331,7 @@
     */
     private Map<Range<Token>, List<InetAddress>> constructRangeToEndpointMap(String keyspace, List<Range<Token>> ranges)
     {
-        Map<Range<Token>, List<InetAddress>> rangeToEndpointMap = new HashMap<Range<Token>, List<InetAddress>>();
+        Map<Range<Token>, List<InetAddress>> rangeToEndpointMap = new HashMap<>();
         for (Range<Token> range : ranges)
         {
             rangeToEndpointMap.put(range, Keyspace.open(keyspace).getReplicationStrategy().getNaturalEndpoints(range.right));
@@ -1337,18 +1386,28 @@
 
             String moveName = pieces[0];
 
-            if (moveName.equals(VersionedValue.STATUS_BOOTSTRAPPING))
-                handleStateBootstrap(endpoint, pieces);
-            else if (moveName.equals(VersionedValue.STATUS_NORMAL))
-                handleStateNormal(endpoint, pieces);
-            else if (moveName.equals(VersionedValue.REMOVING_TOKEN) || moveName.equals(VersionedValue.REMOVED_TOKEN))
-                handleStateRemoving(endpoint, pieces);
-            else if (moveName.equals(VersionedValue.STATUS_LEAVING))
-                handleStateLeaving(endpoint, pieces);
-            else if (moveName.equals(VersionedValue.STATUS_LEFT))
-                handleStateLeft(endpoint, pieces);
-            else if (moveName.equals(VersionedValue.STATUS_MOVING))
-                handleStateMoving(endpoint, pieces);
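+            // tokens for bootstrap/normal/leaving states are read from the TOKENS application state via getTokensFor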
+            switch (moveName)
+            {
+                case VersionedValue.STATUS_BOOTSTRAPPING:
+                    handleStateBootstrap(endpoint);
+                    break;
+                case VersionedValue.STATUS_NORMAL:
+                    handleStateNormal(endpoint);
+                    break;
+                case VersionedValue.REMOVING_TOKEN:
+                case VersionedValue.REMOVED_TOKEN:
+                    handleStateRemoving(endpoint, pieces);
+                    break;
+                case VersionedValue.STATUS_LEAVING:
+                    handleStateLeaving(endpoint);
+                    break;
+                case VersionedValue.STATUS_LEFT:
+                    handleStateLeft(endpoint, pieces);
+                    break;
+                case VersionedValue.STATUS_MOVING:
+                    handleStateMoving(endpoint, pieces);
+                    break;
+            }
         }
         else
         {
@@ -1362,23 +1421,30 @@
             switch (state)
             {
                 case RELEASE_VERSION:
-                    SystemKeyspace.updatePeerInfo(endpoint, "release_version", quote(value.value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "release_version", value.value);
                     break;
                 case DC:
-                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", quote(value.value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", value.value);
                     break;
                 case RACK:
-                    SystemKeyspace.updatePeerInfo(endpoint, "rack", quote(value.value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "rack", value.value);
                     break;
                 case RPC_ADDRESS:
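+                    // rpc_address is stored as a typed InetAddress value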
-                    SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", quote(value.value));
+                    try
+                    {
+                        SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(value.value));
+                    }
+                    catch (UnknownHostException e)
+                    {
+                        throw new RuntimeException(e);
+                    }
                     break;
                 case SCHEMA:
-                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", value.value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(value.value));
                     MigrationManager.instance.scheduleSchemaPull(endpoint, epState);
                     break;
                 case HOST_ID:
-                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", value.value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(value.value));
                     break;
             }
         }
@@ -1392,74 +1458,65 @@
             switch (entry.getKey())
             {
                 case RELEASE_VERSION:
-                    SystemKeyspace.updatePeerInfo(endpoint, "release_version", quote(entry.getValue().value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "release_version", entry.getValue().value);
                     break;
                 case DC:
-                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", quote(entry.getValue().value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", entry.getValue().value);
                     break;
                 case RACK:
-                    SystemKeyspace.updatePeerInfo(endpoint, "rack", quote(entry.getValue().value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "rack", entry.getValue().value);
                     break;
                 case RPC_ADDRESS:
-                    SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", quote(entry.getValue().value));
+                    try
+                    {
+                        SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(entry.getValue().value));
+                    }
+                    catch (UnknownHostException e)
+                    {
+                        throw new RuntimeException(e);
+                    }
                     break;
                 case SCHEMA:
-                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", entry.getValue().value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(entry.getValue().value));
                     break;
                 case HOST_ID:
-                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", entry.getValue().value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(entry.getValue().value));
                     break;
             }
         }
     }
 
-    private String quote(String value)
-    {
-        return "'" + value + "'";
-    }
-
     private byte[] getApplicationStateValue(InetAddress endpoint, ApplicationState appstate)
     {
         String vvalue = Gossiper.instance.getEndpointStateForEndpoint(endpoint).getApplicationState(appstate).value;
         return vvalue.getBytes(ISO_8859_1);
     }
 
-    private Collection<Token> getTokensFor(InetAddress endpoint, String piece)
+    private Collection<Token> getTokensFor(InetAddress endpoint)
     {
-        if (Gossiper.instance.usesVnodes(endpoint))
+        try
         {
-            try
-            {
-                return TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(getApplicationStateValue(endpoint, ApplicationState.TOKENS))));
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
-            }
+            return TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(getApplicationStateValue(endpoint, ApplicationState.TOKENS))));
         }
-        else
-            return Arrays.asList(getPartitioner().getTokenFactory().fromString(piece));
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
     }
 
     /**
      * Handle node bootstrap
      *
      * @param endpoint bootstrapping node
-     * @param pieces STATE_BOOTSTRAPPING,bootstrap token as string
      */
-    private void handleStateBootstrap(InetAddress endpoint, String[] pieces)
+    private void handleStateBootstrap(InetAddress endpoint)
     {
-        assert pieces.length >= 2;
-
-        // Parse versioned values according to end-point version:
-        //   versions  < 1.2 .....: STATUS,TOKEN
-        //   versions >= 1.2 .....: use TOKENS app state
         Collection<Token> tokens;
         // explicitly check for TOKENS, because a bootstrapping node might be bootstrapping in legacy mode; that is, not using vnodes and no token specified
-        tokens = getTokensFor(endpoint, pieces[1]);
+        tokens = getTokensFor(endpoint);
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " state bootstrapping, token " + tokens);
+            logger.debug("Node {} state bootstrapping, token {}", endpoint, tokens);
 
         // if this node is present in token metadata, either we have missed intermediate states
         // or the node had crashed. Print warning if needed, clear obsolete stuff and
@@ -1472,7 +1529,7 @@
             // common (not enough time for gossip to spread). Therefore we report only the
             // former in the log.
             if (!tokenMetadata.isLeaving(endpoint))
-                logger.info("Node " + endpoint + " state jump to bootstrap");
+                logger.info("Node {} state jump to bootstrap", endpoint);
             tokenMetadata.removeEndpoint(endpoint);
         }
 
@@ -1488,31 +1545,24 @@
      * in reads.
      *
      * @param endpoint node
-     * @param pieces STATE_NORMAL,token
      */
-    private void handleStateNormal(final InetAddress endpoint, String[] pieces)
+    private void handleStateNormal(final InetAddress endpoint)
     {
-        assert pieces.length >= 2;
-
-        // Parse versioned values according to end-point version:
-        //   versions  < 1.2 .....: STATUS,TOKEN
-        //   versions >= 1.2 .....: uses HOST_ID/TOKENS app states
-
         Collection<Token> tokens;
 
-        tokens = getTokensFor(endpoint, pieces[1]);
+        tokens = getTokensFor(endpoint);
 
-        Set<Token> tokensToUpdateInMetadata = new HashSet<Token>();
-        Set<Token> tokensToUpdateInSystemKeyspace = new HashSet<Token>();
-        Set<Token> localTokensToRemove = new HashSet<Token>();
-        Set<InetAddress> endpointsToRemove = new HashSet<InetAddress>();
+        Set<Token> tokensToUpdateInMetadata = new HashSet<>();
+        Set<Token> tokensToUpdateInSystemKeyspace = new HashSet<>();
+        Set<Token> localTokensToRemove = new HashSet<>();
+        Set<InetAddress> endpointsToRemove = new HashSet<>();
 
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " state normal, token " + tokens);
+            logger.debug("Node {} state normal, token {}", endpoint, tokens);
 
         if (tokenMetadata.isMember(endpoint))
-            logger.info("Node " + endpoint + " state jump to normal");
+            logger.info("Node {} state jump to normal", endpoint);
 
         updatePeerInfo(endpoint);
         // Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300).
@@ -1557,7 +1607,7 @@
             InetAddress currentOwner = tokenMetadata.getEndpoint(token);
             if (currentOwner == null)
             {
-                logger.debug("New node " + endpoint + " at token " + token);
+                logger.debug("New node {} at token {}", endpoint, token);
                 tokensToUpdateInMetadata.add(token);
                 if (!isClientMode)
                     tokensToUpdateInSystemKeyspace.add(token);
@@ -1624,28 +1674,26 @@
      * Handle node preparing to leave the ring
      *
      * @param endpoint node
-     * @param pieces STATE_LEAVING,token
      */
-    private void handleStateLeaving(InetAddress endpoint, String[] pieces)
+    private void handleStateLeaving(InetAddress endpoint)
     {
-        assert pieces.length >= 2;
         Collection<Token> tokens;
-        tokens = getTokensFor(endpoint, pieces[1]);
+        tokens = getTokensFor(endpoint);
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " state leaving, tokens " + tokens);
+            logger.debug("Node {} state leaving, tokens {}", endpoint, tokens);
 
         // If the node is previously unknown or tokens do not match, update tokenmetadata to
         // have this node as 'normal' (it must have been using this token before the
         // leave). This way we'll get pending ranges right.
         if (!tokenMetadata.isMember(endpoint))
         {
-            logger.info("Node " + endpoint + " state jump to leaving");
+            logger.info("Node {} state jump to leaving", endpoint);
             tokenMetadata.updateNormalTokens(tokens, endpoint);
         }
         else if (!tokenMetadata.getTokens(endpoint).containsAll(tokens))
         {
-            logger.warn("Node " + endpoint + " 'leaving' token mismatch. Long network partition?");
+            logger.warn("Node {} 'leaving' token mismatch. Long network partition?", endpoint);
             tokenMetadata.updateNormalTokens(tokens, endpoint);
         }
 
@@ -1665,10 +1713,10 @@
     {
         assert pieces.length >= 2;
         Collection<Token> tokens;
-        tokens = getTokensFor(endpoint, pieces[1]);
+        tokens = getTokensFor(endpoint);
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " state left, tokens " + tokens);
+            logger.debug("Node {} state left, tokens {}", endpoint, tokens);
 
         excise(tokens, endpoint, extractExpireTime(pieces));
     }
@@ -1685,7 +1733,7 @@
         Token token = getPartitioner().getTokenFactory().fromString(pieces[1]);
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " state moving, new token " + token);
+            logger.debug("Node {} state moving, new token {}", endpoint, token);
 
         tokenMetadata.addMovingEndpoint(token, endpoint);
 
@@ -1727,7 +1775,7 @@
             else if (VersionedValue.REMOVING_TOKEN.equals(state))
             {
                 if (logger.isDebugEnabled())
-                    logger.debug("Tokens " + removeTokens + " removed manually (endpoint was " + endpoint + ")");
+                    logger.debug("Tokens {} removed manually (endpoint was {})", removeTokens, endpoint);
 
                 // Note that the endpoint is being removed
                 tokenMetadata.addLeavingEndpoint(endpoint);
@@ -1750,7 +1798,7 @@
 
     private void excise(Collection<Token> tokens, InetAddress endpoint)
     {
-        logger.info("Removing tokens " + tokens + " for " + endpoint);
+        logger.info("Removing tokens {} for {}", tokens, endpoint);
         HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
         removeEndpoint(endpoint);
         tokenMetadata.removeEndpoint(endpoint);
@@ -1837,7 +1885,7 @@
         MessageOut msg = new MessageOut(MessagingService.Verb.REPLICATION_FINISHED);
         IFailureDetector failureDetector = FailureDetector.instance;
         if (logger.isDebugEnabled())
-            logger.debug("Notifying " + remote.toString() + " of replication completion\n");
+            logger.debug("Notifying {} of replication completion\n", remote);
         while (failureDetector.isAlive(remote))
         {
             AsyncOneResponse iar = MessagingService.instance().sendRR(msg, remote);
@@ -1867,12 +1915,12 @@
     {
         Multimap<String, Map.Entry<InetAddress, Collection<Range<Token>>>> rangesToFetch = HashMultimap.create();
 
-        final InetAddress myAddress = FBUtilities.getBroadcastAddress();
+        InetAddress myAddress = FBUtilities.getBroadcastAddress();
 
         for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
         {
             Multimap<Range<Token>, InetAddress> changedRanges = getChangedRangesForLeaving(keyspaceName, endpoint);
-            Set<Range<Token>> myNewRanges = new HashSet<Range<Token>>();
+            Set<Range<Token>> myNewRanges = new HashSet<>();
             for (Map.Entry<Range<Token>, InetAddress> entry : changedRanges.entries())
             {
                 if (entry.getValue().equals(myAddress))
@@ -1886,14 +1934,14 @@
         }
 
         StreamPlan stream = new StreamPlan("Restore replica count");
-        for (final String keyspaceName : rangesToFetch.keySet())
+        for (String keyspaceName : rangesToFetch.keySet())
         {
             for (Map.Entry<InetAddress, Collection<Range<Token>>> entry : rangesToFetch.get(keyspaceName))
             {
-                final InetAddress source = entry.getKey();
+                InetAddress source = entry.getKey();
                 Collection<Range<Token>> ranges = entry.getValue();
                 if (logger.isDebugEnabled())
-                    logger.debug("Requesting from " + source + " ranges " + StringUtils.join(ranges, ", "));
+                    logger.debug("Requesting from {} ranges {}", source, StringUtils.join(ranges, ", "));
                 stream.requestRanges(source, keyspaceName, ranges);
             }
         }
@@ -1921,9 +1969,9 @@
         Collection<Range<Token>> ranges = getRangesForEndpoint(keyspaceName, endpoint);
 
         if (logger.isDebugEnabled())
-            logger.debug("Node " + endpoint + " ranges [" + StringUtils.join(ranges, ", ") + "]");
+            logger.debug("Node {} ranges [{}]", endpoint, StringUtils.join(ranges, ", "));
 
-        Map<Range<Token>, List<InetAddress>> currentReplicaEndpoints = new HashMap<Range<Token>, List<InetAddress>>();
+        Map<Range<Token>, List<InetAddress>> currentReplicaEndpoints = new HashMap<>();
 
         // Find (for each range) all nodes that store replicas for these ranges as well
         TokenMetadata metadata = tokenMetadata.cloneOnlyTokenMap(); // don't do this in the loop! #7758
@@ -1950,9 +1998,9 @@
             newReplicaEndpoints.removeAll(currentReplicaEndpoints.get(range));
             if (logger.isDebugEnabled())
                 if (newReplicaEndpoints.isEmpty())
-                    logger.debug("Range " + range + " already in all replicas");
+                    logger.debug("Range {} already in all replicas", range);
                 else
-                    logger.debug("Range " + range + " will be responsibility of " + StringUtils.join(newReplicaEndpoints, ", "));
+                    logger.debug("Range {} will be responsibility of {}", range, StringUtils.join(newReplicaEndpoints, ", "));
             changedRanges.putAll(range, newReplicaEndpoints);
         }
 
@@ -2033,7 +2081,7 @@
 
     public Map<String, String> getLoadMap()
     {
-        Map<String, String> map = new HashMap<String, String>();
+        Map<String, String> map = new HashMap<>();
         for (Map.Entry<InetAddress,Double> entry : LoadBroadcaster.instance.getLoadInfo().entrySet())
         {
             map.put(entry.getKey().getHostAddress(), FileUtils.stringifyFileSize(entry.getValue()));
@@ -2069,7 +2117,7 @@
 
     private List<String> getTokens(InetAddress endpoint)
     {
-        List<String> strTokens = new ArrayList<String>();
+        List<String> strTokens = new ArrayList<>();
         for (Token tok : getTokenMetadata().getTokens(endpoint))
             strTokens.add(tok.toString());
         return strTokens;
@@ -2092,7 +2140,7 @@
 
     public List<String> getMovingNodes()
     {
-        List<String> endpoints = new ArrayList<String>();
+        List<String> endpoints = new ArrayList<>();
 
         for (Pair<Token, InetAddress> node : tokenMetadata.getMovingEndpoints())
         {
@@ -2137,7 +2185,7 @@
 
     private List<String> stringify(Iterable<InetAddress> endpoints)
     {
-        List<String> stringEndpoints = new ArrayList<String>();
+        List<String> stringEndpoints = new ArrayList<>();
         for (InetAddress ep : endpoints)
         {
             stringEndpoints.add(ep.getHostAddress());
@@ -2150,28 +2198,43 @@
         return Gossiper.instance.getCurrentGenerationNumber(FBUtilities.getBroadcastAddress());
     }
 
-    public void forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
         if (keyspaceName.equals(Keyspace.SYSTEM_KS))
             throw new RuntimeException("Cleanup of the system keyspace is neither necessary nor wise");
 
-        CounterId.OneShotRenewer counterIdRenewer = new CounterId.OneShotRenewer();
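+        // start from SUCCESSFUL and keep the last non-successful status so the returned code reflects any failure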
+        CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
         for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
         {
-            cfStore.forceCleanup(counterIdRenewer);
+            CompactionManager.AllSSTableOpStatus oneStatus = cfStore.forceCleanup();
+            if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
+                status = oneStatus;
         }
+        return status.statusCode;
     }
 
-    public void scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
+        CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
         for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
-            cfStore.scrub(disableSnapshot, skipCorrupted);
+        {
+            CompactionManager.AllSSTableOpStatus oneStatus = cfStore.scrub(disableSnapshot, skipCorrupted);
+            if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
+                status = oneStatus;
+        }
+        return status.statusCode;
     }
 
-    public void upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
+        CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
         for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, true, keyspaceName, columnFamilies))
-            cfStore.sstablesRewrite(excludeCurrentVersion);
+        {
+            CompactionManager.AllSSTableOpStatus oneStatus = cfStore.sstablesRewrite(excludeCurrentVersion);
+            if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
+                status = oneStatus;
+        }
+        return status.statusCode;
     }
 
     public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
@@ -2202,7 +2265,7 @@
         }
         else
         {
-            ArrayList<Keyspace> t = new ArrayList<Keyspace>(keyspaceNames.length);
+            ArrayList<Keyspace> t = new ArrayList<>(keyspaceNames.length);
             for (String keyspaceName : keyspaceNames)
                 t.add(getValidKeyspace(keyspaceName));
             keyspaces = t;
@@ -2284,11 +2347,55 @@
             logger.debug("Cleared out snapshot directories");
     }
 
+    public Map<String, TabularData> getSnapshotDetails()
+    {
+        Map<String, TabularData> snapshotMap = new HashMap<>();
+        for (Keyspace keyspace : Keyspace.all())
+        {
+            if (Keyspace.SYSTEM_KS.equals(keyspace.getName()))
+                continue;
+
+            for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
+            {
+                for (Map.Entry<String, Pair<Long,Long>> snapshotDetail : cfStore.getSnapshotDetails().entrySet())
+                {
+                    TabularDataSupport data = (TabularDataSupport)snapshotMap.get(snapshotDetail.getKey());
+                    if (data == null)
+                    {
+                        data = new TabularDataSupport(SnapshotDetailsTabularData.TABULAR_TYPE);
+                        snapshotMap.put(snapshotDetail.getKey(), data);
+                    }
+
+                    SnapshotDetailsTabularData.from(snapshotDetail.getKey(), keyspace.getName(), cfStore.getColumnFamilyName(), snapshotDetail, data);
+                }
+            }
+        }
+        return snapshotMap;
+    }
+
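+    /**
+     * @return total size of snapshots across all non-system keyspaces, as reported by each ColumnFamilyStore
+     */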
+    public long trueSnapshotsSize()
+    {
+        long total = 0;
+        for (Keyspace keyspace : Keyspace.all())
+        {
+            if (Keyspace.SYSTEM_KS.equals(keyspace.getName()))
+                continue;
+
+            for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
+            {
+                total += cfStore.trueSnapshotsSize();
+            }
+        }
+
+        return total;
+    }
+
     /**
      * @param allowIndexes Allow index CF names to be passed in
      * @param autoAddIndexes Automatically add secondary indexes if a CF has them
      * @param keyspaceName keyspace
      * @param cfNames CFs
+     * @throws java.lang.IllegalArgumentException when given CF name does not exist
      */
     public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes, boolean autoAddIndexes, String keyspaceName, String... cfNames) throws IOException
     {
@@ -2335,15 +2442,9 @@
             }
 
             ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(baseCfName);
-            if (cfStore == null)
-            {
-                // this means there was a cf passed in that is not recognized in the keyspace. report it and continue.
-                logger.warn(String.format("Invalid column family specified: %s. Proceeding with others.", baseCfName));
-                continue;
-            }
             if (idxName != null)
             {
-                Collection< SecondaryIndex > indexes = cfStore.indexManager.getIndexesByNames(new HashSet<String>(Arrays.asList(cfName)));
+                Collection< SecondaryIndex > indexes = cfStore.indexManager.getIndexesByNames(new HashSet<>(Arrays.asList(cfName)));
                 if (indexes.isEmpty())
                     logger.warn(String.format("Invalid column family index specified: %s/%s. Proceeding with others.", baseCfName, idxName));
                 else
@@ -2373,11 +2474,11 @@
      * @param columnFamilies
      * @throws IOException
      */
-    public void forceKeyspaceFlush(final String keyspaceName, final String... columnFamilies) throws IOException
+    public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException
     {
         for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
         {
-            logger.debug("Forcing flush on keyspace " + keyspaceName + ", CF " + cfStore.name);
+            logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, cfStore.name);
             cfStore.forceBlockingFlush();
         }
     }
@@ -2396,115 +2497,92 @@
         sendNotification(jmxNotification);
     }
 
-    public int forceRepairAsync(final String keyspace, final boolean isSequential, final Collection<String> dataCenters, final Collection<String> hosts, final boolean primaryRange, final String... columnFamilies)
+    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean fullRepair, String... columnFamilies) throws IOException
     {
-        // when repairing only primary range, dataCenter nor hosts can be set
-        if (primaryRange && (dataCenters != null || hosts != null))
+        Collection<Range<Token>> ranges;
+        if (primaryRange)
         {
-            throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
+            // when repairing only primary range, neither dataCenters nor hosts can be set
+            if (dataCenters == null && hosts == null)
+                ranges = getPrimaryRanges(keyspace);
+            // except dataCenters only contain local DC (i.e. -local)
+            // except when dataCenters contains only the local DC (i.e. -local)
+                ranges = getPrimaryRangesWithinDC(keyspace);
+            else
+                throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
         }
-        final Collection<Range<Token>> ranges = primaryRange ? getLocalPrimaryRanges(keyspace) : getLocalRanges(keyspace);
-        return forceRepairAsync(keyspace, isSequential, dataCenters, hosts, ranges, columnFamilies);
+        else
+        {
+            ranges = getLocalRanges(keyspace);
+        }
+
+        return forceRepairAsync(keyspace, isSequential, dataCenters, hosts, ranges, fullRepair, columnFamilies);
     }
 
-    public int forceRepairAsync(final String keyspace, final boolean isSequential, final Collection<String> dataCenters, final Collection<String> hosts,  final Collection<Range<Token>> ranges, final String... columnFamilies)
+    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, Collection<Range<Token>> ranges, boolean fullRepair, String... columnFamilies)
     {
         if (ranges.isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
             return 0;
 
-        final int cmd = nextRepairCommand.incrementAndGet();
+        int cmd = nextRepairCommand.incrementAndGet();
         if (ranges.size() > 0)
         {
-            new Thread(createRepairTask(cmd, keyspace, ranges, isSequential, dataCenters, hosts, columnFamilies)).start();
+            if (!FBUtilities.isUnix() && isSequential)
+            {
+                logger.warn("Snapshot-based repair is not yet supported on Windows.  Reverting to parallel repair.");
+                isSequential = false;
+            }
+            new Thread(createRepairTask(cmd, keyspace, ranges, isSequential, dataCenters, hosts, fullRepair, columnFamilies)).start();
         }
         return cmd;
     }
 
-    public int forceRepairAsync(final String keyspace, final boolean isSequential, final boolean isLocal, final boolean primaryRange, final String... columnFamilies)
+    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, boolean fullRepair, String... columnFamilies)
     {
-        // when repairing only primary range, you cannot repair only on local DC
-        if (primaryRange && isLocal)
+        Collection<Range<Token>> ranges;
+        if (primaryRange)
         {
-            throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
+            ranges = isLocal ? getPrimaryRangesWithinDC(keyspace) : getPrimaryRanges(keyspace);
         }
-        final Collection<Range<Token>> ranges = primaryRange ? getLocalPrimaryRanges(keyspace) : getLocalRanges(keyspace);
-        return forceRepairAsync(keyspace, isSequential, isLocal, ranges, columnFamilies);
+        else
+        {
+            ranges = getLocalRanges(keyspace);
+        }
+
+        return forceRepairAsync(keyspace, isSequential, isLocal, ranges, fullRepair, columnFamilies);
     }
 
-    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, Collection<Range<Token>> ranges, String... columnFamilies)
+    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, Collection<Range<Token>> ranges, boolean fullRepair, String... columnFamilies)
     {
         if (ranges.isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
             return 0;
 
-        final int cmd = nextRepairCommand.incrementAndGet();
+        int cmd = nextRepairCommand.incrementAndGet();
         if (!FBUtilities.isUnix() && isSequential)
         {
             logger.warn("Snapshot-based repair is not yet supported on Windows.  Reverting to parallel repair.");
             isSequential = false;
         }
-        new Thread(createRepairTask(cmd, keyspace, ranges, isSequential, isLocal, columnFamilies)).start();
+        new Thread(createRepairTask(cmd, keyspace, ranges, isSequential, isLocal, fullRepair, columnFamilies)).start();
         return cmd;
     }
 
-    public int forceRepairRangeAsync(String beginToken, String endToken, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts, final String... columnFamilies)
-    {
-        Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
-
-        logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
-                    repairingRange, keyspaceName, columnFamilies);
-
-        if (!FBUtilities.isUnix() && isSequential)
-        {
-            logger.warn("Snapshot-based repair is not yet supported on Windows.  Reverting to parallel repair.");
-            isSequential = false;
-        }
-        return forceRepairAsync(keyspaceName, isSequential, dataCenters, hosts, repairingRange, columnFamilies);
-    }
-
-    public int forceRepairRangeAsync(String beginToken, String endToken, final String keyspaceName, boolean isSequential, boolean isLocal, final String... columnFamilies)
-    {
-        Set<String> dataCenters = null;
-        if (isLocal)
-        {
-            dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
-        }
-        return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential, dataCenters, null, columnFamilies);
-    }
-
-    /**
-     * Trigger proactive repair for a keyspace and column families.
-     */
-    public void forceKeyspaceRepair(final String keyspaceName, boolean isSequential, boolean isLocal, final String... columnFamilies) throws IOException
-    {
-        forceKeyspaceRepairRange(keyspaceName, getLocalRanges(keyspaceName), isSequential, isLocal, columnFamilies);
-    }
-
-    public void forceKeyspaceRepairPrimaryRange(final String keyspaceName, boolean isSequential, boolean isLocal, final String... columnFamilies) throws IOException
-    {
-        // primary range repair can only be performed for whole cluster.
-        // NOTE: we should omit the param but keep API as is for now.
-        if (isLocal)
-        {
-            throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
-        }
-
-        forceKeyspaceRepairRange(keyspaceName, getLocalPrimaryRanges(keyspaceName), isSequential, false, columnFamilies);
-    }
-
-    public void forceKeyspaceRepairRange(String beginToken, String endToken, final String keyspaceName, boolean isSequential, boolean isLocal, final String... columnFamilies) throws IOException
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... columnFamilies) throws IOException
     {
         Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
 
         logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
                            repairingRange, keyspaceName, columnFamilies);
-        forceKeyspaceRepairRange(keyspaceName, repairingRange, isSequential, isLocal, columnFamilies);
+        return forceRepairAsync(keyspaceName, isSequential, dataCenters, hosts, repairingRange, fullRepair, columnFamilies);
     }
 
-    public void forceKeyspaceRepairRange(final String keyspaceName, final Collection<Range<Token>> ranges, boolean isSequential, boolean isLocal, final String... columnFamilies) throws IOException
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, boolean fullRepair, String... columnFamilies)
     {
-        if (ranges.isEmpty() || Keyspace.open(keyspaceName).getReplicationStrategy().getReplicationFactor() < 2)
-            return;
-        createRepairTask(nextRepairCommand.incrementAndGet(), keyspaceName, ranges, isSequential, isLocal, columnFamilies).run();
+        Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
+
+        logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
+                           repairingRange, keyspaceName, columnFamilies);
+        return forceRepairAsync(keyspaceName, isSequential, isLocal, repairingRange, fullRepair, columnFamilies);
     }
 
     /**
@@ -2546,17 +2624,30 @@
         return repairingRange;
     }
 
-    private FutureTask<Object> createRepairTask(final int cmd, final String keyspace, final Collection<Range<Token>> ranges, final boolean isSequential, final boolean isLocal, final String... columnFamilies)
+    private FutureTask<Object> createRepairTask(int cmd,
+                                                String keyspace,
+                                                Collection<Range<Token>> ranges,
+                                                boolean isSequential,
+                                                boolean isLocal,
+                                                boolean fullRepair,
+                                                String... columnFamilies)
     {
         Set<String> dataCenters = null;
         if (isLocal)
         {
             dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
         }
-        return createRepairTask(cmd, keyspace, ranges, isSequential, dataCenters, null, columnFamilies);
+        return createRepairTask(cmd, keyspace, ranges, isSequential, dataCenters, null, fullRepair, columnFamilies);
     }
 
-    private FutureTask<Object> createRepairTask(final int cmd, final String keyspace, final Collection<Range<Token>> ranges, final boolean isSequential, final Collection<String> dataCenters, final Collection<String> hosts, final String... columnFamilies)
+    private FutureTask<Object> createRepairTask(final int cmd,
+                                                final String keyspace,
+                                                final Collection<Range<Token>> ranges,
+                                                final boolean isSequential,
+                                                final Collection<String> dataCenters,
+                                                final Collection<String> hosts,
+                                                final boolean fullRepair,
+                                                final String... columnFamilies)
     {
         if (dataCenters != null && !dataCenters.contains(DatabaseDescriptor.getLocalDataCenter()))
         {
@@ -2567,24 +2658,71 @@
         {
             protected void runMayThrow() throws Exception
             {
-                String message = String.format("Starting repair command #%d, repairing %d ranges for keyspace %s", cmd, ranges.size(), keyspace);
+                String message = String.format("Starting repair command #%d, repairing %d ranges for keyspace %s (seq=%b, full=%b)", cmd, ranges.size(), keyspace, isSequential, fullRepair);
                 logger.info(message);
                 sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.STARTED.ordinal()});
 
-                List<RepairFuture> futures = new ArrayList<>(ranges.size());
+                if (isSequential && !fullRepair)
+                {
+                    message = "It is not possible to mix sequential repair and incremental repairs.";
+                    logger.error(message);
+                    sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
+                    return;
+                }
+
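+                // resolve the neighbors of every range up front so an invalid range or host fails the whole command before any session starts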
+                Set<InetAddress> allNeighbors = new HashSet<>();
+                Map<Range, Set<InetAddress>> rangeToNeighbors = new HashMap<>();
                 for (Range<Token> range : ranges)
                 {
-                    RepairFuture future;
                     try
                     {
-                        future = forceKeyspaceRepair(range, keyspace, isSequential, dataCenters, hosts, columnFamilies);
+                        Set<InetAddress> neighbors = ActiveRepairService.getNeighbors(keyspace, range, dataCenters, hosts);
+                        rangeToNeighbors.put(range, neighbors);
+                        allNeighbors.addAll(neighbors);
                     }
                     catch (IllegalArgumentException e)
                     {
-                        logger.error("Repair session failed:", e);
-                        sendNotification("repair", e.getMessage(), new int[]{cmd, ActiveRepairService.Status.SESSION_FAILED.ordinal()});
-                        continue;
+                        logger.error("Repair failed:", e);
+                        sendNotification("repair", e.getMessage(), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
+                        return;
                     }
+                }
+
+                // Validate columnfamilies
+                List<ColumnFamilyStore> columnFamilyStores = new ArrayList<>();
+                try
+                {
+                    Iterables.addAll(columnFamilyStores, getValidColumnFamilies(false, false, keyspace, columnFamilies));
+                }
+                catch (IllegalArgumentException e)
+                {
+                    sendNotification("repair", e.getMessage(), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
+                    return;
+                }
+
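+                // incremental repairs are coordinated through a parent session that must first be prepared on every neighbor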
+                UUID parentSession = null;
+                if (!fullRepair)
+                {
+                    try
+                    {
+                        parentSession = ActiveRepairService.instance.prepareForRepair(allNeighbors, ranges, columnFamilyStores);
+                    }
+                    catch (Throwable t)
+                    {
+                        sendNotification("repair", String.format("Repair failed with error %s", t.getMessage()), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
+                        return;
+                    }
+                }
+
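+                // submit one repair session per range against the validated column families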
+                List<RepairFuture> futures = new ArrayList<>(ranges.size());
+                String[] cfnames = new String[columnFamilyStores.size()];
+                for (int i = 0; i < columnFamilyStores.size(); i++)
+                {
+                    cfnames[i] = columnFamilyStores.get(i).name;
+                }
+                for (Range<Token> range : ranges)
+                {
+                    RepairFuture future = ActiveRepairService.instance.submitRepairSession(parentSession, range, keyspace, isSequential, rangeToNeighbors.get(range), cfnames);
                     if (future == null)
                         continue;
                     futures.add(future);
@@ -2600,6 +2738,8 @@
                         sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.SESSION_FAILED.ordinal()});
                     }
                 }
+
+                boolean successful = true;
                 for (RepairFuture future : futures)
                 {
                     try
@@ -2611,39 +2751,26 @@
                     }
                     catch (ExecutionException e)
                     {
+                        successful = false;
                         message = String.format("Repair session %s for range %s failed with error %s", future.session.getId(), future.session.getRange().toString(), e.getCause().getMessage());
                         logger.error(message, e);
                         sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.SESSION_FAILED.ordinal()});
                     }
                     catch (Exception e)
                     {
+                        successful = false;
                         message = String.format("Repair session %s for range %s failed with error %s", future.session.getId(), future.session.getRange().toString(), e.getMessage());
                         logger.error(message, e);
                         sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.SESSION_FAILED.ordinal()});
                     }
                 }
+                if (!fullRepair)
+                    ActiveRepairService.instance.finishParentSession(parentSession, allNeighbors, successful);
                 sendNotification("repair", String.format("Repair command #%d finished", cmd), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
             }
         }, null);
     }
 
-    public RepairFuture forceKeyspaceRepair(final Range<Token> range, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, final String... columnFamilies) throws IOException
-    {
-        ArrayList<String> names = new ArrayList<String>();
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
-        {
-            names.add(cfStore.name);
-        }
-
-        if (names.isEmpty())
-        {
-            logger.info("No column family to repair for keyspace " + keyspaceName);
-            return null;
-        }
-
-        return ActiveRepairService.instance.submitRepairSession(range, keyspaceName, isSequential, dataCenters, hosts, names.toArray(new String[names.size()]));
-    }
-
     public void forceTerminateAllRepairSessions() {
         ActiveRepairService.instance.terminateSessions();
     }
@@ -2656,39 +2783,57 @@
      * The node that stores replica primarily is defined as the first node returned
      * by {@link AbstractReplicationStrategy#calculateNaturalEndpoints}.
      *
-     * @param keyspace
+     * @param keyspace Keyspace name to check primary ranges
      * @param ep endpoint we are interested in.
      * @return primary ranges for the specified endpoint.
      */
     public Collection<Range<Token>> getPrimaryRangesForEndpoint(String keyspace, InetAddress ep)
     {
         AbstractReplicationStrategy strategy = Keyspace.open(keyspace).getReplicationStrategy();
-        Collection<Range<Token>> primaryRanges = new HashSet<Range<Token>>();
+        Collection<Range<Token>> primaryRanges = new HashSet<>();
         TokenMetadata metadata = tokenMetadata.cloneOnlyTokenMap();
         for (Token token : metadata.sortedTokens())
         {
             List<InetAddress> endpoints = strategy.calculateNaturalEndpoints(token, metadata);
             if (endpoints.size() > 0 && endpoints.get(0).equals(ep))
-                primaryRanges.add(new Range<Token>(metadata.getPredecessor(token), token));
+                primaryRanges.add(new Range<>(metadata.getPredecessor(token), token));
         }
         return primaryRanges;
     }
 
     /**
-     * Previously, primary range is the range that the node is responsible for and calculated
-     * only from the token assigned to the node.
-     * But this does not take replication strategy into account, and therefore returns insufficient
-     * range especially using NTS with replication only to certain DC(see CASSANDRA-5424).
+     * Get the "primary ranges" within local DC for the specified keyspace and endpoint.
      *
-     * @deprecated
-     * @param ep endpoint we are interested in.
-     * @return range for the specified endpoint.
+     * @see #getPrimaryRangesForEndpoint(String, java.net.InetAddress)
+     * @param keyspace Keyspace name to check primary ranges
+     * @param referenceEndpoint endpoint we are interested in.
+     * @return primary ranges within local DC for the specified endpoint.
      */
-    @Deprecated
-    @VisibleForTesting
-    public Range<Token> getPrimaryRangeForEndpoint(InetAddress ep)
+    public Collection<Range<Token>> getPrimaryRangeForEndpointWithinDC(String keyspace, InetAddress referenceEndpoint)
     {
-        return tokenMetadata.getPrimaryRangeFor(tokenMetadata.getToken(ep));
+        TokenMetadata metadata = tokenMetadata.cloneOnlyTokenMap();
+        String localDC = DatabaseDescriptor.getEndpointSnitch().getDatacenter(referenceEndpoint);
+        Collection<InetAddress> localDcNodes = metadata.getTopology().getDatacenterEndpoints().get(localDC);
+        AbstractReplicationStrategy strategy = Keyspace.open(keyspace).getReplicationStrategy();
+
+        Collection<Range<Token>> localDCPrimaryRanges = new HashSet<>();
+        for (Token token : metadata.sortedTokens())
+        {
+            List<InetAddress> endpoints = strategy.calculateNaturalEndpoints(token, metadata);
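+            // the DC-local primary replica is the first natural endpoint that belongs to the local datacenter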
+            for (InetAddress endpoint : endpoints)
+            {
+                if (localDcNodes.contains(endpoint))
+                {
+                    if (endpoint.equals(referenceEndpoint))
+                    {
+                        localDCPrimaryRanges.add(new Range<>(metadata.getPredecessor(token), token));
+                    }
+                    break;
+                }
+            }
+        }
+
+        return localDCPrimaryRanges;
     }
 
     /**
@@ -2710,18 +2855,18 @@
     public List<Range<Token>> getAllRanges(List<Token> sortedTokens)
     {
         if (logger.isDebugEnabled())
-            logger.debug("computing ranges for " + StringUtils.join(sortedTokens, ", "));
+            logger.debug("computing ranges for {}", StringUtils.join(sortedTokens, ", "));
 
         if (sortedTokens.isEmpty())
             return Collections.emptyList();
         int size = sortedTokens.size();
-        List<Range<Token>> ranges = new ArrayList<Range<Token>>(size + 1);
+        List<Range<Token>> ranges = new ArrayList<>(size + 1);
         for (int i = 1; i < size; ++i)
         {
-            Range<Token> range = new Range<Token>(sortedTokens.get(i - 1), sortedTokens.get(i));
+            Range<Token> range = new Range<>(sortedTokens.get(i - 1), sortedTokens.get(i));
             ranges.add(range);
         }
-        Range<Token> range = new Range<Token>(sortedTokens.get(size - 1), sortedTokens.get(0));
+        Range<Token> range = new Range<>(sortedTokens.get(size - 1), sortedTokens.get(0));
         ranges.add(range);
 
         return ranges;
@@ -2776,7 +2921,7 @@
     public List<InetAddress> getLiveNaturalEndpoints(Keyspace keyspace, RingPosition pos)
     {
         List<InetAddress> endpoints = keyspace.getReplicationStrategy().getNaturalEndpoints(pos);
-        List<InetAddress> liveEps = new ArrayList<InetAddress>(endpoints.size());
+        List<InetAddress> liveEps = new ArrayList<>(endpoints.size());
 
         for (InetAddress endpoint : endpoints)
         {
@@ -2787,26 +2932,29 @@
         return liveEps;
     }
 
-    public void setLog4jLevel(String classQualifier, String rawLevel)
+    public void setLoggingLevel(String classQualifier, String rawLevel) throws Exception
     {
-        org.apache.log4j.Logger log4jlogger = org.apache.log4j.Logger.getLogger(classQualifier);
+        ch.qos.logback.classic.Logger logBackLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(classQualifier);
+
         // if both classQualifer and rawLevel are empty, reload from configuration
-        if (StringUtils.isBlank(classQualifier) && StringUtils.isBlank(rawLevel))
+        if (StringUtils.isBlank(classQualifier) && StringUtils.isBlank(rawLevel) )
         {
-            LogManager.resetConfiguration();
-            CassandraDaemon.initLog4j();
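+            // ask logback's JMXConfigurator to re-read the default configuration, restoring the configured levels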
+            JMXConfiguratorMBean jmxConfiguratorMBean = JMX.newMBeanProxy(ManagementFactory.getPlatformMBeanServer(),
+                    new ObjectName("ch.qos.logback.classic:Name=default,Type=ch.qos.logback.classic.jmx.JMXConfigurator"),
+                    JMXConfiguratorMBean.class);
+            jmxConfiguratorMBean.reloadDefaultConfiguration();
             return;
         }
         // classQualifer is set, but blank level given
-        else if (StringUtils.isNotBlank(classQualifier) && StringUtils.isBlank(rawLevel))
+        else if (StringUtils.isNotBlank(classQualifier) && StringUtils.isBlank(rawLevel) )
         {
-            if (log4jlogger.getLevel() != null || log4jlogger.getAllAppenders().hasMoreElements())
-                log4jlogger.setLevel(null);
+            if (logBackLogger.getLevel() != null || hasAppenders(logBackLogger))
+                logBackLogger.setLevel(null);
             return;
         }
 
-        Level level = Level.toLevel(rawLevel);
-        log4jlogger.setLevel(level);
+        ch.qos.logback.classic.Level level = ch.qos.logback.classic.Level.toLevel(rawLevel);
+        logBackLogger.setLevel(level);
         logger.info("set log level to {} for classes under '{}' (if the level doesn't look like '{}' then the logger couldn't parse '{}')", level, classQualifier, rawLevel, rawLevel);
     }
 
@@ -2814,55 +2962,55 @@
      * @return the runtime logging levels for all the configured loggers
      */
     @Override
-    public Map<String,String> getLoggingLevels()
-    {
+    public Map<String,String> getLoggingLevels()
+    {
         Map<String, String> logLevelMaps = Maps.newLinkedHashMap();
-        org.apache.log4j.Logger rootLogger = org.apache.log4j.Logger.getRootLogger();
-        logLevelMaps.put(rootLogger.getName(), rootLogger.getLevel().toString());
-        Enumeration<org.apache.log4j.Logger> loggers = LogManager.getCurrentLoggers();
-        while (loggers.hasMoreElements())
+        LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory();
+        for (ch.qos.logback.classic.Logger logger : lc.getLoggerList())
         {
-            org.apache.log4j.Logger logger= loggers.nextElement();
-            if (logger.getLevel() != null)
+            if (logger.getLevel() != null || hasAppenders(logger))
                 logLevelMaps.put(logger.getName(), logger.getLevel().toString());
         }
         return logLevelMaps;
     }
 
+    private boolean hasAppenders(ch.qos.logback.classic.Logger logger)
+    {
+        Iterator<Appender<ILoggingEvent>> it = logger.iteratorForAppenders();
+        return it.hasNext();
+    }
+
     /**
      * @return list of Token ranges (_not_ keys!) together with estimated key count,
      *      breaking up the data this node is responsible for into pieces of roughly keysPerSplit
      */
-    public List<Pair<Range<Token>, Long>> getSplits(String keyspaceName, String cfName, Range<Token> range, int keysPerSplit, CFMetaData metadata)
+    public List<Pair<Range<Token>, Long>> getSplits(String keyspaceName, String cfName, Range<Token> range, int keysPerSplit)
     {
         Keyspace t = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = t.getColumnFamilyStore(cfName);
         List<DecoratedKey> keys = keySamples(Collections.singleton(cfs), range);
 
-        final long totalRowCountEstimate = (keys.size() + 1) * metadata.getIndexInterval();
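+        // estimate the row count for the range directly from the column family store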
+        long totalRowCountEstimate = cfs.estimatedKeysForRange(range);
 
         // splitCount should be much smaller than number of key samples, to avoid huge sampling error
-        final int minSamplesPerSplit = 4;
-        final int maxSplitCount = keys.size() / minSamplesPerSplit + 1;
-        final int splitCount = Math.max(1, Math.min(maxSplitCount, (int)(totalRowCountEstimate / keysPerSplit)));
+        int minSamplesPerSplit = 4;
+        int maxSplitCount = keys.size() / minSamplesPerSplit + 1;
+        int splitCount = Math.max(1, Math.min(maxSplitCount, (int)(totalRowCountEstimate / keysPerSplit)));
 
         List<Token> tokens = keysToTokens(range, keys);
-        return getSplits(tokens, splitCount, metadata);
+        return getSplits(tokens, splitCount, cfs);
     }
 
-    private List<Pair<Range<Token>, Long>> getSplits(List<Token> tokens, int splitCount, CFMetaData metadata)
+    private List<Pair<Range<Token>, Long>> getSplits(List<Token> tokens, int splitCount, ColumnFamilyStore cfs)
     {
-        final double step = (double) (tokens.size() - 1) / splitCount;
-        int prevIndex = 0;
+        double step = (double) (tokens.size() - 1) / splitCount;
         Token prevToken = tokens.get(0);
         List<Pair<Range<Token>, Long>> splits = Lists.newArrayListWithExpectedSize(splitCount);
         for (int i = 1; i <= splitCount; i++)
         {
             int index = (int) Math.round(i * step);
             Token token = tokens.get(index);
-            long rowCountEstimate = (index - prevIndex) * metadata.getIndexInterval();
-            splits.add(Pair.create(new Range<Token>(prevToken, token), rowCountEstimate));
-            prevIndex = index;
+            Range<Token> range = new Range<>(prevToken, token);
+            // always return an estimate > 0 (see CASSANDRA-7322)
+            splits.add(Pair.create(range, Math.max(cfs.metadata.getMinIndexInterval(), cfs.estimatedKeysForRange(range))));
             prevToken = token;
         }
         return splits;
@@ -2873,14 +3021,14 @@
         List<Token> tokens = Lists.newArrayListWithExpectedSize(keys.size() + 2);
         tokens.add(range.left);
         for (DecoratedKey key : keys)
-            tokens.add(key.token);
+            tokens.add(key.getToken());
         tokens.add(range.right);
         return tokens;
     }
 
     private List<DecoratedKey> keySamples(Iterable<ColumnFamilyStore> cfses, Range<Token> range)
     {
-        List<DecoratedKey> keys = new ArrayList<DecoratedKey>();
+        List<DecoratedKey> keys = new ArrayList<>();
         for (ColumnFamilyStore cfs : cfses)
             Iterables.addAll(keys, cfs.keySamples(range));
         FBUtilities.sortSampledKeys(keys, range);
@@ -2941,20 +3089,20 @@
 
         Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.left(getLocalTokens(),Gossiper.computeExpireTime()));
         int delay = Math.max(RING_DELAY, Gossiper.intervalInMillis * 2);
-        logger.info("Announcing that I have left the ring for " + delay + "ms");
+        logger.info("Announcing that I have left the ring for {}ms", delay);
         Uninterruptibles.sleepUninterruptibly(delay, TimeUnit.MILLISECONDS);
     }
 
-    private void unbootstrap(final Runnable onFinish)
+    private void unbootstrap(Runnable onFinish)
     {
-        Map<String, Multimap<Range<Token>, InetAddress>> rangesToStream = new HashMap<String, Multimap<Range<Token>, InetAddress>>();
+        Map<String, Multimap<Range<Token>, InetAddress>> rangesToStream = new HashMap<>();
 
-        for (final String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
         {
             Multimap<Range<Token>, InetAddress> rangesMM = getChangedRangesForLeaving(keyspaceName, FBUtilities.getBroadcastAddress());
 
             if (logger.isDebugEnabled())
-                logger.debug("Ranges needing transfer are [" + StringUtils.join(rangesMM.keySet(), ",") + "]");
+                logger.debug("Ranges needing transfer are [{}]", StringUtils.join(rangesMM.keySet(), ","));
 
             rangesToStream.put(keyspaceName, rangesMM);
         }
@@ -3003,7 +3151,7 @@
         FBUtilities.waitOnFuture(hintsCF.forceFlush());
 
         // gather all live nodes in the cluster that aren't also leaving
-        List<InetAddress> candidates = new ArrayList<InetAddress>(StorageService.instance.getTokenMetadata().cloneAfterAllLeft().getAllEndpoints());
+        List<InetAddress> candidates = new ArrayList<>(StorageService.instance.getTokenMetadata().cloneAfterAllLeft().getAllEndpoints());
         candidates.remove(FBUtilities.getBroadcastAddress());
         for (Iterator<InetAddress> iter = candidates.iterator(); iter.hasNext(); )
         {
@@ -3025,7 +3173,7 @@
 
             // stream all hints -- range list will be a singleton of "the entire ring"
             Token token = StorageService.getPartitioner().getMinimumToken();
-            List<Range<Token>> ranges = Collections.singletonList(new Range<Token>(token, token));
+            List<Range<Token>> ranges = Collections.singletonList(new Range<>(token, token));
 
             return new StreamPlan("Hints").transferRanges(hintsDestinationHost,
                                                                       Keyspace.SYSTEM_KS,
@@ -3116,7 +3264,7 @@
 
     private class RangeRelocator
     {
-        private StreamPlan streamPlan = new StreamPlan("Moving");
+        private final StreamPlan streamPlan = new StreamPlan("Relocation");
 
         private RangeRelocator(Collection<Token> tokens, List<String> keyspaceNames)
         {
@@ -3133,6 +3281,7 @@
 
             for (String keyspace : keyspaceNames)
             {
+                logger.debug("Calculating ranges to stream and request for keyspace {}", keyspace);
                 for (Token newToken : newTokens)
                 {
                     // replication strategy of the current keyspace (aka table)
@@ -3141,7 +3290,7 @@
                     // getting collection of the currently used ranges by this keyspace
                     Collection<Range<Token>> currentRanges = getRangesForEndpoint(keyspace, localAddress);
                     // collection of ranges which this node will serve after move to the new token
-                    Collection<Range<Token>> updatedRanges = strategy.getPendingAddressRanges(tokenMetadata, newToken, localAddress);
+                    Collection<Range<Token>> updatedRanges = strategy.getPendingAddressRanges(tokenMetaClone, newToken, localAddress);
 
                     // ring ranges and endpoints associated with them
                     // this used to determine what nodes should we ping about range data
@@ -3161,11 +3310,51 @@
                         {
                             if (range.contains(toFetch))
                             {
-                                List<InetAddress> endpoints = snitch.getSortedListByProximity(localAddress, rangeAddresses.get(range));
+                                List<InetAddress> endpoints = null;
+
+                                if (RangeStreamer.useStrictConsistency)
+                                {
+                                    Set<InetAddress> oldEndpoints = Sets.newHashSet(rangeAddresses.get(range));
+                                    Set<InetAddress> newEndpoints = Sets.newHashSet(strategy.calculateNaturalEndpoints(toFetch.right, tokenMetaCloneAllSettled));
+
+                                    // Due to CASSANDRA-5953 we can have a higher RF than we have endpoints.
+                                    // So we need to be careful to only be strict when endpoints == RF
+                                    if (oldEndpoints.size() == strategy.getReplicationFactor())
+                                    {
+                                        oldEndpoints.removeAll(newEndpoints);
+
+                                        //No relocation required
+                                        if (oldEndpoints.isEmpty())
+                                            continue;
+
+                                        assert oldEndpoints.size() == 1 : "Expected 1 endpoint but found " + oldEndpoints.size();
+                                    }
+
+                                    endpoints = Lists.newArrayList(oldEndpoints.iterator().next());
+                                }
+                                else
+                                {
+                                    endpoints = snitch.getSortedListByProximity(localAddress, rangeAddresses.get(range));
+                                }
+
                                 // storing range and preferred endpoint set
                                 rangesToFetchWithPreferredEndpoints.putAll(toFetch, endpoints);
                             }
                         }
+
+                        Collection<InetAddress> addressList = rangesToFetchWithPreferredEndpoints.get(toFetch);
+                        if (addressList == null || addressList.isEmpty())
+                            continue;
+
+                        if (RangeStreamer.useStrictConsistency)
+                        {
+                            if (addressList.size() > 1)
+                                throw new IllegalStateException("Multiple strict sources found for " + toFetch);
+
+                            InetAddress sourceIp = addressList.iterator().next();
+                            if (Gossiper.instance.isEnabled() && !Gossiper.instance.getEndpointStateForEndpoint(sourceIp).isAlive())
+                                throw new RuntimeException("A node required to move the data consistently is down ("+sourceIp+").  If you wish to move the data from a potentially inconsistent replica, restart the node with -Dcassandra.consistent.rangemovement=false");
+                        }
                     }
 
                     // calculating endpoints to stream current ranges to if needed
@@ -3175,19 +3364,28 @@
                     {
                         Set<InetAddress> currentEndpoints = ImmutableSet.copyOf(strategy.calculateNaturalEndpoints(toStream.right, tokenMetaClone));
                         Set<InetAddress> newEndpoints = ImmutableSet.copyOf(strategy.calculateNaturalEndpoints(toStream.right, tokenMetaCloneAllSettled));
-                        logger.debug("Range:" + toStream + "Current endpoints: " + currentEndpoints + " New endpoints: " + newEndpoints);
+                        logger.debug("Range: {} Current endpoints: {} New endpoints: {}", toStream, currentEndpoints, newEndpoints);
                         for (InetAddress address : Sets.difference(newEndpoints, currentEndpoints))
+                        {
+                            logger.debug("Range {} has new owner {}", toStream, address);
                             endpointRanges.put(address, toStream);
+                        }
                     }
 
                     // stream ranges
                     for (InetAddress address : endpointRanges.keySet())
+                    {
+                        logger.debug("Will stream range {} of keyspace {} to endpoint {}", endpointRanges.get(address), keyspace, address);
                         streamPlan.transferRanges(address, keyspace, endpointRanges.get(address));
+                    }
 
                     // stream requests
                     Multimap<InetAddress, Range<Token>> workMap = RangeStreamer.getWorkMap(rangesToFetchWithPreferredEndpoints);
                     for (InetAddress address : workMap.keySet())
+                    {
+                        logger.debug("Will request range {} of keyspace {} from endpoint {}", workMap.get(address), keyspace, address);
                         streamPlan.requestRanges(address, keyspace, workMap.get(address));
+                    }
 
                     if (logger.isDebugEnabled())
                         logger.debug("Keyspace {}: work map {}.", keyspace, workMap);
@@ -3228,7 +3426,7 @@
     {
         if (!replicatingNodes.isEmpty()  || !tokenMetadata.getLeavingEndpoints().isEmpty())
         {
-            logger.warn("Removal not confirmed for for " + StringUtils.join(this.replicatingNodes, ","));
+            logger.warn("Removal not confirmed for for {}", StringUtils.join(this.replicatingNodes, ","));
             for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
             {
                 UUID hostId = tokenMetadata.getHostId(endpoint);
@@ -3273,7 +3471,7 @@
 
         // A leaving endpoint that is dead is already being removed.
         if (tokenMetadata.isLeaving(endpoint))
-            logger.warn("Node " + endpoint + " is already being removed, continuing removal anyway");
+            logger.warn("Node {} is already being removed, continuing removal anyway", endpoint);
 
         if (!replicatingNodes.isEmpty())
             throw new UnsupportedOperationException("This node is already processing a removal. Wait for it to complete, or use 'removenode force' if this has failed.");
@@ -3294,7 +3492,7 @@
                 if (failureDetector.isAlive(ep))
                     replicatingNodes.add(ep);
                 else
-                    logger.warn("Endpoint " + ep + " is down and will not receive data for re-replication of " + endpoint);
+                    logger.warn("Endpoint {} is down and will not receive data for re-replication of {}", ep, endpoint);
             }
         }
         removingNode = endpoint;
@@ -3335,8 +3533,7 @@
         }
         else
         {
-            logger.info("Received unexpected REPLICATION_FINISHED message from " + node
-                         + ". Was this node recently a removal coordinator?");
+            logger.info("Received unexpected REPLICATION_FINISHED message from {}. Was this node recently a removal coordinator?", node);
         }
     }
 
@@ -3383,8 +3580,9 @@
      */
     public synchronized void drain() throws IOException, InterruptedException, ExecutionException
     {
+        ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
         ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
-        if (mutationStage.isTerminated())
+        if (mutationStage.isTerminated() && counterMutationStage.isTerminated())
         {
             logger.warn("Cannot drain node (did it already happen?)");
             return;
@@ -3398,7 +3596,9 @@
         MessagingService.instance().shutdown();
 
         setMode(Mode.DRAINING, "clearing mutation stage", false);
+        counterMutationStage.shutdown();
         mutationStage.shutdown();
+        counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
         mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
 
         StorageProxy.instance.verifyNoHintsInProgress();
@@ -3410,7 +3610,7 @@
             totalCFs += keyspace.getColumnFamilyStores().size();
         remainingCFs = totalCFs;
         // flush
-        List<Future<?>> flushes = new ArrayList<Future<?>>();
+        List<Future<?>> flushes = new ArrayList<>();
         for (Keyspace keyspace : Keyspace.nonSystem())
         {
             for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
@@ -3437,6 +3637,10 @@
         BatchlogManager.batchlogTasks.shutdown();
         BatchlogManager.batchlogTasks.awaitTermination(60, TimeUnit.SECONDS);
 
+        // whilst we've flushed all the CFs, which will have recycled all completed segments, we want to ensure
+        // there are no segments to replay, so we force the recycling of any remaining (should be at most one)
+        CommitLog.instance.forceRecycleAllSegments();
+
         ColumnFamilyStore.postFlushExecutor.shutdown();
         ColumnFamilyStore.postFlushExecutor.awaitTermination(60, TimeUnit.SECONDS);
 
@@ -3483,7 +3687,7 @@
         List<Token> sortedTokens = tokenMetadata.sortedTokens();
         // describeOwnership returns tokens in an unspecified order, let's re-order them
         Map<Token, Float> tokenMap = new TreeMap<Token, Float>(getPartitioner().describeOwnership(sortedTokens));
-        Map<InetAddress, Float> nodeMap = new LinkedHashMap<InetAddress, Float>();
+        Map<InetAddress, Float> nodeMap = new LinkedHashMap<>();
         for (Map.Entry<Token, Float> entry : tokenMap.entrySet())
         {
             InetAddress endpoint = tokenMetadata.getEndpoint(entry.getKey());
@@ -3507,19 +3711,37 @@
      */
     public LinkedHashMap<InetAddress, Float> effectiveOwnership(String keyspace) throws IllegalStateException
     {
-        if (Schema.instance.getNonSystemKeyspaces().size() <= 0)
-            throw new IllegalStateException("Couldn't find any Non System Keyspaces to infer replication topology");
-        if (keyspace == null && !hasSameReplication(Schema.instance.getNonSystemKeyspaces()))
-            throw new IllegalStateException("Non System keyspaces doesnt have the same topology");
-
+
+        if (keyspace != null)
+        {
+            Keyspace keyspaceInstance = Schema.instance.getKeyspaceInstance(keyspace);
+            if (keyspaceInstance == null)
+                throw new IllegalArgumentException("The keyspace " + keyspace + " does not exist");
+
+            if (keyspaceInstance.getReplicationStrategy() instanceof LocalStrategy)
+                throw new IllegalStateException("Ownership values for keyspaces with LocalStrategy are meaningless");
+        }
+        else
+        {
+            List<String> nonSystemKeyspaces = Schema.instance.getNonSystemKeyspaces();
+
+            // system_traces is a non-system keyspace, however it needs to be counted as one for this process
+            int specialTableCount = 0;
+            if (nonSystemKeyspaces.contains("system_traces"))
+            {
+                specialTableCount += 1;
+            }
+            if (nonSystemKeyspaces.size() > specialTableCount)
+                throw new IllegalStateException("Non-system keyspaces don't have the same replication settings, effective ownership information is meaningless");
+
+            keyspace = "system_traces";
+        }
+
         TokenMetadata metadata = tokenMetadata.cloneOnlyTokenMap();
 
-        if (keyspace == null)
-            keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
-
-        Collection<Collection<InetAddress>> endpointsGroupedByDc = new ArrayList<Collection<InetAddress>>();
+        Collection<Collection<InetAddress>> endpointsGroupedByDc = new ArrayList<>();
         // mapping of dc's to nodes, use sorted map so that we get dcs sorted
-        SortedMap<String, Collection<InetAddress>> sortedDcsToEndpoints = new TreeMap<String, Collection<InetAddress>>();
+        SortedMap<String, Collection<InetAddress>> sortedDcsToEndpoints = new TreeMap<>();
         sortedDcsToEndpoints.putAll(metadata.getTopology().getDatacenterEndpoints().asMap());
         for (Collection<InetAddress> endpoints : sortedDcsToEndpoints.values())
             endpointsGroupedByDc.add(endpoints);
@@ -3565,7 +3787,7 @@
 
     public List<String> getKeyspaces()
     {
-        List<String> keyspaceNamesList = new ArrayList<String>(Schema.instance.getKeyspaces());
+        List<String> keyspaceNamesList = new ArrayList<>(Schema.instance.getKeyspaces());
         return Collections.unmodifiableList(keyspaceNamesList);
     }
 
@@ -3608,10 +3830,10 @@
      * @param rangesToStreamByKeyspace keyspaces and data ranges with endpoints included for each
      * @return async Future for whether stream was success
      */
-    private Future<StreamState> streamRanges(final Map<String, Multimap<Range<Token>, InetAddress>> rangesToStreamByKeyspace)
+    private Future<StreamState> streamRanges(Map<String, Multimap<Range<Token>, InetAddress>> rangesToStreamByKeyspace)
     {
         // First, we build a list of ranges to stream to each host, per table
-        final Map<String, Map<InetAddress, List<Range<Token>>>> sessionsToStreamByKeyspace = new HashMap<String, Map<InetAddress, List<Range<Token>>>>();
+        Map<String, Map<InetAddress, List<Range<Token>>>> sessionsToStreamByKeyspace = new HashMap<>();
         for (Map.Entry<String, Multimap<Range<Token>, InetAddress>> entry : rangesToStreamByKeyspace.entrySet())
         {
             String keyspace = entry.getKey();
@@ -3620,16 +3842,16 @@
             if (rangesWithEndpoints.isEmpty())
                 continue;
 
-            Map<InetAddress, List<Range<Token>>> rangesPerEndpoint = new HashMap<InetAddress, List<Range<Token>>>();
-            for (final Map.Entry<Range<Token>, InetAddress> endPointEntry : rangesWithEndpoints.entries())
+            Map<InetAddress, List<Range<Token>>> rangesPerEndpoint = new HashMap<>();
+            for (Map.Entry<Range<Token>, InetAddress> endPointEntry : rangesWithEndpoints.entries())
             {
-                final Range<Token> range = endPointEntry.getKey();
-                final InetAddress endpoint = endPointEntry.getValue();
+                Range<Token> range = endPointEntry.getKey();
+                InetAddress endpoint = endPointEntry.getValue();
 
                 List<Range<Token>> curRanges = rangesPerEndpoint.get(endpoint);
                 if (curRanges == null)
                 {
-                    curRanges = new LinkedList<Range<Token>>();
+                    curRanges = new LinkedList<>();
                     rangesPerEndpoint.put(endpoint, curRanges);
                 }
                 curRanges.add(range);
@@ -3641,13 +3863,13 @@
         StreamPlan streamPlan = new StreamPlan("Unbootstrap");
         for (Map.Entry<String, Map<InetAddress, List<Range<Token>>>> entry : sessionsToStreamByKeyspace.entrySet())
         {
-            final String keyspaceName = entry.getKey();
-            final Map<InetAddress, List<Range<Token>>> rangesPerEndpoint = entry.getValue();
+            String keyspaceName = entry.getKey();
+            Map<InetAddress, List<Range<Token>>> rangesPerEndpoint = entry.getValue();
 
-            for (final Map.Entry<InetAddress, List<Range<Token>>> rangesEntry : rangesPerEndpoint.entrySet())
+            for (Map.Entry<InetAddress, List<Range<Token>>> rangesEntry : rangesPerEndpoint.entrySet())
             {
-                final List<Range<Token>> ranges = rangesEntry.getValue();
-                final InetAddress newEndpoint = rangesEntry.getKey();
+                List<Range<Token>> ranges = rangesEntry.getValue();
+                InetAddress newEndpoint = rangesEntry.getKey();
 
                 // TODO each call to transferRanges re-flushes, this is potentially a lot of waste
                 streamPlan.transferRanges(newEndpoint, keyspaceName, ranges);
@@ -3666,8 +3888,8 @@
      */
     public Pair<Set<Range<Token>>, Set<Range<Token>>> calculateStreamAndFetchRanges(Collection<Range<Token>> current, Collection<Range<Token>> updated)
     {
-        Set<Range<Token>> toStream = new HashSet<Range<Token>>();
-        Set<Range<Token>> toFetch  = new HashSet<Range<Token>>();
+        Set<Range<Token>> toStream = new HashSet<>();
+        Set<Range<Token>> toFetch  = new HashSet<>();
 
 
         for (Range r1 : current)
@@ -3786,14 +4008,14 @@
      */
     public List<String> sampleKeyRange() // do not rename to getter - see CASSANDRA-4452 for details
     {
-        List<DecoratedKey> keys = new ArrayList<DecoratedKey>();
+        List<DecoratedKey> keys = new ArrayList<>();
         for (Keyspace keyspace : Keyspace.nonSystem())
         {
             for (Range<Token> range : getPrimaryRangesForEndpoint(keyspace.getName(), FBUtilities.getBroadcastAddress()))
                 keys.addAll(keySamples(keyspace.getColumnFamilyStores(), range));
         }
 
-        List<String> sampledKeys = new ArrayList<String>(keys.size());
+        List<String> sampledKeys = new ArrayList<>(keys.size());
         for (DecoratedKey key : keys)
             sampledKeys.add(key.getToken().toString());
         return sampledKeys;
diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java
index 6308aa5..8bb13ae 100644
--- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java
@@ -28,6 +28,9 @@
 import java.util.concurrent.TimeoutException;
 
 import javax.management.NotificationEmitter;
+import javax.management.openmbean.TabularData;
+
+import org.apache.cassandra.db.compaction.CompactionManager;
 
 public interface StorageServiceMBean extends NotificationEmitter
 {
@@ -216,6 +219,18 @@
     public void clearSnapshot(String tag, String... keyspaceNames) throws IOException;
 
     /**
+     * Get the details of all the snapshots.
+     * @return A map of snapshot name to all its details in tabular form.
+     */
+    public Map<String, TabularData> getSnapshotDetails();
+
+    /**
+     * Get the true size taken by all snapshots across all keyspaces.
+     * @return True size taken by all the snapshots.
+     */
+    public long trueSnapshotsSize();
+
+    /**
      * Forces major compaction of a single keyspace
      */
     public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
@@ -223,7 +238,7 @@
     /**
      * Trigger a cleanup of keys on a single keyspace
      */
-    public void forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace.
@@ -231,13 +246,13 @@
      *
      * Scrubbed CFs will be snapshotted first, if disableSnapshot is false
      */
-    public void scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Rewrite all sstables to the latest version.
      * Unlike scrub, it doesn't skip bad rows and does not snapshot sstables first.
      */
-    public void upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Flush all memtables for the given column families, or all columnfamilies for the given keyspace
@@ -257,13 +272,12 @@
      *
      * @return Repair command number, or 0 if nothing to repair
      */
-    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts, boolean primaryRange, String... columnFamilies);
+    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean repairedAt, String... columnFamilies) throws IOException;
 
     /**
      * Same as forceRepairAsync, but handles a specified range
      */
-    public int forceRepairRangeAsync(String beginToken, String endToken, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts,  final String... columnFamilies);
-
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean repairedAt, String... columnFamilies) throws IOException;
 
     /**
      * Invoke repair asynchronously.
@@ -273,36 +287,13 @@
      *   userObject: int array of length 2, [0]=command number, [1]=ordinal of AntiEntropyService.Status
      *
      * @return Repair command number, or 0 if nothing to repair
-     * @see #forceKeyspaceRepair(String, boolean, boolean, String...)
      */
-    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, String... columnFamilies);
+    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, boolean fullRepair, String... columnFamilies);
 
     /**
      * Same as forceRepairAsync, but handles a specified range
      */
-    public int forceRepairRangeAsync(String beginToken, String endToken, final String keyspaceName, boolean isSequential, boolean isLocal, final String... columnFamilies);
-
-    /**
-     * Triggers proactive repair for given column families, or all columnfamilies for the given keyspace
-     * if none are explicitly listed.
-     * @param keyspaceName
-     * @param columnFamilies
-     * @throws IOException
-     */
-    public void forceKeyspaceRepair(String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException;
-
-    /**
-     * Triggers proactive repair but only for the node primary range.
-     */
-    public void forceKeyspaceRepairPrimaryRange(String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException;
-
-    /**
-     * Perform repair of a specific range.
-     *
-     * This allows incremental repair to be performed by having an external controller submitting repair jobs.
-     * Note that the provided range much be a subset of one of the node local range.
-     */
-    public void forceKeyspaceRepairRange(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException;
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, boolean repairedAt, String... columnFamilies);
 
     public void forceTerminateAllRepairSessions();
 
@@ -333,10 +324,25 @@
      */
     public void forceRemoveCompletion();
 
-    /** set the logging level at runtime */
-    public void setLog4jLevel(String classQualifier, String level);
+    /**
+     * Set the logging level at runtime.<br>
+     * <br>
+     * If both classQualifier and level are empty/null, it will reload the configuration to reset.<br>
+     * If classQualifier is not empty but level is empty/null, it will set the level to null for the defined classQualifier.<br>
+     * If level cannot be parsed, then the level will be defaulted to DEBUG.<br>
+     * <br>
+     * The logback configuration should have &lt;jmxConfigurator /&gt; set.
+     *
+     * @param classQualifier The logger's classQualifier
+     * @param level The log level
+     * @throws Exception
+     *
+     * @see ch.qos.logback.classic.Level#toLevel(String)
+     */
+    public void setLoggingLevel(String classQualifier, String level) throws Exception;
 
-    public Map<String,String>getLoggingLevels();
+    /** get the runtime logging levels */
+    public Map<String,String> getLoggingLevels();
 
     /** get the operational mode (leaving, joining, normal, decommissioned, client) **/
     public String getOperationMode();
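The fall-back-to-DEBUG rule documented for setLoggingLevel() matches the behaviour of logback's Level.toLevel(String) referenced in the @see tag. A tiny illustrative sketch (not part of this patch):

import ch.qos.logback.classic.Level;

// Illustrative sketch only: Level.toLevel() provides the DEBUG fallback documented above.
final class LevelParseDemo
{
    public static void main(String[] args)
    {
        System.out.println(Level.toLevel("WARN"));        // WARN
        System.out.println(Level.toLevel("not-a-level")); // DEBUG, the documented fallback
    }
}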
diff --git a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java
index 3bacad8..cce8ecc 100644
--- a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java
@@ -27,7 +27,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.net.IAsyncCallback;
 import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
 public class TruncateResponseHandler implements IAsyncCallback
 {
diff --git a/src/java/org/apache/cassandra/service/WriteResponseHandler.java b/src/java/org/apache/cassandra/service/WriteResponseHandler.java
index 826ae01..df23b19 100644
--- a/src/java/org/apache/cassandra/service/WriteResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/WriteResponseHandler.java
@@ -21,7 +21,7 @@
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -38,7 +38,9 @@
 {
     protected static final Logger logger = LoggerFactory.getLogger(WriteResponseHandler.class);
 
-    protected final AtomicInteger responses;
+    protected volatile int responses;
+    private static final AtomicIntegerFieldUpdater<WriteResponseHandler> responsesUpdater
+            = AtomicIntegerFieldUpdater.newUpdater(WriteResponseHandler.class, "responses");
 
     public WriteResponseHandler(Collection<InetAddress> writeEndpoints,
                                 Collection<InetAddress> pendingEndpoints,
@@ -48,7 +50,7 @@
                                 WriteType writeType)
     {
         super(keyspace, writeEndpoints, pendingEndpoints, consistencyLevel, callback, writeType);
-        responses = new AtomicInteger(totalBlockFor());
+        responses = totalBlockFor();
     }
 
     public WriteResponseHandler(InetAddress endpoint, WriteType writeType, Runnable callback)
@@ -63,13 +65,13 @@
 
     public void response(MessageIn m)
     {
-        if (responses.decrementAndGet() == 0)
+        if (responsesUpdater.decrementAndGet(this) == 0)
             signal();
     }
 
     protected int ackCount()
     {
-        return totalBlockFor() - responses.get();
+        return totalBlockFor() - responses;
     }
 
     public boolean isLatencyForSnitch()
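The WriteResponseHandler change above swaps a per-instance AtomicInteger for one static AtomicIntegerFieldUpdater over a volatile int, saving an allocation per handler. A minimal illustrative sketch of the pattern (class and field names are hypothetical, not part of this patch):

import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;

// Illustrative sketch only: a single static updater performs atomic decrements
// on a plain volatile int field instead of allocating an AtomicInteger per instance.
final class ResponseCountdown
{
    private volatile int remaining;
    private static final AtomicIntegerFieldUpdater<ResponseCountdown> REMAINING =
            AtomicIntegerFieldUpdater.newUpdater(ResponseCountdown.class, "remaining");

    ResponseCountdown(int expectedResponses)
    {
        remaining = expectedResponses;
    }

    /** @return true when the last expected response has arrived */
    boolean onResponse()
    {
        return REMAINING.decrementAndGet(this) == 0;
    }
}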
diff --git a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
index 4210296..964cf64 100644
--- a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -48,7 +47,7 @@
 
     private int remaining;
     private boolean exhausted;
-    private boolean lastWasRecorded;
+    private boolean shouldFetchExtraRow;
 
     protected AbstractQueryPager(ConsistencyLevel consistencyLevel,
                                  int toFetch,
@@ -124,9 +123,9 @@
             rows = discardFirst(rows);
             remaining++;
         }
-        // Otherwise, if 'lastWasRecorded', we queried for one more than the page size,
+        // Otherwise, if 'shouldFetchExtraRow' was set, we queried for one more than the page size,
         // so if the page is full, trim the last entry
-        else if (lastWasRecorded && !exhausted)
+        else if (shouldFetchExtraRow && !exhausted)
         {
             // We've asked for one more than necessary
             rows = discardLast(rows);
@@ -136,7 +135,7 @@
         logger.debug("Remaining rows to page: {}", remaining);
 
         if (!isExhausted())
-            lastWasRecorded = recordLast(rows.get(rows.size() - 1));
+            shouldFetchExtraRow = recordLast(rows.get(rows.size() - 1));
 
         return rows;
     }
@@ -145,12 +144,12 @@
     {
         for (Row row : result)
         {
-            if (row.cf == null || row.cf.getColumnCount() == 0)
+            if (row.cf == null || !row.cf.hasColumns())
             {
                 List<Row> newResult = new ArrayList<Row>(result.size() - 1);
                 for (Row row2 : result)
                 {
-                    if (row2.cf == null || row2.cf.getColumnCount() == 0)
+                    if (row2.cf == null || !row2.cf.hasColumns())
                         continue;
 
                     newResult.add(row2);
@@ -161,10 +160,10 @@
         return result;
     }
 
-    protected void restoreState(int remaining, boolean lastWasRecorded)
+    protected void restoreState(int remaining, boolean shouldFetchExtraRow)
     {
         this.remaining = remaining;
-        this.lastWasRecorded = lastWasRecorded;
+        this.shouldFetchExtraRow = shouldFetchExtraRow;
     }
 
     public boolean isExhausted()
@@ -184,7 +183,7 @@
 
     private int nextPageSize(int pageSize)
     {
-        return Math.min(remaining, pageSize) + (lastWasRecorded ? 1 : 0);
+        return Math.min(remaining, pageSize) + (shouldFetchExtraRow ? 1 : 0);
     }
 
     public ColumnCounter columnCounter()
@@ -193,8 +192,21 @@
     }
 
     protected abstract List<Row> queryNextPage(int pageSize, ConsistencyLevel consistency, boolean localQuery) throws RequestValidationException, RequestExecutionException;
+
+    /**
+     * Checks to see if the first row of a new page contains the last row from the previous page.
+     * @param first the first row of the new page
+     * @return true if <code>first</code> contains the last row from the previous page and it is live, false otherwise
+     */
     protected abstract boolean containsPreviousLast(Row first);
+
+    /**
+     * Saves the paging state by recording the last seen partition key and cell name (where applicable).
+     * @param last the last row in the current page
+     * @return true if an extra row should be fetched in the next page, false otherwise
+     */
     protected abstract boolean recordLast(Row last);
+
     protected abstract boolean isReversed();
 
     private List<Row> discardFirst(List<Row> rows)
@@ -303,14 +315,14 @@
              : discardTail(cf, toDiscard, newCf, cf.iterator(), tester);
     }
 
-    private int discardHead(ColumnFamily cf, int toDiscard, ColumnFamily copy, Iterator<Column> iter, DeletionInfo.InOrderTester tester)
+    private int discardHead(ColumnFamily cf, int toDiscard, ColumnFamily copy, Iterator<Cell> iter, DeletionInfo.InOrderTester tester)
     {
         ColumnCounter counter = columnCounter();
 
         // Discard the first 'toDiscard' live
         while (iter.hasNext())
         {
-            Column c = iter.next();
+            Cell c = iter.next();
             counter.count(c, tester);
             if (counter.live() > toDiscard)
             {
@@ -322,7 +334,7 @@
         return Math.min(counter.live(), toDiscard);
     }
 
-    private int discardTail(ColumnFamily cf, int toDiscard, ColumnFamily copy, Iterator<Column> iter, DeletionInfo.InOrderTester tester)
+    private int discardTail(ColumnFamily cf, int toDiscard, ColumnFamily copy, Iterator<Cell> iter, DeletionInfo.InOrderTester tester)
     {
         // Redoing the counting like that is not extremely efficient.
         // This is called only for reversed slices or in the case of a race between
@@ -333,7 +345,7 @@
         // Discard the last 'toDiscard' live (so stop adding as soon as we're past 'liveCount - toDiscard')
         while (iter.hasNext())
         {
-            Column c = iter.next();
+            Cell c = iter.next();
             counter.count(c, tester);
             if (counter.live() > liveCount - toDiscard)
                 break;
@@ -343,12 +355,12 @@
         return Math.min(liveCount, toDiscard);
     }
 
-    protected static Column firstColumn(ColumnFamily cf)
+    protected static Cell firstCell(ColumnFamily cf)
     {
         return cf.iterator().next();
     }
 
-    protected static Column lastColumn(ColumnFamily cf)
+    protected static Cell lastCell(ColumnFamily cf)
     {
         return cf.getReverseSortedColumns().iterator().next();
     }
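The shouldFetchExtraRow rename reflects the page-sizing rule described in the comments above: when the previous page's last row will be re-fetched, the pager asks for one extra row and discards the duplicate. A minimal illustrative sketch (class and method names are hypothetical, not part of this patch):

// Illustrative sketch only: page sizing with the optional extra row.
final class PageSizing
{
    static int nextPageSize(int remaining, int pageSize, boolean shouldFetchExtraRow)
    {
        // Request one more row than the page size when the previous page's last row
        // will reappear at the start of this page and has to be discarded.
        return Math.min(remaining, pageSize) + (shouldFetchExtraRow ? 1 : 0);
    }
}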
diff --git a/src/java/org/apache/cassandra/service/pager/PagingState.java b/src/java/org/apache/cassandra/service/pager/PagingState.java
index bc77e3b..bbae921 100644
--- a/src/java/org/apache/cassandra/service/pager/PagingState.java
+++ b/src/java/org/apache/cassandra/service/pager/PagingState.java
@@ -18,12 +18,10 @@
 package org.apache.cassandra.service.pager;
 
 import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.io.util.ByteBufferOutputStream;
+import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.transport.ProtocolException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -63,13 +61,11 @@
     {
         try
         {
-            ByteBuffer result = ByteBuffer.allocate(serializedSize());
-            DataOutput out = new DataOutputStream(new ByteBufferOutputStream(result));
+            DataOutputBuffer out = new DataOutputBuffer(serializedSize());
             ByteBufferUtil.writeWithShortLength(partitionKey, out);
             ByteBufferUtil.writeWithShortLength(cellName, out);
             out.writeInt(remaining);
-            result.flip();
-            return result;
+            return out.asByteBuffer();
         }
         catch (IOException e)
         {
@@ -83,4 +79,10 @@
              + 2 + cellName.remaining()
              + 4;
     }
+
+    @Override
+    public String toString()
+    {
+        return String.format("PagingState(key=%s, cellname=%s, remaining=%d", ByteBufferUtil.bytesToHex(partitionKey), ByteBufferUtil.bytesToHex(cellName), remaining);
+    }
 }
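For reference, the byte layout implied by toBytes() and serializedSize() above is a short-length-prefixed partition key, a short-length-prefixed cell name, and a 4-byte remaining count. An illustrative sketch of that layout using plain NIO (not the serializer actually used by the class):

import java.nio.ByteBuffer;

// Illustrative sketch only: the wire layout implied by PagingState.toBytes()/serializedSize():
// [2-byte key length][key][2-byte cell name length][cell name][4-byte remaining]
final class PagingStateLayout
{
    static ByteBuffer encode(ByteBuffer key, ByteBuffer cellName, int remaining)
    {
        ByteBuffer out = ByteBuffer.allocate(2 + key.remaining() + 2 + cellName.remaining() + 4);
        out.putShort((short) key.remaining()).put(key.duplicate());
        out.putShort((short) cellName.remaining()).put(cellName.duplicate());
        out.putInt(remaining);
        out.flip();
        return out;
    }
}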
diff --git a/src/java/org/apache/cassandra/service/pager/QueryPagers.java b/src/java/org/apache/cassandra/service/pager/QueryPagers.java
index 65112aa..04702d0 100644
--- a/src/java/org/apache/cassandra/service/pager/QueryPagers.java
+++ b/src/java/org/apache/cassandra/service/pager/QueryPagers.java
@@ -153,7 +153,7 @@
                 {
                     List<Row> rows = pager.fetchPage(pageSize);
                     ColumnFamily cf = rows.isEmpty() ? null : rows.get(0).cf;
-                    return cf == null ? EmptyColumns.factory.create(cfs.metadata) : cf;
+                    return cf == null ? ArrayBackedSortedColumns.factory.create(cfs.metadata) : cf;
                 }
                 catch (Exception e)
                 {
@@ -182,7 +182,7 @@
         SliceFromReadCommand command = new SliceFromReadCommand(keyspace, key, columnFamily, now, filter);
         final SliceQueryPager pager = new SliceQueryPager(command, consistencyLevel, false);
 
-        ColumnCounter counter = filter.columnCounter(Schema.instance.getComparator(keyspace, columnFamily), now);
+        ColumnCounter counter = filter.columnCounter(Schema.instance.getCFMetaData(keyspace, columnFamily).comparator, now);
         while (!pager.isExhausted())
         {
             List<Row> next = pager.fetchPage(pageSize);
diff --git a/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java b/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java
index e3b0cf8..50d1280 100644
--- a/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java
@@ -63,7 +63,7 @@
     {
         return lastReturnedKey == null
              ? null
-             : new PagingState(lastReturnedKey.key, null, maxRemaining());
+             : new PagingState(lastReturnedKey.getKey(), null, maxRemaining());
     }
 
     protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
diff --git a/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java b/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java
index 0df1d25..cfcd953 100644
--- a/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java
@@ -17,10 +17,11 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.nio.ByteBuffer;
 import java.util.List;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.RequestExecutionException;
@@ -37,7 +38,7 @@
 {
     private final RangeSliceCommand command;
     private volatile DecoratedKey lastReturnedKey;
-    private volatile ByteBuffer lastReturnedName;
+    private volatile CellName lastReturnedName;
 
     // Don't use directly, use QueryPagers method instead
     RangeSliceQueryPager(RangeSliceCommand command, ConsistencyLevel consistencyLevel, boolean localQuery)
@@ -54,7 +55,7 @@
         if (state != null)
         {
             lastReturnedKey = StorageService.getPartitioner().decorateKey(state.partitionKey);
-            lastReturnedName = state.cellName;
+            lastReturnedName = cfm.comparator.cellFromByteBuffer(state.cellName);
             restoreState(state.remaining, true);
         }
     }
@@ -63,7 +64,7 @@
     {
         return lastReturnedKey == null
              ? null
-             : new PagingState(lastReturnedKey.key, lastReturnedName, maxRemaining());
+             : new PagingState(lastReturnedKey.getKey(), lastReturnedName.toByteBuffer(), maxRemaining());
     }
 
     protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
@@ -71,7 +72,7 @@
     {
         SliceQueryFilter sf = (SliceQueryFilter)columnFilter;
         AbstractBounds<RowPosition> keyRange = lastReturnedKey == null ? command.keyRange : makeIncludingKeyBounds(lastReturnedKey);
-        ByteBuffer start = lastReturnedName == null ? sf.start() : lastReturnedName;
+        Composite start = lastReturnedName == null ? sf.start() : lastReturnedName;
         PagedRangeCommand pageCmd = new PagedRangeCommand(command.keyspace,
                                                           command.columnFamily,
                                                           command.timestamp,
@@ -80,7 +81,8 @@
                                                           start,
                                                           sf.finish(),
                                                           command.rowFilter,
-                                                          pageSize);
+                                                          pageSize,
+                                                          command.countCQL3Rows);
 
         return localQuery
              ? pageCmd.executeLocally()
@@ -93,16 +95,16 @@
             return false;
 
         // Same as SliceQueryPager, we ignore a deleted column
-        Column firstColumn = isReversed() ? lastColumn(first.cf) : firstColumn(first.cf);
-        return !first.cf.deletionInfo().isDeleted(firstColumn)
-            && firstColumn.isLive(timestamp())
-            && lastReturnedName.equals(firstColumn.name());
+        Cell firstCell = isReversed() ? lastCell(first.cf) : firstCell(first.cf);
+        return !first.cf.deletionInfo().isDeleted(firstCell)
+            && firstCell.isLive(timestamp())
+            && lastReturnedName.equals(firstCell.name());
     }
 
     protected boolean recordLast(Row last)
     {
         lastReturnedKey = last.key;
-        lastReturnedName = (isReversed() ? firstColumn(last.cf) : lastColumn(last.cf)).name();
+        lastReturnedName = (isReversed() ? firstCell(last.cf) : lastCell(last.cf)).name();
         return true;
     }
 
diff --git a/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java b/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java
index cdad0a5..cd1caf3 100644
--- a/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java
@@ -22,6 +22,8 @@
 import java.util.List;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
@@ -38,7 +40,7 @@
 
     private final SliceFromReadCommand command;
 
-    private volatile ByteBuffer lastReturned;
+    private volatile Composite lastReturned;
 
     // Don't use directly, use QueryPagers method instead
     SliceQueryPager(SliceFromReadCommand command, ConsistencyLevel consistencyLevel, boolean localQuery)
@@ -53,7 +55,7 @@
 
         if (state != null)
         {
-            lastReturned = state.cellName;
+            lastReturned = cfm.comparator.fromByteBuffer(state.cellName);
             restoreState(state.remaining, true);
         }
     }
@@ -67,7 +69,7 @@
     {
         return lastReturned == null
              ? null
-             : new PagingState(null, lastReturned, maxRemaining());
+             : new PagingState(null, lastReturned.toByteBuffer(), maxRemaining());
     }
 
     protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
@@ -92,18 +94,18 @@
         if (lastReturned == null)
             return false;
 
-        Column firstColumn = isReversed() ? lastColumn(first.cf) : firstColumn(first.cf);
+        Cell firstCell = isReversed() ? lastCell(first.cf) : firstCell(first.cf);
         // Note: we only return true if the column is the lastReturned *and* it is live. If it is deleted, it is ignored by the
         // rest of the paging code (it hasn't been counted as live in particular) and we want to act as if it wasn't there.
-        return !first.cf.deletionInfo().isDeleted(firstColumn)
-            && firstColumn.isLive(timestamp())
-            && lastReturned.equals(firstColumn.name());
+        return !first.cf.deletionInfo().isDeleted(firstCell)
+            && firstCell.isLive(timestamp())
+            && lastReturned.equals(firstCell.name());
     }
 
     protected boolean recordLast(Row last)
     {
-        Column lastColumn = isReversed() ? firstColumn(last.cf) : lastColumn(last.cf);
-        lastReturned = lastColumn.name();
+        Cell lastCell = isReversed() ? firstCell(last.cf) : lastCell(last.cf);
+        lastReturned = lastCell.name();
         return true;
     }
 
diff --git a/src/java/org/apache/cassandra/service/paxos/Commit.java b/src/java/org/apache/cassandra/service/paxos/Commit.java
index 9904045..45d04f9 100644
--- a/src/java/org/apache/cassandra/service/paxos/Commit.java
+++ b/src/java/org/apache/cassandra/service/paxos/Commit.java
@@ -22,7 +22,6 @@
 
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.UUID;
 import java.nio.ByteBuffer;
@@ -30,13 +29,9 @@
 import com.google.common.base.Objects;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnSerializer;
-import org.apache.cassandra.db.EmptyColumns;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.UnsortedColumns;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -62,7 +57,7 @@
 
     public static Commit newPrepare(ByteBuffer key, CFMetaData metadata, UUID ballot)
     {
-        return new Commit(key, ballot, EmptyColumns.factory.create(metadata));
+        return new Commit(key, ballot, ArrayBackedSortedColumns.factory.create(metadata));
     }
 
     public static Commit newProposal(ByteBuffer key, UUID ballot, ColumnFamily update)
@@ -72,7 +67,7 @@
 
     public static Commit emptyCommit(ByteBuffer key, CFMetaData metadata)
     {
-        return new Commit(key, UUIDGen.minTimeUUID(0), EmptyColumns.factory.create(metadata));
+        return new Commit(key, UUIDGen.minTimeUUID(0), ArrayBackedSortedColumns.factory.create(metadata));
     }
 
     public boolean isAfter(Commit other)
@@ -85,10 +80,10 @@
         return this.ballot.equals(ballot);
     }
 
-    public RowMutation makeMutation()
+    public Mutation makeMutation()
     {
         assert update != null;
-        return new RowMutation(key, update);
+        return new Mutation(key, update);
     }
 
     @Override
@@ -120,8 +115,8 @@
         // the collection and we want that to have a lower timestamp and our new values. Since tombstones wins over normal insert, using t-1
         // should not be a problem in general (see #6069).
         cf.deletionInfo().updateAllTimestamp(t-1);
-        for (Column column : updates)
-            cf.addAtom(column.withUpdatedTimestamp(t));
+        for (Cell cell : updates)
+            cf.addAtom(cell.withUpdatedTimestamp(t));
         return cf;
     }
 
@@ -133,7 +128,7 @@
 
     public static class CommitSerializer implements IVersionedSerializer<Commit>
     {
-        public void serialize(Commit commit, DataOutput out, int version) throws IOException
+        public void serialize(Commit commit, DataOutputPlus out, int version) throws IOException
         {
             ByteBufferUtil.writeWithShortLength(commit.key, out);
             UUIDSerializer.serializer.serialize(commit.ballot, out, version);
@@ -144,7 +139,10 @@
         {
             return new Commit(ByteBufferUtil.readWithShortLength(in),
                               UUIDSerializer.serializer.deserialize(in, version),
-                              ColumnFamily.serializer.deserialize(in, UnsortedColumns.factory, ColumnSerializer.Flag.LOCAL, version));
+                              ColumnFamily.serializer.deserialize(in,
+                                                                  ArrayBackedSortedColumns.factory,
+                                                                  ColumnSerializer.Flag.LOCAL,
+                                                                  version));
         }
 
         public long serializedSize(Commit commit, int version)
diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosState.java b/src/java/org/apache/cassandra/service/paxos/PaxosState.java
index 0196122..abd173c 100644
--- a/src/java/org/apache/cassandra/service/paxos/PaxosState.java
+++ b/src/java/org/apache/cassandra/service/paxos/PaxosState.java
@@ -1,4 +1,3 @@
-package org.apache.cassandra.service.paxos;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -19,34 +18,21 @@
  * under the License.
  * 
  */
-
+package org.apache.cassandra.service.paxos;
 
 import java.nio.ByteBuffer;
+import java.util.concurrent.locks.Lock;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import com.google.common.util.concurrent.Striped;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.tracing.Tracing;
 
 public class PaxosState
 {
-    private static final Logger logger = LoggerFactory.getLogger(PaxosState.class);
-
-    private static final Object[] locks;
-    static
-    {
-        locks = new Object[1024];
-        for (int i = 0; i < locks.length; i++)
-            locks[i] = new Object();
-    }
-    private static Object lockFor(ByteBuffer key)
-    {
-        return locks[(0x7FFFFFFF & key.hashCode()) % locks.length];
-    }
+    private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(DatabaseDescriptor.getConcurrentWriters() * 1024);
 
     private final Commit promised;
     private final Commit accepted;
@@ -72,7 +58,9 @@
         long start = System.nanoTime();
         try
         {
-            synchronized (lockFor(toPrepare.key))
+            Lock lock = LOCKS.get(toPrepare.key);
+            lock.lock();
+            try
             {
                 PaxosState state = SystemKeyspace.loadPaxosState(toPrepare.key, toPrepare.update.metadata());
                 if (toPrepare.isAfter(state.promised))
@@ -88,11 +76,16 @@
                     return new PrepareResponse(false, state.promised, state.mostRecentCommit);
                 }
             }
+            finally
+            {
+                lock.unlock();
+            }
         }
         finally
         {
             Keyspace.open(toPrepare.update.metadata().ksName).getColumnFamilyStore(toPrepare.update.metadata().cfId).metric.casPrepare.addNano(System.nanoTime() - start);
         }
+
     }
 
     public static Boolean propose(Commit proposal)
@@ -100,7 +93,9 @@
         long start = System.nanoTime();
         try
         {
-            synchronized (lockFor(proposal.key))
+            Lock lock = LOCKS.get(proposal.key);
+            lock.lock();
+            try
             {
                 PaxosState state = SystemKeyspace.loadPaxosState(proposal.key, proposal.update.metadata());
                 if (proposal.hasBallot(state.promised.ballot) || proposal.isAfter(state.promised))
@@ -115,6 +110,10 @@
                     return false;
                 }
             }
+            finally
+            {
+                lock.unlock();
+            }
         }
         finally
         {
@@ -133,8 +132,8 @@
             // if our current in-progress ballot is strictly greater than the proposal one, we shouldn't
             // erase the in-progress update.
             Tracing.trace("Committing proposal {}", proposal);
-            RowMutation rm = proposal.makeMutation();
-            Keyspace.open(rm.getKeyspaceName()).apply(rm, true);
+            Mutation mutation = proposal.makeMutation();
+            Keyspace.open(mutation.getKeyspaceName()).apply(mutation, true);
 
             // We don't need to lock, we're just blindly updating
             SystemKeyspace.savePaxosCommit(proposal);
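The PaxosState change above replaces the fixed array of monitor objects with Guava's Striped<Lock>, where keys hash onto a bounded set of lazily created locks. A minimal illustrative sketch of the pattern (class and method names are hypothetical, not part of this patch):

import java.nio.ByteBuffer;
import java.util.concurrent.locks.Lock;

import com.google.common.util.concurrent.Striped;

// Illustrative sketch only: striped locking keyed by partition key, so contention is
// bounded without keeping one lock object alive per key.
final class StripedLockExample
{
    private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(1024);

    static void withKeyLock(ByteBuffer key, Runnable criticalSection)
    {
        Lock lock = LOCKS.get(key);
        lock.lock();
        try
        {
            criticalSection.run();
        }
        finally
        {
            lock.unlock();
        }
    }
}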
diff --git a/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java b/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java
index d2bd835..e766e34 100644
--- a/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java
+++ b/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java
@@ -22,14 +22,14 @@
 
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.ColumnSerializer;
-import org.apache.cassandra.db.UnsortedColumns;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDSerializer;
 
@@ -65,7 +65,7 @@
 
     public static class PrepareResponseSerializer implements IVersionedSerializer<PrepareResponse>
     {
-        public void serialize(PrepareResponse response, DataOutput out, int version) throws IOException
+        public void serialize(PrepareResponse response, DataOutputPlus out, int version) throws IOException
         {
             out.writeBoolean(response.promised);
             ByteBufferUtil.writeWithShortLength(response.inProgressCommit.key, out);
@@ -82,10 +82,14 @@
             return new PrepareResponse(success,
                                        new Commit(key,
                                                   UUIDSerializer.serializer.deserialize(in, version),
-                                                  ColumnFamily.serializer.deserialize(in, UnsortedColumns.factory, ColumnSerializer.Flag.LOCAL, version)),
+                                                  ColumnFamily.serializer.deserialize(in,
+                                                                                      ArrayBackedSortedColumns.factory,
+                                                                                      ColumnSerializer.Flag.LOCAL, version)),
                                        new Commit(key,
                                                   UUIDSerializer.serializer.deserialize(in, version),
-                                                  ColumnFamily.serializer.deserialize(in, UnsortedColumns.factory, ColumnSerializer.Flag.LOCAL, version)));
+                                                  ColumnFamily.serializer.deserialize(in,
+                                                                                      ArrayBackedSortedColumns.factory,
+                                                                                      ColumnSerializer.Flag.LOCAL, version)));
         }
 
         public long serializedSize(PrepareResponse response, int version)
diff --git a/src/java/org/apache/cassandra/sink/IRequestSink.java b/src/java/org/apache/cassandra/sink/IRequestSink.java
index 8d68ce8..2873e46 100644
--- a/src/java/org/apache/cassandra/sink/IRequestSink.java
+++ b/src/java/org/apache/cassandra/sink/IRequestSink.java
@@ -22,9 +22,9 @@
 public interface IRequestSink
 {
     /**
-     * Transform or drop a write request (represented by a RowMutation).
+     * Transform or drop a write request (represented by a Mutation).
      *
-     * @param mutation the RowMutation to be applied locally.
+     * @param mutation the Mutation to be applied locally.
      * @return null if the mutation is to be dropped, or the transformed mutation to apply, which may be just
      * the original mutation.
      */
diff --git a/src/java/org/apache/cassandra/streaming/ConnectionHandler.java b/src/java/org/apache/cassandra/streaming/ConnectionHandler.java
index 8fba41b..6092046 100644
--- a/src/java/org/apache/cassandra/streaming/ConnectionHandler.java
+++ b/src/java/org/apache/cassandra/streaming/ConnectionHandler.java
@@ -36,9 +36,11 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.messages.StreamInitMessage;
 import org.apache.cassandra.streaming.messages.StreamMessage;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * ConnectionHandler manages incoming/outgoing message exchange for the {@link StreamSession}.
@@ -152,13 +154,13 @@
 
         protected abstract String name();
 
-        protected static WritableByteChannel getWriteChannel(Socket socket) throws IOException
+        protected static DataOutputStreamAndChannel getWriteChannel(Socket socket) throws IOException
         {
             WritableByteChannel out = socket.getChannel();
             // socket channel is null when encrypted (SSL)
-            return out == null
-                 ? Channels.newChannel(socket.getOutputStream())
-                 : out;
+            if (out == null)
+                out = Channels.newChannel(socket.getOutputStream());
+            return new DataOutputStreamAndChannel(socket.getOutputStream(), out);
         }
 
         protected static ReadableByteChannel getReadChannel(Socket socket) throws IOException
@@ -172,10 +174,14 @@
 
         public void sendInitMessage(Socket socket, boolean isForOutgoing) throws IOException
         {
-            StreamInitMessage message = new StreamInitMessage(FBUtilities.getBroadcastAddress(), session.planId(), session.description(), isForOutgoing);
+            StreamInitMessage message = new StreamInitMessage(
+                    FBUtilities.getBroadcastAddress(),
+                    session.sessionIndex(),
+                    session.planId(),
+                    session.description(),
+                    isForOutgoing);
             ByteBuffer messageBuf = message.createMessage(false, protocolVersion);
-            while (messageBuf.hasRemaining())
-                getWriteChannel(socket).write(messageBuf);
+            getWriteChannel(socket).write(messageBuf);
         }
 
         public void start(Socket socket, int protocolVersion)
@@ -251,9 +257,10 @@
                 // socket is closed
                 close();
             }
-            catch (Throwable e)
+            catch (Throwable t)
             {
-                session.onError(e);
+                JVMStabilityInspector.inspectThrowable(t);
+                session.onError(t);
             }
             finally
             {
@@ -300,7 +307,7 @@
         {
             try
             {
-                WritableByteChannel out = getWriteChannel(socket);
+                DataOutputStreamAndChannel out = getWriteChannel(socket);
 
                 StreamMessage next;
                 while (!isClosed())
@@ -332,7 +339,7 @@
             }
         }
 
-        private void sendMessage(WritableByteChannel out, StreamMessage message)
+        private void sendMessage(DataOutputStreamAndChannel out, StreamMessage message)
         {
             try
             {
diff --git a/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java b/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java
index 53af4c8..f711490 100644
--- a/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java
+++ b/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java
@@ -51,6 +51,7 @@
             {
                 Socket socket = OutboundTcpConnectionPool.newSocket(peer);
                 socket.setSoTimeout(DatabaseDescriptor.getStreamingSocketTimeout());
+                socket.setKeepAlive(true);
                 return socket;
             }
             catch (IOException e)
diff --git a/src/java/org/apache/cassandra/streaming/ProgressInfo.java b/src/java/org/apache/cassandra/streaming/ProgressInfo.java
index d308ed0..fdd3e97 100644
--- a/src/java/org/apache/cassandra/streaming/ProgressInfo.java
+++ b/src/java/org/apache/cassandra/streaming/ProgressInfo.java
@@ -49,16 +49,18 @@
     }
 
     public final InetAddress peer;
+    public final int sessionIndex;
     public final String fileName;
     public final Direction direction;
     public final long currentBytes;
     public final long totalBytes;
 
-    public ProgressInfo(InetAddress peer, String fileName, Direction direction, long currentBytes, long totalBytes)
+    public ProgressInfo(InetAddress peer, int sessionIndex, String fileName, Direction direction, long currentBytes, long totalBytes)
     {
         assert totalBytes > 0;
 
         this.peer = peer;
+        this.sessionIndex = sessionIndex;
         this.fileName = fileName;
         this.direction = direction;
         this.currentBytes = currentBytes;
@@ -70,7 +72,7 @@
      */
     public boolean isCompleted()
     {
-        return currentBytes == totalBytes;
+        return currentBytes >= totalBytes;
     }
 
     /**
@@ -87,13 +89,14 @@
         if (totalBytes != that.totalBytes) return false;
         if (direction != that.direction) return false;
         if (!fileName.equals(that.fileName)) return false;
+        if (sessionIndex != that.sessionIndex) return false;
         return peer.equals(that.peer);
     }
 
     @Override
     public int hashCode()
     {
-        return Objects.hashCode(peer, fileName, direction, totalBytes);
+        return Objects.hashCode(peer, sessionIndex, fileName, direction, totalBytes);
     }
 
     @Override
@@ -104,6 +107,7 @@
         sb.append("/").append(totalBytes).append(" bytes");
         sb.append("(").append(currentBytes*100/totalBytes).append("%) ");
         sb.append(direction == Direction.OUT ? "sent to " : "received from ");
+        sb.append("idx:").append(sessionIndex);
         sb.append(peer);
         return sb.toString();
     }
diff --git a/src/java/org/apache/cassandra/streaming/ReplicationFinishedVerbHandler.java b/src/java/org/apache/cassandra/streaming/ReplicationFinishedVerbHandler.java
index 4297b34..ce8a921 100644
--- a/src/java/org/apache/cassandra/streaming/ReplicationFinishedVerbHandler.java
+++ b/src/java/org/apache/cassandra/streaming/ReplicationFinishedVerbHandler.java
@@ -35,7 +35,7 @@
         StorageService.instance.confirmReplication(msg.from);
         MessageOut response = new MessageOut(MessagingService.Verb.INTERNAL_RESPONSE);
         if (logger.isDebugEnabled())
-            logger.debug("Replying to " + id + "@" + msg.from);
+            logger.debug("Replying to {}@{}", id, msg.from);
         MessagingService.instance().sendReply(response, id, msg.from);
     }
 }
diff --git a/src/java/org/apache/cassandra/streaming/SessionInfo.java b/src/java/org/apache/cassandra/streaming/SessionInfo.java
index b722ecf..98e945b 100644
--- a/src/java/org/apache/cassandra/streaming/SessionInfo.java
+++ b/src/java/org/apache/cassandra/streaming/SessionInfo.java
@@ -33,6 +33,7 @@
 public final class SessionInfo implements Serializable
 {
     public final InetAddress peer;
+    public final int sessionIndex;
     /** Immutable collection of receiving summaries */
     public final Collection<StreamSummary> receivingSummaries;
     /** Immutable collection of sending summaries*/
@@ -44,11 +45,13 @@
     private final Map<String, ProgressInfo> sendingFiles;
 
     public SessionInfo(InetAddress peer,
+                       int sessionIndex,
                        Collection<StreamSummary> receivingSummaries,
                        Collection<StreamSummary> sendingSummaries,
                        StreamSession.State state)
     {
         this.peer = peer;
+        this.sessionIndex = sessionIndex;
         this.receivingSummaries = ImmutableSet.copyOf(receivingSummaries);
         this.sendingSummaries = ImmutableSet.copyOf(sendingSummaries);
         this.receivingFiles = new ConcurrentHashMap<>();
diff --git a/src/java/org/apache/cassandra/streaming/StreamCoordinator.java b/src/java/org/apache/cassandra/streaming/StreamCoordinator.java
new file mode 100644
index 0000000..48192b4
--- /dev/null
+++ b/src/java/org/apache/cassandra/streaming/StreamCoordinator.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.streaming;
+
+import java.net.InetAddress;
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * {@link StreamCoordinator} is a helper class that abstracts away maintaining multiple
+ * StreamSession and ProgressInfo instances per peer.
+ *
+ * This class coordinates multiple StreamSessions per peer in both the outgoing StreamPlan context and the
+ * inbound StreamResultFuture context.
+ */
+public class StreamCoordinator
+{
+    private static final Logger logger = LoggerFactory.getLogger(StreamCoordinator.class);
+
+    // Executor strictly for establishing the initial connections. Once we're connected to the other end, the rest of the
+    // streaming is handled directly by the ConnectionHandler's incoming and outgoing threads.
+    private static final DebuggableThreadPoolExecutor streamExecutor = DebuggableThreadPoolExecutor.createWithFixedPoolSize("StreamConnectionEstablisher",
+                                                                                                                            FBUtilities.getAvailableProcessors());
+
+    private Map<InetAddress, HostStreamingData> peerSessions = new HashMap<>();
+    private final int connectionsPerHost;
+    private StreamConnectionFactory factory;
+
+    public StreamCoordinator(int connectionsPerHost, StreamConnectionFactory factory)
+    {
+        this.connectionsPerHost = connectionsPerHost;
+        this.factory = factory;
+    }
+
+    public void setConnectionFactory(StreamConnectionFactory factory)
+    {
+        this.factory = factory;
+    }
+
+    /**
+     * @return true if any stream session is active
+     */
+    public synchronized boolean hasActiveSessions()
+    {
+        for (HostStreamingData data : peerSessions.values())
+        {
+            if (data.hasActiveSessions())
+                return true;
+        }
+        return false;
+    }
+
+    public synchronized Collection<StreamSession> getAllStreamSessions()
+    {
+        Collection<StreamSession> results = new ArrayList<>();
+        for (HostStreamingData data : peerSessions.values())
+        {
+            results.addAll(data.getAllStreamSessions());
+        }
+        return results;
+    }
+
+    public boolean isReceiving()
+    {
+        return connectionsPerHost == 0;
+    }
+
+    public void connectAllStreamSessions()
+    {
+        for (HostStreamingData data : peerSessions.values())
+            data.connectAllStreamSessions();
+    }
+
+    public synchronized Set<InetAddress> getPeers()
+    {
+        return new HashSet<>(peerSessions.keySet());
+    }
+
+    public synchronized StreamSession getOrCreateNextSession(InetAddress peer)
+    {
+        return getOrCreateHostData(peer).getOrCreateNextSession(peer);
+    }
+
+    public synchronized StreamSession getOrCreateSessionById(InetAddress peer, int id)
+    {
+        return getOrCreateHostData(peer).getOrCreateSessionById(peer, id);
+    }
+
+    public synchronized void updateProgress(ProgressInfo info)
+    {
+        getHostData(info.peer).updateProgress(info);
+    }
+
+    public synchronized void addSessionInfo(SessionInfo session)
+    {
+        HostStreamingData data = getOrCreateHostData(session.peer);
+        data.addSessionInfo(session);
+    }
+
+    public synchronized Set<SessionInfo> getAllSessionInfo()
+    {
+        Set<SessionInfo> result = new HashSet<>();
+        for (HostStreamingData data : peerSessions.values())
+        {
+            result.addAll(data.getAllSessionInfo());
+        }
+        return result;
+    }
+
+    public synchronized void transferFiles(InetAddress to, Collection<StreamSession.SSTableStreamingSections> sstableDetails)
+    {
+        HostStreamingData sessionList = getOrCreateHostData(to);
+
+        if (connectionsPerHost > 1)
+        {
+            List<List<StreamSession.SSTableStreamingSections>> buckets = sliceSSTableDetails(sstableDetails);
+
+            for (List<StreamSession.SSTableStreamingSections> subList : buckets)
+            {
+                StreamSession session = sessionList.getOrCreateNextSession(to);
+                session.addTransferFiles(subList);
+            }
+        }
+        else
+        {
+            StreamSession session = sessionList.getOrCreateNextSession(to);
+            session.addTransferFiles(sstableDetails);
+        }
+    }
+
+    private List<List<StreamSession.SSTableStreamingSections>> sliceSSTableDetails(Collection<StreamSession.SSTableStreamingSections> sstableDetails)
+    {
+        // There's no point in divvying things up into more buckets than we have sstableDetails
+        int targetSlices = Math.min(sstableDetails.size(), connectionsPerHost);
+        int step = Math.round((float) sstableDetails.size() / (float) targetSlices);
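+        // e.g. 10 sstables with connectionsPerHost = 4: targetSlices = 4, step = round(2.5) = 3,
+        // so the loop below starts a new slice at index 0, 3, 6 and 9 (buckets of 3, 3, 3 and 1).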
+        int index = 0;
+
+        List<List<StreamSession.SSTableStreamingSections>> result = new ArrayList<>();
+        List<StreamSession.SSTableStreamingSections> slice = null;
+        Iterator<StreamSession.SSTableStreamingSections> iter = sstableDetails.iterator();
+        while (iter.hasNext())
+        {
+            StreamSession.SSTableStreamingSections streamSession = iter.next();
+
+            if (index % step == 0)
+            {
+                slice = new ArrayList<>();
+                result.add(slice);
+            }
+            slice.add(streamSession);
+            ++index;
+            iter.remove();
+        }
+
+        return result;
+    }
+
+    private HostStreamingData getHostData(InetAddress peer)
+    {
+        HostStreamingData data = peerSessions.get(peer);
+        if (data == null)
+            throw new IllegalArgumentException("Unknown peer requested: " + peer.toString());
+        return data;
+    }
+
+    private HostStreamingData getOrCreateHostData(InetAddress peer)
+    {
+        HostStreamingData data = peerSessions.get(peer);
+        if (data == null)
+        {
+            data = new HostStreamingData();
+            peerSessions.put(peer, data);
+        }
+        return data;
+    }
+
+    private static class StreamSessionConnector implements Runnable
+    {
+        private final StreamSession session;
+        public StreamSessionConnector(StreamSession session)
+        {
+            this.session = session;
+        }
+
+        @Override
+        public void run()
+        {
+            session.start();
+            logger.info("[Stream #{}, ID#{}] Beginning stream session with {}", session.planId(), session.sessionIndex(), session.peer);
+        }
+    }
+
+    private class HostStreamingData
+    {
+        private Map<Integer, StreamSession> streamSessions = new HashMap<>();
+        private Map<Integer, SessionInfo> sessionInfos = new HashMap<>();
+
+        private int lastReturned = -1;
+
+        public boolean hasActiveSessions()
+        {
+            for (StreamSession session : streamSessions.values())
+            {
+                StreamSession.State state = session.state();
+                if (state != StreamSession.State.COMPLETE && state != StreamSession.State.FAILED)
+                    return true;
+            }
+            return false;
+        }
+
+        public StreamSession getOrCreateNextSession(InetAddress peer)
+        {
+            // create
+            if (streamSessions.size() < connectionsPerHost)
+            {
+                StreamSession session = new StreamSession(peer, factory, streamSessions.size());
+                streamSessions.put(++lastReturned, session);
+                return session;
+            }
+            // get
+            else
+            {
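+                // Every allowed session already exists; hand them out round-robin, wrapping lastReturned.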
+                if (lastReturned >= streamSessions.size() - 1)
+                    lastReturned = 0;
+
+                return streamSessions.get(lastReturned++);
+            }
+        }
+
+        public void connectAllStreamSessions()
+        {
+            for (StreamSession session : streamSessions.values())
+            {
+                streamExecutor.execute(new StreamSessionConnector(session));
+            }
+        }
+
+        public Collection<StreamSession> getAllStreamSessions()
+        {
+            return Collections.unmodifiableCollection(streamSessions.values());
+        }
+
+        public StreamSession getOrCreateSessionById(InetAddress peer, int id)
+        {
+            StreamSession session = streamSessions.get(id);
+            if (session == null)
+            {
+                session = new StreamSession(peer, factory, id);
+                streamSessions.put(id, session);
+            }
+            return session;
+        }
+
+        public void updateProgress(ProgressInfo info)
+        {
+            sessionInfos.get(info.sessionIndex).updateProgress(info);
+        }
+
+        public void addSessionInfo(SessionInfo info)
+        {
+            sessionInfos.put(info.sessionIndex, info);
+        }
+
+        public Collection<SessionInfo> getAllSessionInfo()
+        {
+            return sessionInfos.values();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/streaming/StreamEvent.java b/src/java/org/apache/cassandra/streaming/StreamEvent.java
index 9af1fbd..8089323 100644
--- a/src/java/org/apache/cassandra/streaming/StreamEvent.java
+++ b/src/java/org/apache/cassandra/streaming/StreamEvent.java
@@ -42,12 +42,14 @@
     {
         public final InetAddress peer;
         public final boolean success;
+        public final int sessionIndex;
 
         public SessionCompleteEvent(StreamSession session)
         {
             super(Type.STREAM_COMPLETE, session.planId());
             this.peer = session.peer;
             this.success = session.isSuccess();
+            this.sessionIndex = session.sessionIndex();
         }
     }
 
diff --git a/src/java/org/apache/cassandra/streaming/StreamException.java b/src/java/org/apache/cassandra/streaming/StreamException.java
index 6e22db2..fdf61e2 100644
--- a/src/java/org/apache/cassandra/streaming/StreamException.java
+++ b/src/java/org/apache/cassandra/streaming/StreamException.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.streaming;
 
-public class StreamException extends Throwable
+public class StreamException extends Exception
 {
     public final StreamState finalState;
 
diff --git a/src/java/org/apache/cassandra/streaming/StreamLockfile.java b/src/java/org/apache/cassandra/streaming/StreamLockfile.java
index 0eb01c5..4d20479 100644
--- a/src/java/org/apache/cassandra/streaming/StreamLockfile.java
+++ b/src/java/org/apache/cassandra/streaming/StreamLockfile.java
@@ -21,15 +21,20 @@
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.StandardOpenOption;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
 
 import com.google.common.base.Charsets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableWriter;
 import org.apache.cassandra.io.util.FileUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * Encapsulates the behavior for 'locking' any streamed sstables to a node.
@@ -69,7 +74,7 @@
             /* write out the file names *without* the 'tmp-file' flag in the file name.
                this class will not need to clean up tmp files (on restart), CassandraDaemon does that already,
                just make sure we delete the fully-formed SSTRs. */
-            sstablePaths.add(writer.descriptor.asTemporary(false).baseFilename());
+            sstablePaths.add(writer.descriptor.asType(Descriptor.Type.FINAL).baseFilename());
         }
 
         try
diff --git a/src/java/org/apache/cassandra/streaming/StreamManager.java b/src/java/org/apache/cassandra/streaming/StreamManager.java
index b5b1c7f..f40be99 100644
--- a/src/java/org/apache/cassandra/streaming/StreamManager.java
+++ b/src/java/org/apache/cassandra/streaming/StreamManager.java
@@ -63,7 +63,7 @@
 
     public static class StreamRateLimiter
     {
-        private static final double ONE_MEGA_BIT = 1024 * 1024 * 8;
+        private static final double ONE_MEGA_BIT = (1024 * 1024) / 8; // bytes per megabit
         private static final RateLimiter limiter = RateLimiter.create(Double.MAX_VALUE);
         private static final RateLimiter interDCLimiter = RateLimiter.create(Double.MAX_VALUE);
         private final boolean isLocalDC;
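Note on ONE_MEGA_BIT above: the limiter works in bytes per second, and one megabit is (1024 * 1024) / 8 = 131,072 bytes; the previous value (1024 * 1024 * 8 = 8,388,608) overstated the byte budget by a factor of 64. Assuming the setup code (not shown here) multiplies a configured megabit/s figure by this constant, for example:

    // hypothetical usage, not part of this hunk: 200 megabit/s -> 26,214,400 bytes/s
    limiter.setRate(200 * ONE_MEGA_BIT);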
diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java
index e582c79..f7b6203 100644
--- a/src/java/org/apache/cassandra/streaming/StreamPlan.java
+++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java
@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.UUIDGen;
 
 /**
@@ -34,9 +35,8 @@
     private final UUID planId = UUIDGen.getTimeUUID();
     private final String description;
     private final List<StreamEventHandler> handlers = new ArrayList<>();
-
-    // sessions per InetAddress of the other end.
-    private final Map<InetAddress, StreamSession> sessions = new HashMap<>();
+    private final long repairedAt;
+    private final StreamCoordinator coordinator;
 
     private StreamConnectionFactory connectionFactory = new DefaultConnectionFactory();
 
@@ -49,7 +49,14 @@
      */
     public StreamPlan(String description)
     {
+        this(description, ActiveRepairService.UNREPAIRED_SSTABLE, 1);
+    }
+
+    public StreamPlan(String description, long repairedAt, int connectionsPerHost)
+    {
         this.description = description;
+        this.repairedAt = repairedAt;
+        this.coordinator = new StreamCoordinator(connectionsPerHost, connectionFactory);
     }
 
     /**
@@ -76,8 +83,8 @@
      */
     public StreamPlan requestRanges(InetAddress from, String keyspace, Collection<Range<Token>> ranges, String... columnFamilies)
     {
-        StreamSession session = getOrCreateSession(from);
-        session.addStreamRequest(keyspace, ranges, Arrays.asList(columnFamilies));
+        StreamSession session = coordinator.getOrCreateNextSession(from);
+        session.addStreamRequest(keyspace, ranges, Arrays.asList(columnFamilies), repairedAt);
         return this;
     }
 
@@ -105,8 +112,8 @@
      */
     public StreamPlan transferRanges(InetAddress to, String keyspace, Collection<Range<Token>> ranges, String... columnFamilies)
     {
-        StreamSession session = getOrCreateSession(to);
-        session.addTransferRanges(keyspace, ranges, Arrays.asList(columnFamilies), flushBeforeTransfer);
+        StreamSession session = coordinator.getOrCreateNextSession(to);
+        session.addTransferRanges(keyspace, ranges, Arrays.asList(columnFamilies), flushBeforeTransfer, repairedAt);
         return this;
     }
 
@@ -120,9 +127,9 @@
      */
     public StreamPlan transferFiles(InetAddress to, Collection<StreamSession.SSTableStreamingSections> sstableDetails)
     {
-        StreamSession session = getOrCreateSession(to);
-        session.addTransferFiles(sstableDetails);
+        coordinator.transferFiles(to, sstableDetails);
         return this;
+
     }
 
     public StreamPlan listeners(StreamEventHandler handler, StreamEventHandler... handlers)
@@ -141,7 +148,7 @@
      */
     public StreamPlan connectionFactory(StreamConnectionFactory factory)
     {
-        this.connectionFactory = factory;
+        this.coordinator.setConnectionFactory(factory);
         return this;
     }
 
@@ -150,7 +157,7 @@
      */
     public boolean isEmpty()
     {
-        return sessions.isEmpty();
+        return !coordinator.hasActiveSessions();
     }
 
     /**
@@ -160,7 +167,7 @@
      */
     public StreamResultFuture execute()
     {
-        return StreamResultFuture.init(planId, description, sessions.values(), handlers);
+        return StreamResultFuture.init(planId, description, handlers, coordinator);
     }
 
     /**
@@ -176,15 +183,4 @@
         return this;
     }
 
-    private StreamSession getOrCreateSession(InetAddress peer)
-    {
-        StreamSession session = sessions.get(peer);
-        if (session == null)
-        {
-            session = new StreamSession(peer, connectionFactory);
-            sessions.put(peer, session);
-        }
-        return session;
-    }
-
 }
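A minimal sketch (not part of the patch) of how the reworked StreamPlan is driven; the description string, peer address and sstableDetails collection are placeholders:

    // 4 connections per peer; StreamCoordinator.transferFiles buckets the sstables across them.
    StreamPlan plan = new StreamPlan("Bulk Load", ActiveRepairService.UNREPAIRED_SSTABLE, 4);
    plan.transferFiles(peer, sstableDetails);
    StreamResultFuture future = plan.execute();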
diff --git a/src/java/org/apache/cassandra/streaming/StreamReader.java b/src/java/org/apache/cassandra/streaming/StreamReader.java
index 3b2a924..3014549 100644
--- a/src/java/org/apache/cassandra/streaming/StreamReader.java
+++ b/src/java/org/apache/cassandra/streaming/StreamReader.java
@@ -27,6 +27,9 @@
 import java.util.UUID;
 
 import com.google.common.base.Throwables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.ning.compress.lzf.LZFInputStream;
 
 import org.apache.cassandra.config.Schema;
@@ -37,6 +40,7 @@
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.messages.FileMessageHeader;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -48,11 +52,13 @@
  */
 public class StreamReader
 {
+    private static final Logger logger = LoggerFactory.getLogger(StreamReader.class);
     protected final UUID cfId;
     protected final long estimatedKeys;
     protected final Collection<Pair<Long, Long>> sections;
     protected final StreamSession session;
     protected final Descriptor.Version inputVersion;
+    protected final long repairedAt;
 
     protected Descriptor desc;
 
@@ -63,6 +69,7 @@
         this.estimatedKeys = header.estimatedKeys;
         this.sections = header.sections;
         this.inputVersion = new Descriptor.Version(header.version);
+        this.repairedAt = header.repairedAt;
     }
 
     /**
@@ -72,6 +79,7 @@
      */
     public SSTableWriter read(ReadableByteChannel channel) throws IOException
     {
+        logger.debug("reading file from {}, repairedAt = {}", session.peer, repairedAt);
         long totalSize = totalSize();
 
         Pair<String, String> kscf = Schema.instance.getCF(cfId);
@@ -82,7 +90,7 @@
         }
         ColumnFamilyStore cfs = Keyspace.open(kscf.left).getColumnFamilyStore(kscf.right);
 
-        SSTableWriter writer = createWriter(cfs, totalSize);
+        SSTableWriter writer = createWriter(cfs, totalSize, repairedAt);
         DataInputStream dis = new DataInputStream(new LZFInputStream(Channels.newInputStream(channel)));
         BytesReadTracker in = new BytesReadTracker(dis);
         try
@@ -106,14 +114,14 @@
         }
     }
 
-    protected SSTableWriter createWriter(ColumnFamilyStore cfs, long totalSize) throws IOException
+    protected SSTableWriter createWriter(ColumnFamilyStore cfs, long totalSize, long repairedAt) throws IOException
     {
         Directories.DataDirectory localDir = cfs.directories.getWriteableLocation();
         if (localDir == null)
             throw new IOException("Insufficient disk space to store " + totalSize + " bytes");
         desc = Descriptor.fromFilename(cfs.getTempSSTablePath(cfs.directories.getLocationForDisk(localDir)));
 
-        return new SSTableWriter(desc.filenameFor(Component.DATA), estimatedKeys);
+        return new SSTableWriter(desc.filenameFor(Component.DATA), estimatedKeys, repairedAt);
     }
 
     protected void drain(InputStream dis, long bytesRead) throws IOException
diff --git a/src/java/org/apache/cassandra/streaming/StreamRequest.java b/src/java/org/apache/cassandra/streaming/StreamRequest.java
index 9d3fdb2..9c5b974 100644
--- a/src/java/org/apache/cassandra/streaming/StreamRequest.java
+++ b/src/java/org/apache/cassandra/streaming/StreamRequest.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.streaming;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -29,6 +28,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class StreamRequest
 {
@@ -37,19 +37,21 @@
     public final String keyspace;
     public final Collection<Range<Token>> ranges;
     public final Collection<String> columnFamilies = new HashSet<>();
-
-    public StreamRequest(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies)
+    public final long repairedAt;
+    public StreamRequest(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, long repairedAt)
     {
         this.keyspace = keyspace;
         this.ranges = ranges;
         this.columnFamilies.addAll(columnFamilies);
+        this.repairedAt = repairedAt;
     }
 
     public static class StreamRequestSerializer implements IVersionedSerializer<StreamRequest>
     {
-        public void serialize(StreamRequest request, DataOutput out, int version) throws IOException
+        public void serialize(StreamRequest request, DataOutputPlus out, int version) throws IOException
         {
             out.writeUTF(request.keyspace);
+            out.writeLong(request.repairedAt);
             out.writeInt(request.ranges.size());
             for (Range<Token> range : request.ranges)
             {
@@ -64,6 +66,7 @@
         public StreamRequest deserialize(DataInput in, int version) throws IOException
         {
             String keyspace = in.readUTF();
+            long repairedAt = in.readLong();
             int rangeCount = in.readInt();
             List<Range<Token>> ranges = new ArrayList<>(rangeCount);
             for (int i = 0; i < rangeCount; i++)
@@ -76,12 +79,13 @@
             List<String> columnFamilies = new ArrayList<>(cfCount);
             for (int i = 0; i < cfCount; i++)
                 columnFamilies.add(in.readUTF());
-            return new StreamRequest(keyspace, ranges, columnFamilies);
+            return new StreamRequest(keyspace, ranges, columnFamilies, repairedAt);
         }
 
         public long serializedSize(StreamRequest request, int version)
         {
             int size = TypeSizes.NATIVE.sizeof(request.keyspace);
+            size += TypeSizes.NATIVE.sizeof(request.repairedAt);
             size += TypeSizes.NATIVE.sizeof(request.ranges.size());
             for (Range<Token> range : request.ranges)
             {
diff --git a/src/java/org/apache/cassandra/streaming/StreamResultFuture.java b/src/java/org/apache/cassandra/streaming/StreamResultFuture.java
index add14f7..f28a937 100644
--- a/src/java/org/apache/cassandra/streaming/StreamResultFuture.java
+++ b/src/java/org/apache/cassandra/streaming/StreamResultFuture.java
@@ -23,10 +23,8 @@
 import java.util.*;
 import java.util.concurrent.ConcurrentLinkedQueue;
 
-import com.google.common.collect.ImmutableSet;
 import com.google.common.util.concurrent.AbstractFuture;
 import com.google.common.util.concurrent.Futures;
-import org.cliffc.high_scale_lib.NonBlockingHashMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -49,11 +47,9 @@
 
     public final UUID planId;
     public final String description;
+    private final StreamCoordinator coordinator;
     private final Collection<StreamEventHandler> eventListeners = new ConcurrentLinkedQueue<>();
 
-    private final Map<InetAddress, StreamSession> ongoingSessions;
-    private final Map<InetAddress, SessionInfo> sessionStates = new NonBlockingHashMap<>();
-
     /**
      * Create new StreamResult of given {@code planId} and type.
      *
@@ -62,22 +58,25 @@
      * @param planId Stream plan ID
      * @param description Stream description
      */
-    private StreamResultFuture(UUID planId, String description, Collection<StreamSession> sessions)
+    private StreamResultFuture(UUID planId, String description, StreamCoordinator coordinator)
     {
         this.planId = planId;
         this.description = description;
-        this.ongoingSessions = new HashMap<>(sessions.size());
-        for (StreamSession session : sessions)
-            this.ongoingSessions.put(session.peer, session);
+        this.coordinator = coordinator;
 
         // if there is no session to listen to, we immediately set result for returning
-        if (sessions.isEmpty())
+        if (!coordinator.isReceiving() && !coordinator.hasActiveSessions())
             set(getCurrentState());
     }
 
-    static StreamResultFuture init(UUID planId, String description, Collection<StreamSession> sessions, Collection<StreamEventHandler> listeners)
+    private StreamResultFuture(UUID planId, String description)
     {
-        StreamResultFuture future = createAndRegister(planId, description, sessions);
+        this(planId, description, new StreamCoordinator(0, new DefaultConnectionFactory()));
+    }
+
+    static StreamResultFuture init(UUID planId, String description, Collection<StreamEventHandler> listeners, StreamCoordinator coordinator)
+    {
+        StreamResultFuture future = createAndRegister(planId, description, coordinator);
         if (listeners != null)
         {
             for (StreamEventHandler listener : listeners)
@@ -85,18 +84,19 @@
         }
 
         logger.info("[Stream #{}] Executing streaming plan for {}", planId,  description);
-        // start sessions
-        for (final StreamSession session : sessions)
+
+        // Initialize and start all sessions
+        for (final StreamSession session : coordinator.getAllStreamSessions())
         {
-            logger.info("[Stream #{}] Beginning stream session with {}", planId, session.peer);
             session.init(future);
-            session.start();
         }
+        coordinator.connectAllStreamSessions();
 
         return future;
     }
 
-    public static synchronized StreamResultFuture initReceivingSide(UUID planId,
+    public static synchronized StreamResultFuture initReceivingSide(int sessionIndex,
+                                                                    UUID planId,
                                                                     String description,
                                                                     InetAddress from,
                                                                     Socket socket,
@@ -106,35 +106,28 @@
         StreamResultFuture future = StreamManager.instance.getReceivingStream(planId);
         if (future == null)
         {
-            final StreamSession session = new StreamSession(from, null);
+            logger.info("[Stream #{} ID#{}] Creating new streaming plan for {}", planId, sessionIndex, description);
 
             // The main reason we create a StreamResultFuture on the receiving side is for JMX exposure.
-            future = new StreamResultFuture(planId, description, Collections.singleton(session));
+            future = new StreamResultFuture(planId, description);
             StreamManager.instance.registerReceiving(future);
-
-            session.init(future);
-            session.handler.initiateOnReceivingSide(socket, isForOutgoing, version);
         }
-        else
-        {
-            future.attachSocket(from, socket, isForOutgoing, version);
-            logger.info("[Stream #{}] Received streaming plan for {}", planId,  description);
-        }
+        future.attachSocket(from, sessionIndex, socket, isForOutgoing, version);
+        logger.info("[Stream #{}, ID#{}] Received streaming plan for {}", planId, sessionIndex, description);
         return future;
     }
 
-    private static StreamResultFuture createAndRegister(UUID planId, String description, Collection<StreamSession> sessions)
+    private static StreamResultFuture createAndRegister(UUID planId, String description, StreamCoordinator coordinator)
     {
-        StreamResultFuture future = new StreamResultFuture(planId, description, sessions);
+        StreamResultFuture future = new StreamResultFuture(planId, description, coordinator);
         StreamManager.instance.register(future);
         return future;
     }
 
-    public void attachSocket(InetAddress from, Socket socket, boolean isForOutgoing, int version) throws IOException
+    private void attachSocket(InetAddress from, int sessionIndex, Socket socket, boolean isForOutgoing, int version) throws IOException
     {
-        StreamSession session = ongoingSessions.get(from);
-        if (session == null)
-            throw new RuntimeException(String.format("Got connection from %s for stream session %s but no such session locally", from, planId));
+        StreamSession session = coordinator.getOrCreateSessionById(from, sessionIndex);
+        session.init(this);
         session.handler.initiateOnReceivingSide(socket, isForOutgoing, version);
     }
 
@@ -149,7 +142,7 @@
      */
     public StreamState getCurrentState()
     {
-        return new StreamState(planId, description, ImmutableSet.copyOf(sessionStates.values()));
+        return new StreamState(planId, description, coordinator.getAllSessionInfo());
     }
 
     @Override
@@ -170,44 +163,43 @@
     void handleSessionPrepared(StreamSession session)
     {
         SessionInfo sessionInfo = session.getSessionInfo();
-        logger.info("[Stream #{}] Prepare completed. Receiving {} files({} bytes), sending {} files({} bytes)",
+        logger.info("[Stream #{} ID#{}] Prepare completed. Receiving {} files({} bytes), sending {} files({} bytes)",
                               session.planId(),
+                              session.sessionIndex(),
                               sessionInfo.getTotalFilesToReceive(),
                               sessionInfo.getTotalSizeToReceive(),
                               sessionInfo.getTotalFilesToSend(),
                               sessionInfo.getTotalSizeToSend());
         StreamEvent.SessionPreparedEvent event = new StreamEvent.SessionPreparedEvent(planId, sessionInfo);
-        sessionStates.put(sessionInfo.peer, sessionInfo);
+        coordinator.addSessionInfo(sessionInfo);
         fireStreamEvent(event);
     }
 
     void handleSessionComplete(StreamSession session)
     {
         logger.info("[Stream #{}] Session with {} is complete", session.planId(), session.peer);
-
-        SessionInfo sessionInfo = session.getSessionInfo();
-        sessionStates.put(sessionInfo.peer, sessionInfo);
         fireStreamEvent(new StreamEvent.SessionCompleteEvent(session));
-        maybeComplete(session);
+        SessionInfo sessionInfo = session.getSessionInfo();
+        coordinator.addSessionInfo(sessionInfo);
+        maybeComplete();
     }
 
     public void handleProgress(ProgressInfo progress)
     {
-        sessionStates.get(progress.peer).updateProgress(progress);
+        coordinator.updateProgress(progress);
         fireStreamEvent(new StreamEvent.ProgressEvent(planId, progress));
     }
 
-    void fireStreamEvent(StreamEvent event)
+    synchronized void fireStreamEvent(StreamEvent event)
     {
         // delegate to listener
         for (StreamEventHandler listener : eventListeners)
             listener.handleStreamEvent(event);
     }
 
-    private synchronized void maybeComplete(StreamSession session)
+    private synchronized void maybeComplete()
     {
-        ongoingSessions.remove(session.peer);
-        if (ongoingSessions.isEmpty())
+        if (!coordinator.hasActiveSessions())
         {
             StreamState finalState = getCurrentState();
             if (finalState.hasFailedSession())
diff --git a/src/java/org/apache/cassandra/streaming/StreamSession.java b/src/java/org/apache/cassandra/streaming/StreamSession.java
index 4fcbe36..2efa00d 100644
--- a/src/java/org/apache/cassandra/streaming/StreamSession.java
+++ b/src/java/org/apache/cassandra/streaming/StreamSession.java
@@ -28,7 +28,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
@@ -41,6 +40,7 @@
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.metrics.StreamingMetrics;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.messages.*;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
@@ -109,15 +109,11 @@
  *       session is done it is closed (closeSession()). Otherwise, the node switches to the WAIT_COMPLETE state and
  *       send a CompleteMessage to the other side.
  */
-public class StreamSession implements IEndpointStateChangeSubscriber, IFailureDetectionEventListener
+public class StreamSession implements IEndpointStateChangeSubscriber
 {
     private static final Logger logger = LoggerFactory.getLogger(StreamSession.class);
-
-    // Executor that establish the streaming connection. Once we're connected to the other end, the rest of the streaming
-    // is directly handled by the ConnectionHandler incoming and outgoing threads.
-    private static final DebuggableThreadPoolExecutor streamExecutor = DebuggableThreadPoolExecutor.createWithFixedPoolSize("StreamConnectionEstablisher",
-                                                                                                                            FBUtilities.getAvailableProcessors());
     public final InetAddress peer;
+    private final int index;
 
     // should not be null when session is started
     private StreamResultFuture streamResult;
@@ -157,9 +153,10 @@
      * @param peer Address of streaming peer
      * @param factory is used for establishing connection
      */
-    public StreamSession(InetAddress peer, StreamConnectionFactory factory)
+    public StreamSession(InetAddress peer, StreamConnectionFactory factory, int index)
     {
         this.peer = peer;
+        this.index = index;
         this.factory = factory;
         this.handler = new ConnectionHandler(this);
         this.metrics = StreamingMetrics.get(peer);
@@ -170,6 +167,11 @@
         return streamResult == null ? null : streamResult.planId;
     }
 
+    public int sessionIndex()
+    {
+        return index;
+    }
+
     public String description()
     {
         return streamResult == null ? null : streamResult.description;
@@ -184,10 +186,6 @@
     public void init(StreamResultFuture streamResult)
     {
         this.streamResult = streamResult;
-
-        // register to gossiper/FD to fail on node failure
-        Gossiper.instance.register(this);
-        FailureDetector.instance.registerFailureDetectionEventListener(this);
     }
 
     public void start()
@@ -199,21 +197,16 @@
             return;
         }
 
-        streamExecutor.execute(new Runnable()
+        try
         {
-            public void run()
-            {
-                try
-                {
-                    handler.initiate();
-                    onInitializationComplete();
-                }
-                catch (IOException e)
-                {
-                    onError(e);
-                }
-            }
-        });
+            logger.info("[Stream #{}, ID#{}] Beginning stream session with {}", planId(), sessionIndex(), peer);
+            handler.initiate();
+            onInitializationComplete();
+        }
+        catch (Exception e)
+        {
+            onError(e);
+        }
     }
 
     public Socket createConnection() throws IOException
@@ -229,19 +222,42 @@
      * @param ranges Ranges to retrieve data
      * @param columnFamilies ColumnFamily names. Can be empty if requesting all CF under the keyspace.
      */
-    public void addStreamRequest(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies)
+    public void addStreamRequest(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, long repairedAt)
     {
-        requests.add(new StreamRequest(keyspace, ranges, columnFamilies));
+        requests.add(new StreamRequest(keyspace, ranges, columnFamilies, repairedAt));
     }
 
     /**
      * Set up transfer for specific keyspace/ranges/CFs
      *
+     * Used in repair - a streamed sstable in repair will be marked with the given repairedAt time
+     *
      * @param keyspace Transfer keyspace
      * @param ranges Transfer ranges
      * @param columnFamilies Transfer ColumnFamilies
+     * @param flushTables whether to flush the tables before transferring them
+     * @param repairedAt the time the repair started.
      */
-    public void addTransferRanges(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, boolean flushTables)
+    public void addTransferRanges(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, boolean flushTables, long repairedAt)
+    {
+        Collection<ColumnFamilyStore> stores = getColumnFamilyStores(keyspace, columnFamilies);
+        if (flushTables)
+            flushSSTables(stores);
+
+        List<Range<Token>> normalizedRanges = Range.normalize(ranges);
+        List<SSTableStreamingSections> sections = getSSTableSectionsForRanges(normalizedRanges, stores, repairedAt);
+        try
+        {
+            addTransferFiles(sections);
+        }
+        finally
+        {
+            for (SSTableStreamingSections release : sections)
+                release.sstable.releaseReference();
+        }
+    }
+
+    private Collection<ColumnFamilyStore> getColumnFamilyStores(String keyspace, Collection<String> columnFamilies)
     {
         Collection<ColumnFamilyStore> stores = new HashSet<>();
         // if columnfamilies are not specified, we add all cf under the keyspace
@@ -254,24 +270,10 @@
             for (String cf : columnFamilies)
                 stores.add(Keyspace.open(keyspace).getColumnFamilyStore(cf));
         }
-
-        if (flushTables)
-            flushSSTables(stores);
-
-        List<Range<Token>> normalizedRanges = Range.normalize(ranges);
-        List<SSTableStreamingSections> sections = getSSTableSectionsForRanges(normalizedRanges, stores);
-        try
-        {
-            addTransferFiles(sections);
-        }
-        finally
-        {
-            for (SSTableStreamingSections release : sections)
-                release.sstable.releaseReference();
-        }
+        return stores;
     }
 
-    private List<SSTableStreamingSections> getSSTableSectionsForRanges(Collection<Range<Token>> ranges, Collection<ColumnFamilyStore> stores)
+    private List<SSTableStreamingSections> getSSTableSectionsForRanges(Collection<Range<Token>> ranges, Collection<ColumnFamilyStore> stores, long overriddenRepairedAt)
     {
         List<SSTableReader> sstables = new ArrayList<>();
         try
@@ -281,16 +283,20 @@
                 List<AbstractBounds<RowPosition>> rowBoundsList = new ArrayList<>(ranges.size());
                 for (Range<Token> range : ranges)
                     rowBoundsList.add(range.toRowBounds());
-                ColumnFamilyStore.ViewFragment view = cfStore.markReferenced(rowBoundsList);
+                ColumnFamilyStore.ViewFragment view = cfStore.selectAndReference(cfStore.viewFilter(rowBoundsList));
                 sstables.addAll(view.sstables);
             }
 
             List<SSTableStreamingSections> sections = new ArrayList<>(sstables.size());
             for (SSTableReader sstable : sstables)
             {
+                long repairedAt = overriddenRepairedAt;
+                if (overriddenRepairedAt == ActiveRepairService.UNREPAIRED_SSTABLE)
+                    repairedAt = sstable.getSSTableMetadata().repairedAt;
                 sections.add(new SSTableStreamingSections(sstable,
                                                           sstable.getPositionsForRanges(ranges),
-                                                          sstable.estimatedKeysForRanges(ranges)));
+                                                          sstable.estimatedKeysForRanges(ranges),
+                                                          repairedAt));
             }
             return sections;
         }
@@ -301,8 +307,6 @@
         }
     }
 
-
-
     public void addTransferFiles(Collection<SSTableStreamingSections> sstableDetails)
     {
         Iterator<SSTableStreamingSections> iter = sstableDetails.iterator();
@@ -324,7 +328,7 @@
                 task = new StreamTransferTask(this, cfId);
                 transfers.put(cfId, task);
             }
-            task.addTransferFile(details.sstable, details.estimatedKeys, details.sections);
+            task.addTransferFile(details.sstable, details.estimatedKeys, details.sections, details.repairedAt);
             iter.remove();
         }
     }
@@ -334,12 +338,14 @@
         public final SSTableReader sstable;
         public final List<Pair<Long, Long>> sections;
         public final long estimatedKeys;
+        public final long repairedAt;
 
-        public SSTableStreamingSections(SSTableReader sstable, List<Pair<Long, Long>> sections, long estimatedKeys)
+        public SSTableStreamingSections(SSTableReader sstable, List<Pair<Long, Long>> sections, long estimatedKeys, long repairedAt)
         {
             this.sstable = sstable;
             this.sections = sections;
             this.estimatedKeys = estimatedKeys;
+            this.repairedAt = repairedAt;
         }
     }
 
@@ -359,8 +365,6 @@
             // incoming thread (so we would deadlock).
             handler.close();
 
-            Gossiper.instance.unregister(this);
-            FailureDetector.instance.unregisterFailureDetectionEventListener(this);
             streamResult.handleSessionComplete(this);
         }
     }
@@ -444,14 +448,14 @@
             startStreamingFiles();
     }
 
-    /**
+    /**
      * Call back for handling exception during streaming.
      *
      * @param e thrown exception
      */
     public void onError(Throwable e)
     {
-        logger.error("[Stream #" + planId() + "] Streaming error occurred", e);
+        logger.error("[Stream #{}] Streaming error occurred", planId(), e);
         // send session failure message
         if (handler.isOutgoingConnected())
             handler.sendMessage(new SessionFailedMessage());
@@ -467,7 +471,7 @@
         // prepare tasks
         state(State.PREPARING);
         for (StreamRequest request : requests)
-            addTransferRanges(request.keyspace, request.ranges, request.columnFamilies, true); // always flush on stream request
+            addTransferRanges(request.keyspace, request.ranges, request.columnFamilies, true, request.repairedAt); // always flush on stream request
         for (StreamSummary summary : summaries)
             prepareReceiving(summary);
 
@@ -520,7 +524,7 @@
 
     public void progress(Descriptor desc, ProgressInfo.Direction direction, long bytes, long total)
     {
-        ProgressInfo progress = new ProgressInfo(peer, desc.filenameFor(Component.DATA), direction, bytes, total);
+        ProgressInfo progress = new ProgressInfo(peer, index, desc.filenameFor(Component.DATA), direction, bytes, total);
         streamResult.handleProgress(progress);
     }
 
@@ -571,7 +575,7 @@
 
     public void doRetry(FileMessageHeader header, Throwable e)
     {
-        logger.warn("[Stream #" + planId() + "] Retrying for following error", e);
+        logger.warn("[Stream #{}] Retrying for following error", planId(), e);
         // retry
         retries++;
         if (retries > DatabaseDescriptor.getMaxStreamingRetries())
@@ -591,7 +595,7 @@
         List<StreamSummary> transferSummaries = Lists.newArrayList();
         for (StreamTask transfer : transfers.values())
             transferSummaries.add(transfer.getSummary());
-        return new SessionInfo(peer, receivingSummaries, transferSummaries, state);
+        return new SessionInfo(peer, index, receivingSummaries, transferSummaries, state);
     }
 
     public synchronized void taskCompleted(StreamReceiveTask completedTask)
@@ -614,23 +618,11 @@
 
     public void onRemove(InetAddress endpoint)
     {
-        convict(endpoint, Double.MAX_VALUE);
+        closeSession(State.FAILED);
     }
 
     public void onRestart(InetAddress endpoint, EndpointState epState)
     {
-        convict(endpoint, Double.MAX_VALUE);
-    }
-
-    public void convict(InetAddress endpoint, double phi)
-    {
-        if (!endpoint.equals(peer))
-            return;
-
-        // We want a higher confidence in the failure detection than usual because failing a streaming wrongly has a high cost (CASSANDRA-7063)
-        if (phi < 100 * DatabaseDescriptor.getPhiConvictThreshold())
-            return;
-
         closeSession(State.FAILED);
     }
 
diff --git a/src/java/org/apache/cassandra/streaming/StreamSummary.java b/src/java/org/apache/cassandra/streaming/StreamSummary.java
index a31e333..dc332cb 100644
--- a/src/java/org/apache/cassandra/streaming/StreamSummary.java
+++ b/src/java/org/apache/cassandra/streaming/StreamSummary.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.streaming;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.UUID;
@@ -27,6 +26,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
 
@@ -81,7 +81,7 @@
     public static class StreamSummarySerializer implements IVersionedSerializer<StreamSummary>
     {
         // arbitrary version is fine for UUIDSerializer for now...
-        public void serialize(StreamSummary summary, DataOutput out, int version) throws IOException
+        public void serialize(StreamSummary summary, DataOutputPlus out, int version) throws IOException
         {
             UUIDSerializer.serializer.serialize(summary.cfId, out, MessagingService.current_version);
             out.writeInt(summary.files);
diff --git a/src/java/org/apache/cassandra/streaming/StreamTransferTask.java b/src/java/org/apache/cassandra/streaming/StreamTransferTask.java
index a543d01..48a7d89 100644
--- a/src/java/org/apache/cassandra/streaming/StreamTransferTask.java
+++ b/src/java/org/apache/cassandra/streaming/StreamTransferTask.java
@@ -20,6 +20,7 @@
 import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.streaming.messages.OutgoingFileMessage;
@@ -33,6 +34,7 @@
     private final ScheduledExecutorService timeoutExecutor = Executors.newSingleThreadScheduledExecutor();
 
     private final AtomicInteger sequenceNumber = new AtomicInteger(0);
+    private AtomicBoolean aborted = new AtomicBoolean(false);
 
     private final Map<Integer, OutgoingFileMessage> files = new ConcurrentHashMap<>();
 
@@ -45,10 +47,10 @@
         super(session, cfId);
     }
 
-    public void addTransferFile(SSTableReader sstable, long estimatedKeys, List<Pair<Long, Long>> sections)
+    public void addTransferFile(SSTableReader sstable, long estimatedKeys, List<Pair<Long, Long>> sections, long repairedAt)
     {
         assert sstable != null && cfId.equals(sstable.metadata.cfId);
-        OutgoingFileMessage message = new OutgoingFileMessage(sstable, sequenceNumber.getAndIncrement(), estimatedKeys, sections);
+        OutgoingFileMessage message = new OutgoingFileMessage(sstable, sequenceNumber.getAndIncrement(), estimatedKeys, sections, repairedAt);
         files.put(message.header.sequenceNumber, message);
         totalSize += message.header.size();
     }
@@ -75,11 +77,15 @@
 
     public void abort()
     {
-        for (OutgoingFileMessage file : files.values())
+        // Prevent releasing references multiple times
+        if (aborted.compareAndSet(false, true))
         {
-            file.sstable.releaseReference();
+            for (OutgoingFileMessage file : files.values())
+            {
+                file.sstable.releaseReference();
+            }
+            timeoutExecutor.shutdownNow();
         }
-        timeoutExecutor.shutdownNow();
     }
 
     public int getTotalNumberOfFiles()
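The abort() change above guards the reference release with an AtomicBoolean so the cleanup runs at most once. A minimal sketch of that compare-and-set guard, using hypothetical class and method names:

    import java.util.concurrent.atomic.AtomicBoolean;

    class OneShotAbortSketch
    {
        private final AtomicBoolean aborted = new AtomicBoolean(false);

        public void abort()
        {
            // compareAndSet(false, true) succeeds for exactly one caller, so the
            // cleanup below runs at most once even under concurrent aborts.
            if (aborted.compareAndSet(false, true))
                releaseResources();
        }

        private void releaseResources()
        {
            // e.g. release sstable references and shut down the timeout executor
        }
    }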
diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java b/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java
index ef019c2..449546f 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java
@@ -23,6 +23,7 @@
 import java.util.Iterator;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.zip.Adler32;
 import java.util.zip.CRC32;
 import java.util.zip.Checksum;
@@ -114,7 +115,7 @@
         totalCompressedBytesRead += compressed.length;
 
         // validate crc randomly
-        if (info.parameters.getCrcCheckChance() > FBUtilities.threadLocalRandom().nextDouble())
+        if (info.parameters.getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
         {
             if (hasPostCompressionAdlerChecksums)
             {
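The CompressedInputStream hunk swaps FBUtilities.threadLocalRandom() for java.util.concurrent.ThreadLocalRandom in the sampled CRC check, which validates each chunk with probability crcCheckChance. A rough, self-contained sketch of that sampling pattern (hypothetical class, not this patch's code):

    import java.util.concurrent.ThreadLocalRandom;
    import java.util.zip.CRC32;

    class SampledChecksumSketch
    {
        private final double crcCheckChance; // probability in [0, 1] of validating a chunk

        SampledChecksumSketch(double crcCheckChance)
        {
            this.crcCheckChance = crcCheckChance;
        }

        boolean verify(byte[] chunk, long expectedCrc)
        {
            // ThreadLocalRandom avoids contention on a shared Random instance.
            if (crcCheckChance <= ThreadLocalRandom.current().nextDouble())
                return true; // skip validation for this chunk

            CRC32 crc = new CRC32();
            crc.update(chunk, 0, chunk.length);
            return crc.getValue() == expectedCrc;
        }
    }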
diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java
index 219cabb..fb2599f 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java
@@ -24,11 +24,13 @@
 
 import com.google.common.base.Throwables;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableWriter;
 import org.apache.cassandra.streaming.ProgressInfo;
 import org.apache.cassandra.streaming.StreamReader;
@@ -42,6 +44,8 @@
  */
 public class CompressedStreamReader extends StreamReader
 {
+    private static final Logger logger = LoggerFactory.getLogger(CompressedStreamReader.class);
+
     protected final CompressionInfo compressionInfo;
 
     public CompressedStreamReader(FileMessageHeader header, StreamSession session)
@@ -57,6 +61,7 @@
     @Override
     public SSTableWriter read(ReadableByteChannel channel) throws IOException
     {
+        logger.debug("reading file from {}, repairedAt = {}", session.peer, repairedAt);
         long totalSize = totalSize();
 
         Pair<String, String> kscf = Schema.instance.getCF(cfId);
@@ -67,7 +72,7 @@
         }
         ColumnFamilyStore cfs = Keyspace.open(kscf.left).getColumnFamilyStore(kscf.right);
 
-        SSTableWriter writer = createWriter(cfs, totalSize);
+        SSTableWriter writer = createWriter(cfs, totalSize, repairedAt);
 
         CompressedInputStream cis = new CompressedInputStream(Channels.newInputStream(channel), compressionInfo, inputVersion.hasPostCompressionAdlerChecksums);
         BytesReadTracker in = new BytesReadTracker(new DataInputStream(cis));
diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java b/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java
index 3f0ef3a..8cfcd95 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
  * Container that carries compression parameters and chunks to decompress data from stream.
@@ -45,7 +46,7 @@
 
     static class CompressionInfoSerializer implements IVersionedSerializer<CompressionInfo>
     {
-        public void serialize(CompressionInfo info, DataOutput out, int version) throws IOException
+        public void serialize(CompressionInfo info, DataOutputPlus out, int version) throws IOException
         {
             if (info == null)
             {
diff --git a/src/java/org/apache/cassandra/streaming/management/ProgressInfoCompositeData.java b/src/java/org/apache/cassandra/streaming/management/ProgressInfoCompositeData.java
index b361b1b..a54498d 100644
--- a/src/java/org/apache/cassandra/streaming/management/ProgressInfoCompositeData.java
+++ b/src/java/org/apache/cassandra/streaming/management/ProgressInfoCompositeData.java
@@ -32,18 +32,21 @@
 {
     private static final String[] ITEM_NAMES = new String[]{"planId",
                                                             "peer",
+                                                            "sessionIndex",
                                                             "fileName",
                                                             "direction",
                                                             "currentBytes",
                                                             "totalBytes"};
     private static final String[] ITEM_DESCS = new String[]{"String representation of Plan ID",
                                                             "Session peer",
+                                                            "Index of session",
                                                             "Name of the file",
                                                             "Direction('IN' or 'OUT')",
                                                             "Current bytes transferred",
                                                             "Total bytes to transfer"};
     private static final OpenType<?>[] ITEM_TYPES = new OpenType[]{SimpleType.STRING,
                                                                    SimpleType.STRING,
+                                                                   SimpleType.INTEGER,
                                                                    SimpleType.STRING,
                                                                    SimpleType.STRING,
                                                                    SimpleType.LONG,
@@ -70,10 +73,11 @@
         Map<String, Object> valueMap = new HashMap<>();
         valueMap.put(ITEM_NAMES[0], planId.toString());
         valueMap.put(ITEM_NAMES[1], progressInfo.peer.getHostAddress());
-        valueMap.put(ITEM_NAMES[2], progressInfo.fileName);
-        valueMap.put(ITEM_NAMES[3], progressInfo.direction.name());
-        valueMap.put(ITEM_NAMES[4], progressInfo.currentBytes);
-        valueMap.put(ITEM_NAMES[5], progressInfo.totalBytes);
+        valueMap.put(ITEM_NAMES[2], progressInfo.sessionIndex);
+        valueMap.put(ITEM_NAMES[3], progressInfo.fileName);
+        valueMap.put(ITEM_NAMES[4], progressInfo.direction.name());
+        valueMap.put(ITEM_NAMES[5], progressInfo.currentBytes);
+        valueMap.put(ITEM_NAMES[6], progressInfo.totalBytes);
         try
         {
             return new CompositeDataSupport(COMPOSITE_TYPE, valueMap);
@@ -90,10 +94,11 @@
         try
         {
             return new ProgressInfo(InetAddress.getByName((String) values[1]),
-                                    (String) values[2],
-                                    ProgressInfo.Direction.valueOf((String)values[3]),
-                                    (long) values[4],
-                                    (long) values[5]);
+                                    (int) values[2],
+                                    (String) values[3],
+                                    ProgressInfo.Direction.valueOf((String)values[4]),
+                                    (long) values[5],
+                                    (long) values[6]);
         }
         catch (UnknownHostException e)
         {
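Adding the sessionIndex item above means the JMX item names, descriptions, open types, and the indices used when filling the value map must all shift together. A small sketch of building an index-aligned CompositeData; the field set here is hypothetical:

    import java.util.HashMap;
    import java.util.Map;
    import javax.management.openmbean.*;

    class CompositeDataSketch
    {
        private static final String[]      NAMES = { "planId", "peer", "sessionIndex" };
        private static final String[]      DESCS = { "Plan ID", "Session peer", "Index of session" };
        private static final OpenType<?>[] TYPES = { SimpleType.STRING, SimpleType.STRING, SimpleType.INTEGER };

        static CompositeData toComposite(String planId, String peer, int sessionIndex) throws OpenDataException
        {
            CompositeType type = new CompositeType("ProgressSketch", "progress info", NAMES, DESCS, TYPES);
            Map<String, Object> values = new HashMap<>();
            // every item declared in NAMES must be present, at the same index and meaning
            values.put(NAMES[0], planId);
            values.put(NAMES[1], peer);
            values.put(NAMES[2], sessionIndex);
            return new CompositeDataSupport(type, values);
        }
    }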
diff --git a/src/java/org/apache/cassandra/streaming/management/SessionInfoCompositeData.java b/src/java/org/apache/cassandra/streaming/management/SessionInfoCompositeData.java
index 658facf..bef6682 100644
--- a/src/java/org/apache/cassandra/streaming/management/SessionInfoCompositeData.java
+++ b/src/java/org/apache/cassandra/streaming/management/SessionInfoCompositeData.java
@@ -40,14 +40,16 @@
                                                             "sendingSummaries",
                                                             "state",
                                                             "receivingFiles",
-                                                            "sendingFiles"};
+                                                            "sendingFiles",
+                                                            "sessionIndex"};
     private static final String[] ITEM_DESCS = new String[]{"Plan ID",
                                                             "Session peer",
                                                             "Summaries of receiving data",
                                                             "Summaries of sending data",
                                                             "Current session state",
                                                             "Receiving files",
-                                                            "Sending files"};
+                                                            "Sending files",
+                                                            "Session index"};
     private static final OpenType<?>[] ITEM_TYPES;
 
     public static final CompositeType COMPOSITE_TYPE;
@@ -60,7 +62,8 @@
                                         ArrayType.getArrayType(StreamSummaryCompositeData.COMPOSITE_TYPE),
                                         SimpleType.STRING,
                                         ArrayType.getArrayType(ProgressInfoCompositeData.COMPOSITE_TYPE),
-                                        ArrayType.getArrayType(ProgressInfoCompositeData.COMPOSITE_TYPE)};
+                                        ArrayType.getArrayType(ProgressInfoCompositeData.COMPOSITE_TYPE),
+                                        SimpleType.INTEGER};
             COMPOSITE_TYPE = new CompositeType(SessionInfo.class.getName(),
                                                "SessionInfo",
                                                ITEM_NAMES,
@@ -97,6 +100,7 @@
         };
         valueMap.put(ITEM_NAMES[5], toArrayOfCompositeData(sessionInfo.getReceivingFiles(), fromProgressInfo));
         valueMap.put(ITEM_NAMES[6], toArrayOfCompositeData(sessionInfo.getSendingFiles(), fromProgressInfo));
+        valueMap.put(ITEM_NAMES[7], sessionInfo.sessionIndex);
         try
         {
             return new CompositeDataSupport(COMPOSITE_TYPE, valueMap);
@@ -129,6 +133,7 @@
             }
         };
         SessionInfo info = new SessionInfo(peer,
+                                           (int)values[7],
                                            fromArrayOfCompositeData((CompositeData[]) values[2], toStreamSummary),
                                            fromArrayOfCompositeData((CompositeData[]) values[3], toStreamSummary),
                                            StreamSession.State.valueOf((String) values[4]));
diff --git a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java
index 495b88a..ec9c66c 100644
--- a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java
@@ -19,8 +19,8 @@
 
 import java.io.IOException;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamSession;
 
 public class CompleteMessage extends StreamMessage
@@ -32,7 +32,7 @@
             return new CompleteMessage();
         }
 
-        public void serialize(CompleteMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException {}
+        public void serialize(CompleteMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException {}
     };
 
     public CompleteMessage()
diff --git a/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java b/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java
index 24f1e04..284820e 100644
--- a/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java
+++ b/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.streaming.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -27,6 +26,7 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.compress.CompressionInfo;
 import org.apache.cassandra.utils.Pair;
@@ -46,13 +46,15 @@
     public final long estimatedKeys;
     public final List<Pair<Long, Long>> sections;
     public final CompressionInfo compressionInfo;
+    public final long repairedAt;
 
     public FileMessageHeader(UUID cfId,
                              int sequenceNumber,
                              String version,
                              long estimatedKeys,
                              List<Pair<Long, Long>> sections,
-                             CompressionInfo compressionInfo)
+                             CompressionInfo compressionInfo,
+                             long repairedAt)
     {
         this.cfId = cfId;
         this.sequenceNumber = sequenceNumber;
@@ -60,6 +62,7 @@
         this.estimatedKeys = estimatedKeys;
         this.sections = sections;
         this.compressionInfo = compressionInfo;
+        this.repairedAt = repairedAt;
     }
 
     /**
@@ -92,6 +95,7 @@
         sb.append(", estimated keys: ").append(estimatedKeys);
         sb.append(", transfer size: ").append(size());
         sb.append(", compressed?: ").append(compressionInfo != null);
+        sb.append(", repairedAt: ").append(repairedAt);
         sb.append(')');
         return sb.toString();
     }
@@ -115,7 +119,7 @@
 
     static class FileMessageHeaderSerializer implements IVersionedSerializer<FileMessageHeader>
     {
-        public void serialize(FileMessageHeader header, DataOutput out, int version) throws IOException
+        public void serialize(FileMessageHeader header, DataOutputPlus out, int version) throws IOException
         {
             UUIDSerializer.serializer.serialize(header.cfId, out, version);
             out.writeInt(header.sequenceNumber);
@@ -129,6 +133,7 @@
                 out.writeLong(section.right);
             }
             CompressionInfo.serializer.serialize(header.compressionInfo, out, version);
+            out.writeLong(header.repairedAt);
         }
 
         public FileMessageHeader deserialize(DataInput in, int version) throws IOException
@@ -142,7 +147,8 @@
             for (int k = 0; k < count; k++)
                 sections.add(Pair.create(in.readLong(), in.readLong()));
             CompressionInfo compressionInfo = CompressionInfo.serializer.deserialize(in, MessagingService.current_version);
-            return new FileMessageHeader(cfId, sequenceNumber, sstableVersion, estimatedKeys, sections, compressionInfo);
+            long repairedAt = in.readLong();
+            return new FileMessageHeader(cfId, sequenceNumber, sstableVersion, estimatedKeys, sections, compressionInfo, repairedAt);
         }
 
         public long serializedSize(FileMessageHeader header, int version)
diff --git a/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java b/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java
index a403390..8569b88 100644
--- a/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java
@@ -21,12 +21,13 @@
 import java.io.IOException;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 
 import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamReader;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.streaming.compress.CompressedStreamReader;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * IncomingFileMessage is used to receive the part(or whole) of a SSTable data file.
@@ -46,14 +47,15 @@
             {
                 return new IncomingFileMessage(reader.read(in), header);
             }
-            catch (Throwable e)
+            catch (Throwable t)
             {
-                session.doRetry(header, e);
+                JVMStabilityInspector.inspectThrowable(t);
+                session.doRetry(header, t);
                 return null;
             }
         }
 
-        public void serialize(IncomingFileMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+        public void serialize(IncomingFileMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
         {
             throw new UnsupportedOperationException("Not allowed to call serialize on an incoming file");
         }
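The IncomingFileMessage change runs JVMStabilityInspector.inspectThrowable(t) before asking the session to retry, so a fatal error such as OutOfMemoryError is not swallowed by the retry path. A rough sketch of that kind of guard (not the actual inspector):

    class StabilityGuardSketch
    {
        // Examine a caught Throwable before attempting a retry: an OutOfMemoryError
        // is not something a retry can recover from.
        static void inspect(Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                throw (OutOfMemoryError) t; // a real implementation might also log and halt the JVM
        }

        void receive(Runnable readFile, Runnable retry)
        {
            try
            {
                readFile.run();
            }
            catch (Throwable t)
            {
                inspect(t);  // rethrows fatal errors
                retry.run(); // otherwise fall back to the retry path
            }
        }
    }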
diff --git a/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java b/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java
index 1fa115f..b012869 100644
--- a/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java
@@ -17,16 +17,14 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.DataOutput;
-import java.io.DataOutputStream;
 import java.io.IOException;
-import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 import java.util.List;
 
 import org.apache.cassandra.io.compress.CompressionMetadata;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.streaming.StreamWriter;
 import org.apache.cassandra.streaming.compress.CompressedStreamWriter;
@@ -45,10 +43,9 @@
             throw new UnsupportedOperationException("Not allowed to call deserialize on an outgoing file");
         }
 
-        public void serialize(OutgoingFileMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+        public void serialize(OutgoingFileMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
         {
-            DataOutput output = new DataOutputStream(Channels.newOutputStream(out));
-            FileMessageHeader.serializer.serialize(message.header, output, version);
+            FileMessageHeader.serializer.serialize(message.header, out, version);
 
             final SSTableReader reader = message.sstable;
             StreamWriter writer = message.header.compressionInfo == null ?
@@ -56,7 +53,7 @@
                     new CompressedStreamWriter(reader,
                             message.header.sections,
                             message.header.compressionInfo, session);
-            writer.write(out);
+            writer.write(out.getChannel());
             session.fileSent(message.header);
         }
     };
@@ -64,7 +61,7 @@
     public FileMessageHeader header;
     public SSTableReader sstable;
 
-    public OutgoingFileMessage(SSTableReader sstable, int sequenceNumber, long estimatedKeys, List<Pair<Long, Long>> sections)
+    public OutgoingFileMessage(SSTableReader sstable, int sequenceNumber, long estimatedKeys, List<Pair<Long, Long>> sections, long repairedAt)
     {
         super(Type.FILE);
         this.sstable = sstable;
@@ -76,11 +73,12 @@
             compressionInfo = new CompressionInfo(meta.getChunksForSections(sections), meta.parameters);
         }
         this.header = new FileMessageHeader(sstable.metadata.cfId,
-                sequenceNumber,
-                sstable.descriptor.version.toString(),
-                estimatedKeys,
-                sections,
-                compressionInfo);
+                                            sequenceNumber,
+                                            sstable.descriptor.version.toString(),
+                                            estimatedKeys,
+                                            sections,
+                                            compressionInfo,
+                                            repairedAt);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java
index 16d156d..0bc5982 100644
--- a/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java
@@ -20,10 +20,11 @@
 import java.io.*;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
 import java.util.Collection;
 
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamRequest;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.streaming.StreamSummary;
@@ -47,17 +48,16 @@
             return message;
         }
 
-        public void serialize(PrepareMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+        public void serialize(PrepareMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
         {
-            DataOutput output = new DataOutputStream(Channels.newOutputStream(out));
             // requests
-            output.writeInt(message.requests.size());
+            out.writeInt(message.requests.size());
             for (StreamRequest request : message.requests)
-                StreamRequest.serializer.serialize(request, output, version);
+                StreamRequest.serializer.serialize(request, out, version);
             // summaries
-            output.writeInt(message.summaries.size());
+            out.writeInt(message.summaries.size());
             for (StreamSummary summary : message.summaries)
-                StreamSummary.serializer.serialize(summary, output, version);
+                StreamSummary.serializer.serialize(summary, out, version);
         }
     };
 
diff --git a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java
index a210348..e556651 100644
--- a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java
@@ -20,9 +20,10 @@
 import java.io.*;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 import java.util.UUID;
 
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -37,11 +38,10 @@
             return new ReceivedMessage(UUIDSerializer.serializer.deserialize(input, MessagingService.current_version), input.readInt());
         }
 
-        public void serialize(ReceivedMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+        public void serialize(ReceivedMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
         {
-            DataOutput output = new DataOutputStream(Channels.newOutputStream(out));
-            UUIDSerializer.serializer.serialize(message.cfId, output, MessagingService.current_version);
-            output.writeInt(message.sequenceNumber);
+            UUIDSerializer.serializer.serialize(message.cfId, out, MessagingService.current_version);
+            out.writeInt(message.sequenceNumber);
         }
     };
 
diff --git a/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java b/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java
index 666257f..50b8873 100644
--- a/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java
@@ -20,9 +20,10 @@
 import java.io.*;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 import java.util.UUID;
 
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -37,11 +38,10 @@
             return new RetryMessage(UUIDSerializer.serializer.deserialize(input, MessagingService.current_version), input.readInt());
         }
 
-        public void serialize(RetryMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+        public void serialize(RetryMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
         {
-            DataOutput output = new DataOutputStream(Channels.newOutputStream(out));
-            UUIDSerializer.serializer.serialize(message.cfId, output, MessagingService.current_version);
-            output.writeInt(message.sequenceNumber);
+            UUIDSerializer.serializer.serialize(message.cfId, out, MessagingService.current_version);
+            out.writeInt(message.sequenceNumber);
         }
     };
 
diff --git a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java
index 7144a14..ae15620 100644
--- a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java
@@ -19,8 +19,8 @@
 
 import java.io.IOException;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamSession;
 
 public class SessionFailedMessage extends StreamMessage
@@ -32,7 +32,7 @@
             return new SessionFailedMessage();
         }
 
-        public void serialize(SessionFailedMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException {}
+        public void serialize(SessionFailedMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException {}
     };
 
     public SessionFailedMessage()
diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java
index 025daab..a9ec4ae 100644
--- a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.streaming.messages;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
@@ -27,6 +26,7 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -40,15 +40,17 @@
     public static IVersionedSerializer<StreamInitMessage> serializer = new StreamInitMessageSerializer();
 
     public final InetAddress from;
+    public final int sessionIndex;
     public final UUID planId;
     public final String description;
 
     // true if this init message is to connect for outgoing message on receiving side
     public final boolean isForOutgoing;
 
-    public StreamInitMessage(InetAddress from, UUID planId, String description, boolean isForOutgoing)
+    public StreamInitMessage(InetAddress from, int sessionIndex, UUID planId, String description, boolean isForOutgoing)
     {
         this.from = from;
+        this.sessionIndex = sessionIndex;
         this.planId = planId;
         this.description = description;
         this.isForOutgoing = isForOutgoing;
@@ -96,9 +98,10 @@
 
     private static class StreamInitMessageSerializer implements IVersionedSerializer<StreamInitMessage>
     {
-        public void serialize(StreamInitMessage message, DataOutput out, int version) throws IOException
+        public void serialize(StreamInitMessage message, DataOutputPlus out, int version) throws IOException
         {
             CompactEndpointSerializationHelper.serialize(message.from, out);
+            out.writeInt(message.sessionIndex);
             UUIDSerializer.serializer.serialize(message.planId, out, MessagingService.current_version);
             out.writeUTF(message.description);
             out.writeBoolean(message.isForOutgoing);
@@ -107,15 +110,17 @@
         public StreamInitMessage deserialize(DataInput in, int version) throws IOException
         {
             InetAddress from = CompactEndpointSerializationHelper.deserialize(in);
+            int sessionIndex = in.readInt();
             UUID planId = UUIDSerializer.serializer.deserialize(in, MessagingService.current_version);
             String description = in.readUTF();
             boolean sentByInitiator = in.readBoolean();
-            return new StreamInitMessage(from, planId, description, sentByInitiator);
+            return new StreamInitMessage(from, sessionIndex, planId, description, sentByInitiator);
         }
 
         public long serializedSize(StreamInitMessage message, int version)
         {
             long size = CompactEndpointSerializationHelper.serializedSize(message.from);
+            size += TypeSizes.NATIVE.sizeof(message.sessionIndex);
             size += UUIDSerializer.serializer.serializedSize(message.planId, MessagingService.current_version);
             size += TypeSizes.NATIVE.sizeof(message.description);
             size += TypeSizes.NATIVE.sizeof(message.isForOutgoing);
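Adding sessionIndex to StreamInitMessage touches three places that must stay consistent: serialize(), deserialize(), and serializedSize(). A minimal sketch of keeping a new int field aligned across all three, using a hypothetical message type:

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    class InitMessageSketch
    {
        static final class Init
        {
            final int sessionIndex;
            final String description;

            Init(int sessionIndex, String description)
            {
                this.sessionIndex = sessionIndex;
                this.description = description;
            }
        }

        void serialize(Init msg, DataOutput out) throws IOException
        {
            out.writeInt(msg.sessionIndex);  // new field is written first...
            out.writeUTF(msg.description);
        }

        Init deserialize(DataInput in) throws IOException
        {
            int sessionIndex = in.readInt(); // ...read back in the same order...
            String description = in.readUTF();
            return new Init(sessionIndex, description);
        }

        long serializedSize(Init msg)
        {
            // ...and counted in the size: 4 bytes for the int, plus a 2-byte length
            // prefix and the string bytes (exact only for ASCII-only strings).
            return 4 + 2 + msg.description.length();
        }
    }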
diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java
index 7010c95..360b59e 100644
--- a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java
@@ -22,6 +22,7 @@
 import java.nio.channels.ReadableByteChannel;
 import java.nio.channels.WritableByteChannel;
 
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.streaming.StreamSession;
 
 /**
@@ -32,16 +33,15 @@
 public abstract class StreamMessage
 {
     /** Streaming protocol version */
-    public static final int CURRENT_VERSION = 1;
+    public static final int CURRENT_VERSION = 2;
 
-    public static void serialize(StreamMessage message, WritableByteChannel out, int version, StreamSession session) throws IOException
+    public static void serialize(StreamMessage message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException
     {
         ByteBuffer buff = ByteBuffer.allocate(1);
         // message type
         buff.put(message.type.type);
         buff.flip();
-        while (buff.hasRemaining())
-            out.write(buff);
+        out.write(buff);
         message.type.outSerializer.serialize(message, out, version, session);
     }
 
@@ -66,7 +66,7 @@
     public static interface Serializer<V extends StreamMessage>
     {
         V deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException;
-        void serialize(V message, WritableByteChannel out, int version, StreamSession session) throws IOException;
+        void serialize(V message, DataOutputStreamAndChannel out, int version, StreamSession session) throws IOException;
     }
 
     /** StreamMessage types */
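The StreamMessage change drops the manual write loop because the new stream-style write(ByteBuffer) drains the whole buffer, whereas a raw WritableByteChannel.write() may return after a partial write. A sketch of why the loop was needed at every call site before; the helper name is hypothetical:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.WritableByteChannel;

    final class FullWriteSketch
    {
        // A raw channel write may consume fewer bytes than remain in the buffer,
        // so callers either loop at each call site or route through a helper like this.
        static void writeFully(WritableByteChannel channel, ByteBuffer buf) throws IOException
        {
            while (buf.hasRemaining())
                channel.write(buf);
        }
    }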
diff --git a/src/java/org/apache/cassandra/thrift/CassandraServer.java b/src/java/org/apache/cassandra/thrift/CassandraServer.java
index cfb0e80..2e76ee4 100644
--- a/src/java/org/apache/cassandra/thrift/CassandraServer.java
+++ b/src/java/org/apache/cassandra/thrift/CassandraServer.java
@@ -21,6 +21,7 @@
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.TimeoutException;
@@ -34,6 +35,7 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.primitives.Longs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -46,12 +48,14 @@
 import org.apache.cassandra.cql.CQLStatement;
 import org.apache.cassandra.cql.QueryProcessor;
 import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.TimeUUIDType;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.*;
@@ -60,7 +64,7 @@
 import org.apache.cassandra.metrics.ClientMetrics;
 import org.apache.cassandra.scheduler.IRequestScheduler;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.service.CASConditions;
+import org.apache.cassandra.service.CASRequest;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.StorageProxy;
@@ -130,65 +134,65 @@
         return columnFamilyKeyMap;
     }
 
-    public List<ColumnOrSuperColumn> thriftifyColumns(Collection<org.apache.cassandra.db.Column> columns, boolean reverseOrder, long now)
+    public List<ColumnOrSuperColumn> thriftifyColumns(Collection<Cell> cells, boolean reverseOrder, long now)
     {
-        ArrayList<ColumnOrSuperColumn> thriftColumns = new ArrayList<ColumnOrSuperColumn>(columns.size());
-        for (org.apache.cassandra.db.Column column : columns)
+        ArrayList<ColumnOrSuperColumn> thriftColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
+        for (Cell cell : cells)
         {
-            if (column.isMarkedForDelete(now))
+            if (!cell.isLive(now))
                 continue;
 
-            thriftColumns.add(thriftifyColumnWithName(column, column.name()));
+            thriftColumns.add(thriftifyColumnWithName(cell, cell.name().toByteBuffer()));
         }
 
         // we have to do the reversing here, since internally we pass results around in ColumnFamily
-        // objects, which always sort their columns in the "natural" order
+        // objects, which always sort their cells in the "natural" order
         // TODO this is inconvenient for direct users of StorageProxy
         if (reverseOrder)
             Collections.reverse(thriftColumns);
         return thriftColumns;
     }
 
-    private ColumnOrSuperColumn thriftifyColumnWithName(org.apache.cassandra.db.Column column, ByteBuffer newName)
+    private ColumnOrSuperColumn thriftifyColumnWithName(Cell cell, ByteBuffer newName)
     {
-        if (column instanceof org.apache.cassandra.db.CounterColumn)
-            return new ColumnOrSuperColumn().setCounter_column(thriftifySubCounter(column).setName(newName));
+        if (cell instanceof CounterCell)
+            return new ColumnOrSuperColumn().setCounter_column(thriftifySubCounter(cell).setName(newName));
         else
-            return new ColumnOrSuperColumn().setColumn(thriftifySubColumn(column).setName(newName));
+            return new ColumnOrSuperColumn().setColumn(thriftifySubColumn(cell).setName(newName));
     }
 
-    private Column thriftifySubColumn(org.apache.cassandra.db.Column column)
+    private Column thriftifySubColumn(Cell cell)
     {
-        assert !(column instanceof org.apache.cassandra.db.CounterColumn);
+        assert !(cell instanceof CounterCell);
 
-        Column thrift_column = new Column(column.name()).setValue(column.value()).setTimestamp(column.timestamp());
-        if (column instanceof ExpiringColumn)
+        Column thrift_column = new Column(cell.name().toByteBuffer()).setValue(cell.value()).setTimestamp(cell.timestamp());
+        if (cell instanceof ExpiringCell)
         {
-            thrift_column.setTtl(((ExpiringColumn) column).getTimeToLive());
+            thrift_column.setTtl(((ExpiringCell) cell).getTimeToLive());
         }
         return thrift_column;
     }
 
-    private List<Column> thriftifyColumnsAsColumns(Collection<org.apache.cassandra.db.Column> columns, long now)
+    private List<Column> thriftifyColumnsAsColumns(Collection<Cell> cells, long now)
     {
-        List<Column> thriftColumns = new ArrayList<Column>(columns.size());
-        for (org.apache.cassandra.db.Column column : columns)
+        List<Column> thriftColumns = new ArrayList<Column>(cells.size());
+        for (Cell cell : cells)
         {
-            if (column.isMarkedForDelete(now))
+            if (!cell.isLive(now))
                 continue;
 
-            thriftColumns.add(thriftifySubColumn(column));
+            thriftColumns.add(thriftifySubColumn(cell));
         }
         return thriftColumns;
     }
 
-    private CounterColumn thriftifySubCounter(org.apache.cassandra.db.Column column)
+    private CounterColumn thriftifySubCounter(Cell cell)
     {
-        assert column instanceof org.apache.cassandra.db.CounterColumn;
-        return new CounterColumn(column.name(), CounterContext.instance().total(column.value()));
+        assert cell instanceof CounterCell;
+        return new CounterColumn(cell.name().toByteBuffer(), CounterContext.instance().total(cell.value()));
     }
 
-    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<org.apache.cassandra.db.Column> columns,
+    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<Cell> cells,
                                                             boolean reverseOrder,
                                                             long now,
                                                             boolean subcolumnsOnly,
@@ -196,13 +200,13 @@
     {
         if (subcolumnsOnly)
         {
-            ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(columns.size());
-            for (org.apache.cassandra.db.Column column : columns)
+            ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
+            for (Cell cell : cells)
             {
-                if (column.isMarkedForDelete(now))
+                if (!cell.isLive(now))
                     continue;
 
-                thriftSuperColumns.add(thriftifyColumnWithName(column, SuperColumns.subName(column.name())));
+                thriftSuperColumns.add(thriftifyColumnWithName(cell, SuperColumns.subName(cell.name())));
             }
             if (reverseOrder)
                 Collections.reverse(thriftSuperColumns);
@@ -211,28 +215,28 @@
         else
         {
             if (isCounterCF)
-                return thriftifyCounterSuperColumns(columns, reverseOrder, now);
+                return thriftifyCounterSuperColumns(cells, reverseOrder, now);
             else
-                return thriftifySuperColumns(columns, reverseOrder, now);
+                return thriftifySuperColumns(cells, reverseOrder, now);
         }
     }
 
-    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<org.apache.cassandra.db.Column> columns, boolean reverseOrder, long now)
+    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<Cell> cells, boolean reverseOrder, long now)
     {
-        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(columns.size());
+        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
         SuperColumn current = null;
-        for (org.apache.cassandra.db.Column column : columns)
+        for (Cell cell : cells)
         {
-            if (column.isMarkedForDelete(now))
+            if (!cell.isLive(now))
                 continue;
 
-            ByteBuffer scName = SuperColumns.scName(column.name());
+            ByteBuffer scName = SuperColumns.scName(cell.name());
             if (current == null || !scName.equals(current.bufferForName()))
             {
                 current = new SuperColumn(scName, new ArrayList<Column>());
                 thriftSuperColumns.add(new ColumnOrSuperColumn().setSuper_column(current));
             }
-            current.getColumns().add(thriftifySubColumn(column).setName(SuperColumns.subName(column.name())));
+            current.getColumns().add(thriftifySubColumn(cell).setName(SuperColumns.subName(cell.name())));
         }
 
         if (reverseOrder)
@@ -241,22 +245,22 @@
         return thriftSuperColumns;
     }
 
-    private List<ColumnOrSuperColumn> thriftifyCounterSuperColumns(Collection<org.apache.cassandra.db.Column> columns, boolean reverseOrder, long now)
+    private List<ColumnOrSuperColumn> thriftifyCounterSuperColumns(Collection<Cell> cells, boolean reverseOrder, long now)
     {
-        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(columns.size());
+        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
         CounterSuperColumn current = null;
-        for (org.apache.cassandra.db.Column column : columns)
+        for (Cell cell : cells)
         {
-            if (column.isMarkedForDelete(now))
+            if (!cell.isLive(now))
                 continue;
 
-            ByteBuffer scName = SuperColumns.scName(column.name());
+            ByteBuffer scName = SuperColumns.scName(cell.name());
             if (current == null || !scName.equals(current.bufferForName()))
             {
                 current = new CounterSuperColumn(scName, new ArrayList<CounterColumn>());
                 thriftSuperColumns.add(new ColumnOrSuperColumn().setCounter_super_column(current));
             }
-            current.getColumns().add(thriftifySubCounter(column).setName(SuperColumns.subName(column.name())));
+            current.getColumns().add(thriftifySubCounter(cell).setName(SuperColumns.subName(cell.name())));
         }
 
         if (reverseOrder)
@@ -283,12 +287,12 @@
 
     private List<ColumnOrSuperColumn> thriftifyColumnFamily(ColumnFamily cf, boolean subcolumnsOnly, boolean reverseOrder, long now)
     {
-        if (cf == null || cf.getColumnCount() == 0)
+        if (cf == null || !cf.hasColumns())
             return EMPTY_COLUMNS;
 
         if (cf.metadata().isSuper())
         {
-            boolean isCounterCF = cf.metadata().getDefaultValidator().isCommutative();
+            boolean isCounterCF = cf.metadata().isCounter();
             return thriftifySuperColumns(cf.getSortedColumns(), reverseOrder, now, subcolumnsOnly, isCounterCF);
         }
         else
@@ -379,28 +383,39 @@
 
     private SliceQueryFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SliceRange range)
     {
-        SliceQueryFilter filter = new SliceQueryFilter(range.start, range.finish, range.reversed, range.count);
         if (metadata.isSuper())
-            filter = SuperColumns.fromSCSliceFilter((CompositeType)metadata.comparator, parent.bufferForSuper_column(), filter);
-        return filter;
+        {
+            CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(parent.isSetSuper_column() ? 1 : 0));
+            Composite start = columnType.fromByteBuffer(range.start);
+            Composite finish = columnType.fromByteBuffer(range.finish);
+            SliceQueryFilter filter = new SliceQueryFilter(start, finish, range.reversed, range.count);
+            return SuperColumns.fromSCSliceFilter(metadata.comparator, parent.bufferForSuper_column(), filter);
+        }
+
+        Composite start = metadata.comparator.fromByteBuffer(range.start);
+        Composite finish = metadata.comparator.fromByteBuffer(range.finish);
+        return new SliceQueryFilter(start, finish, range.reversed, range.count);
     }
 
     private IDiskAtomFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SlicePredicate predicate)
     {
         IDiskAtomFilter filter;
+
         if (predicate.column_names != null)
         {
             if (metadata.isSuper())
             {
-                CompositeType type = (CompositeType)metadata.comparator;
-                SortedSet s = new TreeSet<ByteBuffer>(parent.isSetSuper_column() ? type.types.get(1) : type.types.get(0));
-                s.addAll(predicate.column_names);
-                filter = SuperColumns.fromSCNamesFilter(type, parent.bufferForSuper_column(), new NamesQueryFilter(s));
+                CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(parent.isSetSuper_column() ? 1 : 0));
+                SortedSet<CellName> s = new TreeSet<>(columnType);
+                for (ByteBuffer bb : predicate.column_names)
+                    s.add(columnType.cellFromByteBuffer(bb));
+                filter = SuperColumns.fromSCNamesFilter(metadata.comparator, parent.bufferForSuper_column(), new NamesQueryFilter(s));
             }
             else
             {
-                SortedSet s = new TreeSet<ByteBuffer>(metadata.comparator);
-                s.addAll(predicate.column_names);
+                SortedSet<CellName> s = new TreeSet<CellName>(metadata.comparator);
+                for (ByteBuffer bb : predicate.column_names)
+                    s.add(metadata.comparator.cellFromByteBuffer(bb));
                 filter = new NamesQueryFilter(s);
             }
         }
@@ -471,15 +486,15 @@
             IDiskAtomFilter filter;
             if (metadata.isSuper())
             {
-                CompositeType type = (CompositeType)metadata.comparator;
-                SortedSet names = new TreeSet<ByteBuffer>(column_path.column == null ? type.types.get(0) : type.types.get(1));
-                names.add(column_path.column == null ? column_path.super_column : column_path.column);
-                filter = SuperColumns.fromSCNamesFilter(type, column_path.column == null ? null : column_path.bufferForSuper_column(), new NamesQueryFilter(names));
+                CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(column_path.column == null ? 0 : 1));
+                SortedSet<CellName> names = new TreeSet<CellName>(columnType);
+                names.add(columnType.cellFromByteBuffer(column_path.column == null ? column_path.super_column : column_path.column));
+                filter = SuperColumns.fromSCNamesFilter(metadata.comparator, column_path.column == null ? null : column_path.bufferForSuper_column(), new NamesQueryFilter(names));
             }
             else
             {
-                SortedSet<ByteBuffer> names = new TreeSet<ByteBuffer>(metadata.comparator);
-                names.add(column_path.column);
+                SortedSet<CellName> names = new TreeSet<CellName>(metadata.comparator);
+                names.add(metadata.comparator.cellFromByteBuffer(column_path.column));
                 filter = new NamesQueryFilter(names);
             }
 
@@ -541,7 +556,7 @@
             if (cfs.getMeanColumns() > 0)
             {
                 int averageColumnSize = (int) (cfs.getMeanRowSize() / cfs.getMeanColumns());
-                pageSize = Math.min(COUNT_PAGE_SIZE, DatabaseDescriptor.getInMemoryCompactionLimit() / averageColumnSize);
+                pageSize = Math.min(COUNT_PAGE_SIZE, 4 * 1024 * 1024 / averageColumnSize);
                 pageSize = Math.max(2, pageSize);
                 logger.debug("average row column size is {}; using pageSize of {}", averageColumnSize, pageSize);
             }
@@ -654,24 +669,24 @@
             throw new org.apache.cassandra.exceptions.InvalidRequestException("missing mandatory super column name for super CF " + column_parent.column_family);
         }
         ThriftValidation.validateColumnNames(metadata, column_parent, Arrays.asList(column.name));
-        ThriftValidation.validateColumnData(metadata, column, column_parent.super_column != null);
+        ThriftValidation.validateColumnData(metadata, column_parent.super_column, column);
 
-        RowMutation rm;
+        org.apache.cassandra.db.Mutation mutation;
         try
         {
-            ByteBuffer name = column.name;
-            if (metadata.isSuper())
-                name = CompositeType.build(column_parent.super_column, name);
+            CellName name = metadata.isSuper()
+                          ? metadata.comparator.makeCellName(column_parent.super_column, column.name)
+                          : metadata.comparator.cellFromByteBuffer(column.name);
 
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cState.getKeyspace(), column_parent.column_family);
             cf.addColumn(name, column.value, column.timestamp, column.ttl);
-            rm = new RowMutation(cState.getKeyspace(), key, cf);
+            mutation = new org.apache.cassandra.db.Mutation(cState.getKeyspace(), key, cf);
         }
         catch (MarshalException e)
         {
             throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
         }
-        doInsert(consistency_level, Arrays.asList(rm));
+        doInsert(consistency_level, Arrays.asList(mutation));
     }
 
     public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level)
@@ -747,12 +762,12 @@
             });
             ThriftValidation.validateColumnNames(metadata, new ColumnParent(column_family), names);
             for (Column column : updates)
-                ThriftValidation.validateColumnData(metadata, column, false);
+                ThriftValidation.validateColumnData(metadata, null, column);
 
             CFMetaData cfm = Schema.instance.getCFMetaData(cState.getKeyspace(), column_family);
-            UnsortedColumns cfUpdates = UnsortedColumns.factory.create(cfm);
+            ColumnFamily cfUpdates = ArrayBackedSortedColumns.factory.create(cfm);
             for (Column column : updates)
-                cfUpdates.addColumn(column.name, column.value, column.timestamp);
+                cfUpdates.addColumn(cfm.comparator.cellFromByteBuffer(column.name), column.value, column.timestamp);
 
             ColumnFamily cfExpected;
             if (expected.isEmpty())
@@ -761,17 +776,16 @@
             }
             else
             {
-                cfExpected = TreeMapBackedSortedColumns.factory.create(cfm);
+                cfExpected = ArrayBackedSortedColumns.factory.create(cfm);
                 for (Column column : expected)
-                    cfExpected.addColumn(column.name, column.value, column.timestamp);
+                    cfExpected.addColumn(cfm.comparator.cellFromByteBuffer(column.name), column.value, column.timestamp);
             }
 
             schedule(DatabaseDescriptor.getWriteRpcTimeout());
             ColumnFamily result = StorageProxy.cas(cState.getKeyspace(),
                                                    column_family,
                                                    key,
-                                                   new ThriftCASConditions(cfExpected),
-                                                   cfUpdates,
+                                                   new ThriftCASRequest(cfExpected, cfUpdates),
                                                    ThriftConversion.fromThrift(serial_consistency_level),
                                                    ThriftConversion.fromThrift(commit_consistency_level));
             return result == null
@@ -801,7 +815,7 @@
                                                boolean allowCounterMutations)
     throws RequestValidationException
     {
-        List<IMutation> rowMutations = new ArrayList<IMutation>();
+        List<IMutation> mutations = new ArrayList<>();
         ThriftClientState cState = state();
         String keyspace = cState.getKeyspace();
 
@@ -809,10 +823,10 @@
         {
             ByteBuffer key = mutationEntry.getKey();
 
-            // We need to separate row mutation for standard cf and counter cf (that will be encapsulated in a
+            // We need to separate mutations for the standard cf and the counter cf (the latter will be encapsulated in a
             // CounterMutation) because it doesn't follow the same code path
-            RowMutation rmStandard = null;
-            RowMutation rmCounter = null;
+            org.apache.cassandra.db.Mutation standardMutation = null;
+            org.apache.cassandra.db.Mutation counterMutation = null;
 
             Map<String, List<Mutation>> columnFamilyToMutations = mutationEntry.getValue();
             for (Map.Entry<String, List<Mutation>> columnFamilyMutations : columnFamilyToMutations.entrySet())
@@ -824,109 +838,112 @@
                 CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, cfName);
                 ThriftValidation.validateKey(metadata, key);
 
-                RowMutation rm;
-                if (metadata.getDefaultValidator().isCommutative())
+                org.apache.cassandra.db.Mutation mutation;
+                if (metadata.isCounter())
                 {
                     ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
-                    rmCounter = rmCounter == null ? new RowMutation(keyspace, key) : rmCounter;
-                    rm = rmCounter;
+                    counterMutation = counterMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, key) : counterMutation;
+                    mutation = counterMutation;
                 }
                 else
                 {
-                    rmStandard = rmStandard == null ? new RowMutation(keyspace, key) : rmStandard;
-                    rm = rmStandard;
+                    standardMutation = standardMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, key) : standardMutation;
+                    mutation = standardMutation;
                 }
 
-                for (Mutation mutation : columnFamilyMutations.getValue())
+                for (Mutation m : columnFamilyMutations.getValue())
                 {
-                    ThriftValidation.validateMutation(metadata, mutation);
+                    ThriftValidation.validateMutation(metadata, m);
 
-                    if (mutation.deletion != null)
+                    if (m.deletion != null)
                     {
-                        deleteColumnOrSuperColumn(rm, cfName, mutation.deletion);
+                        deleteColumnOrSuperColumn(mutation, metadata, m.deletion);
                     }
-                    if (mutation.column_or_supercolumn != null)
+                    if (m.column_or_supercolumn != null)
                     {
-                        addColumnOrSuperColumn(rm, cfName, mutation.column_or_supercolumn);
+                        addColumnOrSuperColumn(mutation, metadata, m.column_or_supercolumn);
                     }
                 }
             }
-            if (rmStandard != null && !rmStandard.isEmpty())
-                rowMutations.add(rmStandard);
+            if (standardMutation != null && !standardMutation.isEmpty())
+                mutations.add(standardMutation);
 
-            if (rmCounter != null && !rmCounter.isEmpty())
+            if (counterMutation != null && !counterMutation.isEmpty())
             {
                 if (allowCounterMutations)
-                    rowMutations.add(new CounterMutation(rmCounter, ThriftConversion.fromThrift(consistency_level)));
+                    mutations.add(new CounterMutation(counterMutation, ThriftConversion.fromThrift(consistency_level)));
                 else
                     throw new org.apache.cassandra.exceptions.InvalidRequestException("Counter mutations are not allowed in atomic batches");
             }
         }
 
-        return rowMutations;
+        return mutations;
     }
 
-    private void addColumnOrSuperColumn(RowMutation rm, String cfName, ColumnOrSuperColumn cosc)
+    private void addColumnOrSuperColumn(org.apache.cassandra.db.Mutation mutation, CFMetaData cfm, ColumnOrSuperColumn cosc)
     {
         if (cosc.super_column != null)
         {
             for (Column column : cosc.super_column.columns)
             {
-                rm.add(cfName, CompositeType.build(cosc.super_column.name, column.name), column.value, column.timestamp, column.ttl);
+                mutation.add(cfm.cfName, cfm.comparator.makeCellName(cosc.super_column.name, column.name), column.value, column.timestamp, column.ttl);
             }
         }
         else if (cosc.column != null)
         {
-            rm.add(cfName, cosc.column.name, cosc.column.value, cosc.column.timestamp, cosc.column.ttl);
+            mutation.add(cfm.cfName, cfm.comparator.cellFromByteBuffer(cosc.column.name), cosc.column.value, cosc.column.timestamp, cosc.column.ttl);
         }
         else if (cosc.counter_super_column != null)
         {
             for (CounterColumn column : cosc.counter_super_column.columns)
             {
-                rm.addCounter(cfName, CompositeType.build(cosc.counter_super_column.name, column.name), column.value);
+                mutation.addCounter(cfm.cfName, cfm.comparator.makeCellName(cosc.counter_super_column.name, column.name), column.value);
             }
         }
         else // cosc.counter_column != null
         {
-            rm.addCounter(cfName, cosc.counter_column.name, cosc.counter_column.value);
+            mutation.addCounter(cfm.cfName, cfm.comparator.cellFromByteBuffer(cosc.counter_column.name), cosc.counter_column.value);
         }
     }
 
-    private void deleteColumnOrSuperColumn(RowMutation rm, String cfName, Deletion del)
+    private void deleteColumnOrSuperColumn(org.apache.cassandra.db.Mutation mutation, CFMetaData cfm, Deletion del)
     {
         if (del.predicate != null && del.predicate.column_names != null)
         {
             for (ByteBuffer c : del.predicate.column_names)
             {
-                if (del.super_column == null && Schema.instance.getColumnFamilyType(rm.getKeyspaceName(), cfName) == ColumnFamilyType.Super)
-                    rm.deleteRange(cfName, SuperColumns.startOf(c), SuperColumns.endOf(c), del.timestamp);
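+                // With no super column specified on a super CF, each name denotes a super column, so delete its whole sub-column range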
+                if (del.super_column == null && cfm.isSuper())
+                    mutation.deleteRange(cfm.cfName, SuperColumns.startOf(c), SuperColumns.endOf(c), del.timestamp);
                 else if (del.super_column != null)
-                    rm.delete(cfName, CompositeType.build(del.super_column, c), del.timestamp);
+                    mutation.delete(cfm.cfName, cfm.comparator.makeCellName(del.super_column, c), del.timestamp);
                 else
-                    rm.delete(cfName, c, del.timestamp);
+                    mutation.delete(cfm.cfName, cfm.comparator.cellFromByteBuffer(c), del.timestamp);
             }
         }
         else if (del.predicate != null && del.predicate.slice_range != null)
         {
-            if (del.super_column == null && Schema.instance.getColumnFamilyType(rm.getKeyspaceName(), cfName) == ColumnFamilyType.Super)
-                rm.deleteRange(cfName,
-                               SuperColumns.startOf(del.predicate.getSlice_range().start),
-                               SuperColumns.endOf(del.predicate.getSlice_range().finish),
-                               del.timestamp);
+            if (del.super_column == null && cfm.isSuper())
+                mutation.deleteRange(cfm.cfName,
+                                     SuperColumns.startOf(del.predicate.getSlice_range().start),
+                                     SuperColumns.endOf(del.predicate.getSlice_range().finish),
+                                     del.timestamp);
             else if (del.super_column != null)
-                rm.deleteRange(cfName,
-                               CompositeType.build(del.super_column, del.predicate.getSlice_range().start),
-                               CompositeType.build(del.super_column, del.predicate.getSlice_range().finish),
-                               del.timestamp);
+                mutation.deleteRange(cfm.cfName,
+                                     cfm.comparator.makeCellName(del.super_column, del.predicate.getSlice_range().start),
+                                     cfm.comparator.makeCellName(del.super_column, del.predicate.getSlice_range().finish),
+                                     del.timestamp);
             else
-                rm.deleteRange(cfName, del.predicate.getSlice_range().start, del.predicate.getSlice_range().finish, del.timestamp);
+                mutation.deleteRange(cfm.cfName,
+                                     cfm.comparator.fromByteBuffer(del.predicate.getSlice_range().start),
+                                     cfm.comparator.fromByteBuffer(del.predicate.getSlice_range().finish),
+                                     del.timestamp);
         }
         else
         {
             if (del.super_column != null)
-                rm.deleteRange(cfName, SuperColumns.startOf(del.super_column), SuperColumns.endOf(del.super_column), del.timestamp);
+                mutation.deleteRange(cfm.cfName, SuperColumns.startOf(del.super_column), SuperColumns.endOf(del.super_column), del.timestamp);
             else
-                rm.delete(cfName, del.timestamp);
+                mutation.delete(cfm.cfName, del.timestamp);
         }
     }
 
@@ -1009,20 +1026,20 @@
         if (isCommutativeOp)
             ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
 
-        RowMutation rm = new RowMutation(keyspace, key);
+        org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(keyspace, key);
         if (column_path.super_column == null && column_path.column == null)
-            rm.delete(column_path.column_family, timestamp);
+            mutation.delete(column_path.column_family, timestamp);
         else if (column_path.super_column == null)
-            rm.delete(column_path.column_family, column_path.column, timestamp);
+            mutation.delete(column_path.column_family, metadata.comparator.cellFromByteBuffer(column_path.column), timestamp);
         else if (column_path.column == null)
-            rm.deleteRange(column_path.column_family, SuperColumns.startOf(column_path.super_column), SuperColumns.endOf(column_path.super_column), timestamp);
+            mutation.deleteRange(column_path.column_family, SuperColumns.startOf(column_path.super_column), SuperColumns.endOf(column_path.super_column), timestamp);
         else
-            rm.delete(column_path.column_family, CompositeType.build(column_path.super_column, column_path.column), timestamp);
+            mutation.delete(column_path.column_family, metadata.comparator.makeCellName(column_path.super_column, column_path.column), timestamp);
 
         if (isCommutativeOp)
-            doInsert(consistency_level, Arrays.asList(new CounterMutation(rm, ThriftConversion.fromThrift(consistency_level))));
+            doInsert(consistency_level, Arrays.asList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
         else
-            doInsert(consistency_level, Arrays.asList(rm));
+            doInsert(consistency_level, Arrays.asList(mutation));
     }
 
     public void remove(ByteBuffer key, ColumnPath column_path, long timestamp, ConsistencyLevel consistency_level)
@@ -1069,7 +1086,11 @@
         if (mutations.isEmpty())
             return;
 
-        schedule(DatabaseDescriptor.getWriteRpcTimeout());
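+        // Schedule with the smallest timeout among the mutations being applied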
+        long timeout = Long.MAX_VALUE;
+        for (IMutation m : mutations)
+            timeout = Longs.min(timeout, m.getTimeout());
+
+        schedule(timeout);
         try
         {
             StorageProxy.mutateWithTriggers(mutations, consistencyLevel, mutateAtomically);
@@ -1153,8 +1174,8 @@
             {
                 RowPosition end = range.end_key == null
                                 ? p.getTokenFactory().fromString(range.end_token).maxKeyBound(p)
-                                : RowPosition.forKey(range.end_key, p);
-                bounds = new Bounds<RowPosition>(RowPosition.forKey(range.start_key, p), end);
+                                : RowPosition.ForKey.get(range.end_key, p);
+                bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(range.start_key, p), end);
             }
             long now = System.currentTimeMillis();
             schedule(DatabaseDescriptor.getRangeRpcTimeout());
@@ -1166,7 +1187,7 @@
                                                                         now,
                                                                         filter,
                                                                         bounds,
-                                                                        range.row_filter,
+                                                                        ThriftConversion.fromThrift(range.row_filter),
                                                                         range.count),
                                                   consistencyLevel);
             }
@@ -1241,8 +1262,8 @@
             {
                 RowPosition end = range.end_key == null
                                 ? p.getTokenFactory().fromString(range.end_token).maxKeyBound(p)
-                                : RowPosition.forKey(range.end_key, p);
-                bounds = new Bounds<RowPosition>(RowPosition.forKey(range.start_key, p), end);
+                                : RowPosition.ForKey.get(range.end_key, p);
+                bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(range.start_key, p), end);
             }
 
             if (range.row_filter != null && !range.row_filter.isEmpty())
@@ -1289,7 +1310,7 @@
         for (Row row : rows)
         {
             List<ColumnOrSuperColumn> thriftifiedColumns = thriftifyColumnFamily(row.cf, column_parent.super_column != null, reversed, now);
-            keySlices.add(new KeySlice(row.key.key, thriftifiedColumns));
+            keySlices.add(new KeySlice(row.key.getKey(), thriftifiedColumns));
         }
 
         return keySlices;
@@ -1324,7 +1345,7 @@
             consistencyLevel.validateForRead(keyspace);
 
             IPartitioner p = StorageService.getPartitioner();
-            AbstractBounds<RowPosition> bounds = new Bounds<RowPosition>(RowPosition.forKey(index_clause.start_key, p),
+            AbstractBounds<RowPosition> bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(index_clause.start_key, p),
                                                                          p.getMinimumToken().minKeyBound());
 
             IDiskAtomFilter filter = ThriftValidation.asIFilter(column_predicate, metadata, column_parent.super_column);
@@ -1334,7 +1355,7 @@
                                                               now,
                                                               filter,
                                                               bounds,
-                                                              index_clause.expressions,
+                                                              ThriftConversion.fromThrift(index_clause.expressions),
                                                               index_clause.count);
 
             List<Row> rows = StorageProxy.getRangeSlice(command, consistencyLevel);
@@ -1372,7 +1393,7 @@
             }
             catch (NotFoundException nfe)
             {
-                logger.info("Failed to find metadata for keyspace '" + ks + "'. Continuing... ");
+                logger.info("Failed to find metadata for keyspace '{}'. Continuing... ", ks);
             }
         }
         return ksset;
@@ -1452,7 +1473,7 @@
             Token.TokenFactory tf = StorageService.getPartitioner().getTokenFactory();
             Range<Token> tr = new Range<Token>(tf.fromString(start_token), tf.fromString(end_token));
             List<Pair<Range<Token>, Long>> splits =
-                    StorageService.instance.getSplits(state().getKeyspace(), cfName, tr, keys_per_split, Schema.instance.getCFMetaData(state().getKeyspace(), cfName));
+                    StorageService.instance.getSplits(state().getKeyspace(), cfName, tr, keys_per_split);
             List<CfSplit> result = new ArrayList<CfSplit>(splits.size());
             for (Pair<Range<Token>, Long> split : splits)
                 result.add(new CfSplit(split.left.left.toString(), split.left.right.toString(), split.right));
@@ -1769,19 +1790,19 @@
 
             ThriftValidation.validateColumnNames(metadata, column_parent, Arrays.asList(column.name));
 
-            RowMutation rm = new RowMutation(keyspace, key);
+            org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(keyspace, key);
             try
             {
                 if (metadata.isSuper())
-                    rm.addCounter(column_parent.column_family, CompositeType.build(column_parent.super_column, column.name), column.value);
+                    mutation.addCounter(column_parent.column_family, metadata.comparator.makeCellName(column_parent.super_column, column.name), column.value);
                 else
-                    rm.addCounter(column_parent.column_family, column.name, column.value);
+                    mutation.addCounter(column_parent.column_family, metadata.comparator.cellFromByteBuffer(column.name), column.value);
             }
             catch (MarshalException e)
             {
                 throw new InvalidRequestException(e.getMessage());
             }
-            doInsert(consistency_level, Arrays.asList(new CounterMutation(rm, ThriftConversion.fromThrift(consistency_level))));
+            doInsert(consistency_level, Arrays.asList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
         }
         catch (RequestValidationException e)
         {
@@ -1855,7 +1876,7 @@
 
                     decompressor.end();
 
-                    queryString = new String(decompressed.getData(), 0, decompressed.size(), "UTF-8");
+                    queryString = new String(decompressed.getData(), 0, decompressed.getLength(), StandardCharsets.UTF_8);
                     break;
                 case NONE:
                     try
@@ -1873,10 +1894,6 @@
         {
             throw new InvalidRequestException("Error deflating query string.");
         }
-        catch (UnsupportedEncodingException e)
-        {
-            throw new InvalidRequestException("Unknown query string encoding.");
-        }
         return queryString;
     }
 
@@ -1955,7 +1972,7 @@
             }
 
             ThriftClientState cState = state();
-            return cState.getCQLQueryHandler().process(queryString, cState.getQueryState(), new QueryOptions(ThriftConversion.fromThrift(cLevel), Collections.<ByteBuffer>emptyList())).toThriftResult();
+            return cState.getCQLQueryHandler().process(queryString, cState.getQueryState(), QueryOptions.fromProtocolV2(ThriftConversion.fromThrift(cLevel), Collections.<ByteBuffer>emptyList())).toThriftResult();
         }
         catch (RequestExecutionException e)
         {
@@ -2016,6 +2033,79 @@
         }
     }
 
+    @Override
+    public List<ColumnOrSuperColumn> get_multi_slice(MultiSliceRequest request)
+            throws InvalidRequestException, UnavailableException, TimedOutException
+    {
+        if (startSessionIfRequested())
+        {
+            Map<String, String> traceParameters = ImmutableMap.of("key", ByteBufferUtil.bytesToHex(request.key),
+                                                                  "column_parent", request.column_parent.toString(),
+                                                                  "consistency_level", request.consistency_level.name(),
+                                                                  "count", String.valueOf(request.count),
+                                                                  "column_slices", request.column_slices.toString());
+            Tracing.instance.begin("get_multi_slice", traceParameters);
+        }
+        else
+        {
+            logger.debug("get_multi_slice");
+        }
+        try
+        {
+            ClientState cState = state();
+            String keyspace = cState.getKeyspace();
+            state().hasColumnFamilyAccess(keyspace, request.getColumn_parent().column_family, Permission.SELECT);
+            CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, request.getColumn_parent().column_family);
+            if (metadata.cfType == ColumnFamilyType.Super)
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("get_multi_slice does not support super columns");
+            ThriftValidation.validateColumnParent(metadata, request.getColumn_parent());
+            org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(request.getConsistency_level());
+            consistencyLevel.validateForRead(keyspace);
+            List<ReadCommand> commands = new ArrayList<>(1);
+            ColumnSlice[] slices = new ColumnSlice[request.getColumn_slices().size()];
+            for (int i = 0 ; i < request.getColumn_slices().size() ; i++)
+            {
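+                // Unset slice bounds default to empty buffers, i.e. an unbounded side of the slice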
+                fixOptionalSliceParameters(request.getColumn_slices().get(i));
+                Composite start = metadata.comparator.fromByteBuffer(request.getColumn_slices().get(i).start);
+                Composite finish = metadata.comparator.fromByteBuffer(request.getColumn_slices().get(i).finish);
+                if (!start.isEmpty() && !finish.isEmpty())
+                {
+                    int compare = metadata.comparator.compare(start, finish);
+                    if (!request.reversed && compare > 0)
+                        throw new InvalidRequestException(String.format("Column slice at index %d had start greater than finish", i));
+                    else if (request.reversed && compare < 0)
+                        throw new InvalidRequestException(String.format("Reversed column slice at index %d had start less than finish", i));
+                }
+                slices[i] = new ColumnSlice(start, finish);
+            }
+
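+            // Collapse overlapping slices into disjoint, ordered ranges before building the filter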
+            ColumnSlice[] deoverlapped = ColumnSlice.deoverlapSlices(slices, request.reversed ? metadata.comparator.reverseComparator() : metadata.comparator);
+            SliceQueryFilter filter = new SliceQueryFilter(deoverlapped, request.reversed, request.count);
+            ThriftValidation.validateKey(metadata, request.key);
+            commands.add(ReadCommand.create(keyspace, request.key, request.column_parent.getColumn_family(), System.currentTimeMillis(), filter));
+            return getSlice(commands, request.column_parent.isSetSuper_column(), consistencyLevel).entrySet().iterator().next().getValue();
+        }
+        catch (RequestValidationException e)
+        {
+            throw ThriftConversion.toThrift(e);
+        }
+        finally
+        {
+            Tracing.instance.stopSession();
+        }
+    }
+
+    /**
+     * Sets the start and finish of the slice to the empty value ("") when they are not set.
+     * @param columnSlice the slice whose optional start/finish are normalized in place
+     */
+    private static void fixOptionalSliceParameters(org.apache.cassandra.thrift.ColumnSlice columnSlice)
+    {
+        if (!columnSlice.isSetStart())
+            columnSlice.setStart(new byte[0]);
+        if (!columnSlice.isSetFinish())
+            columnSlice.setFinish(new byte[0]);
+    }
+
     public CqlResult execute_prepared_cql_query(int itemId, List<ByteBuffer> bindVariables)
     throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
     {
@@ -2075,18 +2165,18 @@
         try
         {
             ThriftClientState cState = state();
-            org.apache.cassandra.cql3.CQLStatement statement = cState.getCQLQueryHandler().getPreparedForThrift(itemId);
+            ParsedStatement.Prepared prepared = cState.getCQLQueryHandler().getPreparedForThrift(itemId);
 
-            if (statement == null)
+            if (prepared == null)
                 throw new InvalidRequestException(String.format("Prepared query with ID %d not found" +
                                                                 " (either the query was not prepared on this host (maybe the host has been restarted?)" +
                                                                 " or you have prepared too many queries and it has been evicted from the internal cache)",
                                                                 itemId));
-            logger.trace("Retrieved prepared statement #{} with {} bind markers", itemId, statement.getBoundTerms());
+            logger.trace("Retrieved prepared statement #{} with {} bind markers", itemId, prepared.statement.getBoundTerms());
 
-            return cState.getCQLQueryHandler().processPrepared(statement,
+            return cState.getCQLQueryHandler().processPrepared(prepared.statement,
                                                                cState.getQueryState(),
-                                                               new QueryOptions(ThriftConversion.fromThrift(cLevel), bindVariables)).toThriftResult();
+                                                               QueryOptions.fromProtocolV2(ThriftConversion.fromThrift(cLevel), bindVariables)).toThriftResult();
         }
         catch (RequestExecutionException e)
         {
@@ -2159,19 +2249,21 @@
         });
     }
 
-    private static class ThriftCASConditions implements CASConditions
+    private static class ThriftCASRequest implements CASRequest
     {
         private final ColumnFamily expected;
+        private final ColumnFamily updates;
 
-        private ThriftCASConditions(ColumnFamily expected)
+        private ThriftCASRequest(ColumnFamily expected, ColumnFamily updates)
         {
             this.expected = expected;
+            this.updates = updates;
         }
 
         public IDiskAtomFilter readFilter()
         {
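+            // With nothing expected, a single-cell slice is enough to learn whether the row has any live data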
             return expected == null || expected.isEmpty()
-                 ? new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1)
+                 ? new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 1)
                  : new NamesQueryFilter(ImmutableSortedSet.copyOf(expected.getComparator(), expected.getColumnNames()));
         }
 
@@ -2179,18 +2271,18 @@
         {
             long now = System.currentTimeMillis();
 
-            if (!hasLiveColumns(expected, now))
-                return !hasLiveColumns(current, now);
-            else if (!hasLiveColumns(current, now))
+            if (!hasLiveCells(expected, now))
+                return !hasLiveCells(current, now);
+            else if (!hasLiveCells(current, now))
                 return false;
 
             // current has been built from expected, so we know that it can't have columns
             // that expected doesn't have. So we just check that for each column in expected:
             //   - if it is a tombstone, whether current has no column or a tombstone;
             //   - otherwise, that current has a live column with the same value.
-            for (org.apache.cassandra.db.Column e : expected)
+            for (Cell e : expected)
             {
-                org.apache.cassandra.db.Column c = current.getColumn(e.name());
+                Cell c = current.getColumn(e.name());
                 if (e.isLive(now))
                 {
                     if (c == null || !c.isLive(now) || !c.value().equals(e.value()))
@@ -2205,15 +2297,14 @@
             return true;
         }
 
-        private static boolean hasLiveColumns(ColumnFamily cf, long now)
+        private static boolean hasLiveCells(ColumnFamily cf, long now)
         {
             return cf != null && !cf.hasOnlyTombstones(now);
         }
 
-        @Override
-        public String toString()
+        public ColumnFamily makeUpdates(ColumnFamily current)
         {
-            return expected.toString();
+            return updates;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/thrift/CustomTThreadPoolServer.java b/src/java/org/apache/cassandra/thrift/CustomTThreadPoolServer.java
index d1a3304..f7a602c 100644
--- a/src/java/org/apache/cassandra/thrift/CustomTThreadPoolServer.java
+++ b/src/java/org/apache/cassandra/thrift/CustomTThreadPoolServer.java
@@ -126,7 +126,7 @@
             }
 
             if (activeClients.get() >= args.maxWorkerThreads)
-                logger.warn("Maximum number of clients " + args.maxWorkerThreads + " reached");
+                logger.warn("Maximum number of clients {} reached", args.maxWorkerThreads);
         }
 
         executorService.shutdown();
@@ -255,7 +255,7 @@
                 }
                 else
                 {
-                    serverTransport = new TCustomServerSocket(addr, args.keepAlive, args.sendBufferSize, args.recvBufferSize);
+                    serverTransport = new TCustomServerSocket(addr, args.keepAlive, args.sendBufferSize, args.recvBufferSize, args.listenBacklog);
                 }
             }
             catch (TTransportException e)
diff --git a/src/java/org/apache/cassandra/thrift/TCustomServerSocket.java b/src/java/org/apache/cassandra/thrift/TCustomServerSocket.java
index c30cec0..d88cf71 100644
--- a/src/java/org/apache/cassandra/thrift/TCustomServerSocket.java
+++ b/src/java/org/apache/cassandra/thrift/TCustomServerSocket.java
@@ -57,7 +57,7 @@
      * @throws TTransportException
      */
     public TCustomServerSocket(InetSocketAddress bindAddr, boolean keepAlive, Integer sendBufferSize,
-            Integer recvBufferSize)
+            Integer recvBufferSize, Integer listenBacklog)
             throws TTransportException
     {
         try
@@ -67,7 +67,7 @@
             // Prevent 2MSL delay problem on server restarts
             serverSocket.setReuseAddress(true);
             // Bind to listening port
-            serverSocket.bind(bindAddr);
+            serverSocket.bind(bindAddr, listenBacklog);
         }
         catch (IOException ioe)
         {
diff --git a/src/java/org/apache/cassandra/thrift/TServerCustomFactory.java b/src/java/org/apache/cassandra/thrift/TServerCustomFactory.java
index 3c21d3a..3c5b967 100644
--- a/src/java/org/apache/cassandra/thrift/TServerCustomFactory.java
+++ b/src/java/org/apache/cassandra/thrift/TServerCustomFactory.java
@@ -44,7 +44,6 @@
         if (ThriftServer.SYNC.equalsIgnoreCase(serverType))
         {
             server = new CustomTThreadPoolServer.Factory().buildTServer(args);
-            logger.info(String.format("Using synchronous/threadpool thrift server on %s : %s", args.addr.getHostName(), args.addr.getPort()));
         }
         else if(ThriftServer.ASYNC.equalsIgnoreCase(serverType))
         {
diff --git a/src/java/org/apache/cassandra/thrift/TServerFactory.java b/src/java/org/apache/cassandra/thrift/TServerFactory.java
index 2e2acb8..09014ce 100644
--- a/src/java/org/apache/cassandra/thrift/TServerFactory.java
+++ b/src/java/org/apache/cassandra/thrift/TServerFactory.java
@@ -32,7 +32,7 @@
     public static class Args
     {
         public InetSocketAddress addr;
-        public CassandraServer cassandraServer;
+        public Integer listenBacklog;
         public TProcessor processor;
         public TProtocolFactory tProtocolFactory;
         public TTransportFactory inTransportFactory;
diff --git a/src/java/org/apache/cassandra/thrift/ThriftConversion.java b/src/java/org/apache/cassandra/thrift/ThriftConversion.java
index 24ce045..0c75d2c 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftConversion.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftConversion.java
@@ -17,6 +17,10 @@
  */
 package org.apache.cassandra.thrift;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
 import org.apache.cassandra.db.WriteType;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestTimeoutException;
@@ -47,6 +51,25 @@
         throw new AssertionError();
     }
 
+    public static ConsistencyLevel toThrift(org.apache.cassandra.db.ConsistencyLevel cl)
+    {
+        switch (cl)
+        {
+            case ANY: return ConsistencyLevel.ANY;
+            case ONE: return ConsistencyLevel.ONE;
+            case TWO: return ConsistencyLevel.TWO;
+            case THREE: return ConsistencyLevel.THREE;
+            case QUORUM: return ConsistencyLevel.QUORUM;
+            case ALL: return ConsistencyLevel.ALL;
+            case LOCAL_QUORUM: return ConsistencyLevel.LOCAL_QUORUM;
+            case EACH_QUORUM: return ConsistencyLevel.EACH_QUORUM;
+            case SERIAL: return ConsistencyLevel.SERIAL;
+            case LOCAL_SERIAL: return ConsistencyLevel.LOCAL_SERIAL;
+            case LOCAL_ONE: return ConsistencyLevel.LOCAL_ONE;
+        }
+        throw new AssertionError();
+    }
+
     // We never return, but returning a RuntimeException allows callers to write "throw rethrow(e)" without java complaining
     // for methods that have a return value.
     public static RuntimeException rethrow(RequestExecutionException e) throws UnavailableException, TimedOutException
@@ -62,11 +85,6 @@
         return new InvalidRequestException(e.getMessage());
     }
 
-    public static InvalidRequestException toThrift(org.apache.cassandra.exceptions.InvalidRequestException e)
-    {
-        return new InvalidRequestException(e.getMessage());
-    }
-
     public static UnavailableException toThrift(org.apache.cassandra.exceptions.UnavailableException e)
     {
         return new UnavailableException();
@@ -93,4 +111,22 @@
         }
         return toe;
     }
+
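+    /** Converts a list of Thrift index expressions into their internal equivalents, preserving order. */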
+    public static List<org.apache.cassandra.db.IndexExpression> fromThrift(List<IndexExpression> exprs)
+    {
+        if (exprs == null)
+            return null;
+
+        if (exprs.isEmpty())
+            return Collections.emptyList();
+
+        List<org.apache.cassandra.db.IndexExpression> converted = new ArrayList<>(exprs.size());
+        for (IndexExpression expr : exprs)
+        {
+            converted.add(new org.apache.cassandra.db.IndexExpression(expr.column_name,
+                                                                      org.apache.cassandra.db.IndexExpression.Operator.findByOrdinal(expr.op.getValue()),
+                                                                      expr.value));
+        }
+        return converted;
+    }
 }
diff --git a/src/java/org/apache/cassandra/thrift/ThriftServer.java b/src/java/org/apache/cassandra/thrift/ThriftServer.java
index dbd3824..2aef2e3 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftServer.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftServer.java
@@ -40,12 +40,14 @@
 
     protected final InetAddress address;
     protected final int port;
+    protected final int backlog;
     private volatile ThriftServerThread server;
 
-    public ThriftServer(InetAddress address, int port)
+    public ThriftServer(InetAddress address, int port, int backlog)
     {
         this.address = address;
         this.port = port;
+        this.backlog = backlog;
     }
 
     public void start()
@@ -53,7 +55,7 @@
         if (server == null)
         {
             CassandraServer iface = getCassandraServer();
-            server = new ThriftServerThread(address, port, iface, getProcessor(iface), getTransportFactory());
+            server = new ThriftServerThread(address, port, backlog, getProcessor(iface), getTransportFactory());
             server.start();
         }
     }
@@ -96,7 +98,6 @@
     protected TTransportFactory getTransportFactory()
     {
         int tFramedTransportSize = DatabaseDescriptor.getThriftFramedTransportSize();
-        logger.info("Using TFramedTransport with a max frame size of {} bytes.", tFramedTransportSize);
         return new TFramedTransport.Factory(tFramedTransportSize);
     }
 
@@ -110,7 +111,7 @@
 
         public ThriftServerThread(InetAddress listenAddr,
                                   int listenPort,
-                                  CassandraServer server,
+                                  int listenBacklog,
                                   TProcessor processor,
                                   TTransportFactory transportFactory)
         {
@@ -120,7 +121,7 @@
             TServerFactory.Args args = new TServerFactory.Args();
             args.tProtocolFactory = new TBinaryProtocol.Factory(true, true);
             args.addr = new InetSocketAddress(listenAddr, listenPort);
-            args.cassandraServer = server;
+            args.listenBacklog = listenBacklog;
             args.processor = processor;
             args.keepAlive = DatabaseDescriptor.getRpcKeepAlive();
             args.sendBufferSize = DatabaseDescriptor.getRpcSendBufferSize();
diff --git a/src/java/org/apache/cassandra/thrift/ThriftValidation.java b/src/java/org/apache/cassandra/thrift/ThriftValidation.java
index b387871..3b5663b 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftValidation.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftValidation.java
@@ -20,24 +20,23 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.serializers.MarshalException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.*;
-import org.apache.cassandra.cql3.CFDefinition;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -93,12 +92,12 @@
 
         if (isCommutativeOp)
         {
-            if (!metadata.getDefaultValidator().isCommutative())
+            if (!metadata.isCounter())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("invalid operation for non commutative columnfamily " + cfName);
         }
         else
         {
-            if (metadata.getDefaultValidator().isCommutative())
+            if (metadata.isCounter())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("invalid operation for commutative columnfamily " + cfName);
         }
         return metadata;
@@ -199,7 +198,7 @@
     private static void validateColumnNames(CFMetaData metadata, ByteBuffer superColumnName, Iterable<ByteBuffer> column_names)
     throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        int maxNameLength = org.apache.cassandra.db.Column.MAX_NAME_LENGTH;
+        int maxNameLength = Cell.MAX_NAME_LENGTH;
 
         if (superColumnName != null)
         {
@@ -211,7 +210,6 @@
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn specified to ColumnFamily " + metadata.cfName + " containing normal columns");
         }
         AbstractType<?> comparator = SuperColumns.getComparatorFor(metadata, superColumnName);
-        CFDefinition cfDef = metadata.getCfDef();
         boolean isCQL3Table = !metadata.isThriftCompatible();
         for (ByteBuffer name : column_names)
         {
@@ -231,30 +229,28 @@
             if (isCQL3Table)
             {
                 // CQL3 tables don't support having only part of their composite column names set
-                CompositeType composite = (CompositeType)comparator;
-                ByteBuffer[] components = composite.split(name);
-                int minComponents = composite.types.size() - (cfDef.hasCollections ? 1 : 0);
-                if (components.length < minComponents)
+                Composite composite = metadata.comparator.fromByteBuffer(name);
+
+                int minComponents = metadata.comparator.clusteringPrefixSize() + 1;
+                if (composite.size() < minComponents)
                     throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Not enough components (found %d but %d expected) for column name since %s is a CQL3 table",
-                                                                                                    components.length, minComponents, metadata.cfName));
+                                                                                                    composite.size(), minComponents, metadata.cfName));
 
                 // Furthermore, the column name must be a declared one.
-                int columnIndex = composite.types.size() - (cfDef.hasCollections ? 2 : 1);
-                ByteBuffer CQL3ColumnName = components[columnIndex];
+                int columnIndex = metadata.comparator.clusteringPrefixSize();
+                ByteBuffer CQL3ColumnName = composite.get(columnIndex);
                 if (!CQL3ColumnName.hasRemaining())
                     continue; // Row marker, ok
 
-                ColumnIdentifier columnId = new ColumnIdentifier(CQL3ColumnName, composite.types.get(columnIndex));
-                CFDefinition.Name columnName = cfDef.get(columnId);
-                if (columnName == null || columnName.isPrimaryKeyColumn())
+                ColumnIdentifier columnId = new ColumnIdentifier(CQL3ColumnName, metadata.comparator.subtype(columnIndex));
+                if (metadata.getColumnDefinition(columnId) == null)
                     throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Invalid cell for CQL3 table %s. The CQL3 column component (%s) does not correspond to a defined CQL3 column",
                                                                                                     metadata.cfName, columnId));
 
                 // On top of that, if we have a collection component, the (CQL3) column must be a collection
-                if (cfDef.hasCollections && components.length == composite.types.size())
+                if (metadata.comparator.hasCollections() && composite.size() == metadata.comparator.size())
                 {
-                    assert components.length >= 2;
-                    ColumnToCollectionType collectionType = (ColumnToCollectionType)composite.types.get(composite.types.size() - 1);
+                    ColumnToCollectionType collectionType = metadata.comparator.collectionType();
                     if (!collectionType.defined.containsKey(CQL3ColumnName))
                         throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Invalid collection component, %s is not a collection", UTF8Type.instance.getString(CQL3ColumnName)));
                 }
@@ -272,7 +268,7 @@
         if (range.count < 0)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("get_slice requires non-negative count");
 
-        int maxNameLength = org.apache.cassandra.db.Column.MAX_NAME_LENGTH;
+        int maxNameLength = Cell.MAX_NAME_LENGTH;
         if (range.start.remaining() > maxNameLength)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("range start length cannot be larger than " + maxNameLength);
         if (range.finish.remaining() > maxNameLength)
@@ -301,7 +297,7 @@
     public static void validateColumnOrSuperColumn(CFMetaData metadata, ColumnOrSuperColumn cosc)
             throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        boolean isCommutative = metadata.getDefaultValidator().isCommutative();
+        boolean isCommutative = metadata.isCounter();
 
         int nulls = 0;
         if (cosc.column == null) nulls++;
@@ -319,7 +315,7 @@
 
             validateTtl(cosc.column);
             validateColumnPath(metadata, new ColumnPath(metadata.cfName).setSuper_column((ByteBuffer)null).setColumn(cosc.column.name));
-            validateColumnData(metadata, cosc.column, false);
+            validateColumnData(metadata, null, cosc.column);
         }
 
         if (cosc.super_column != null)
@@ -330,7 +326,7 @@
             for (Column c : cosc.super_column.columns)
             {
                 validateColumnPath(metadata, new ColumnPath(metadata.cfName).setSuper_column(cosc.super_column.name).setColumn(c.name));
-                validateColumnData(metadata, c, true);
+                validateColumnData(metadata, cosc.super_column.name, c);
             }
         }
 
@@ -359,8 +355,8 @@
             if (column.ttl <= 0)
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("ttl must be positive");
 
-            if (column.ttl > ExpiringColumn.MAX_TTL)
-                throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", column.ttl, ExpiringColumn.MAX_TTL));
+            if (column.ttl > ExpiringCell.MAX_TTL)
+                throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", column.ttl, ExpiringCell.MAX_TTL));
         }
         else
         {
@@ -409,7 +405,7 @@
             throw new org.apache.cassandra.exceptions.InvalidRequestException(msg);
         }
 
-        if (metadata.getDefaultValidator().isCommutative())
+        if (metadata.isCounter())
         {
             // force the server timestamp even if a timestamp was set, for coherence with other counter operations
             del.timestamp = System.currentTimeMillis();
@@ -433,9 +429,9 @@
     }
 
     /**
-     * Validates the data part of the column (everything in the Column object but the name, which is assumed to be valid)
+     * Validates the data part of the column (everything in the column object but the name, which is assumed to be valid)
      */
-    public static void validateColumnData(CFMetaData metadata, Column column, boolean isSubColumn) throws org.apache.cassandra.exceptions.InvalidRequestException
+    public static void validateColumnData(CFMetaData metadata, ByteBuffer scName, Column column) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
         validateTtl(column);
         if (!column.isSetValue())
@@ -443,39 +439,42 @@
         if (!column.isSetTimestamp())
             throw new org.apache.cassandra.exceptions.InvalidRequestException("Column timestamp is required");
 
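+        // Build the internal cell name, prefixing sub-columns with their super column name when present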
+        CellName cn = scName == null
+                    ? metadata.comparator.cellFromByteBuffer(column.name)
+                    : metadata.comparator.makeCellName(scName, column.name);
         try
         {
-            AbstractType<?> validator = metadata.getValueValidatorFromColumnName(column.name);
+            AbstractType<?> validator = metadata.getValueValidator(cn);
             if (validator != null)
                 validator.validate(column.value);
         }
         catch (MarshalException me)
         {
             if (logger.isDebugEnabled())
-                logger.debug("rejecting invalid value " + ByteBufferUtil.bytesToHex(summarize(column.value)));
+                logger.debug("rejecting invalid value {}", ByteBufferUtil.bytesToHex(summarize(column.value)));
 
             throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("(%s) [%s][%s][%s] failed validation",
                                                                       me.getMessage(),
                                                                       metadata.ksName,
                                                                       metadata.cfName,
-                                                                      (SuperColumns.getComparatorFor(metadata, isSubColumn)).getString(column.name)));
+                                                                      (SuperColumns.getComparatorFor(metadata, scName != null)).getString(column.name)));
         }
 
         // Indexed column values cannot be larger than 64K.  See CASSANDRA-3057/4240 for more details
-        if (!Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(asDBColumn(column)))
+        if (!Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(asDBColumn(cn, column)))
                     throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Can't index column value of size %d for index %s in CF %s of KS %s",
                                                                               column.value.remaining(),
-                                                                              metadata.getColumnDefinitionFromColumnName(column.name).getIndexName(),
+                                                                              metadata.getColumnDefinition(cn).getIndexName(),
                                                                               metadata.cfName,
                                                                               metadata.ksName));
     }
 
-    private static org.apache.cassandra.db.Column asDBColumn(Column column)
+    private static Cell asDBColumn(CellName name, Column column)
     {
         if (column.ttl <= 0)
-            return new org.apache.cassandra.db.Column(column.name, column.value, column.timestamp);
+            return new BufferCell(name, column.value, column.timestamp);
         else
-            return new org.apache.cassandra.db.ExpiringColumn(column.name, column.value, column.timestamp, column.ttl);
+            return new BufferExpiringCell(name, column.value, column.timestamp, column.ttl);
     }
 
     /**
@@ -535,7 +534,7 @@
         {
             // start_token/end_token can wrap, but key/token should not
             RowPosition stop = p.getTokenFactory().fromString(range.end_token).maxKeyBound(p);
-            if (RowPosition.forKey(range.start_key, p).compareTo(stop) > 0 && !stop.isMinimum())
+            if (RowPosition.ForKey.get(range.start_key, p).compareTo(stop) > 0 && !stop.isMinimum())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("Start key's token sorts after end token");
         }
 
@@ -596,7 +595,8 @@
             if (expression.value.remaining() > 0xFFFF)
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("Index expression values may not be larger than 64K");
 
-            AbstractType<?> valueValidator = metadata.getValueValidatorFromColumnName(expression.column_name);
+            CellName name = metadata.comparator.cellFromByteBuffer(expression.column_name);
+            AbstractType<?> valueValidator = metadata.getValueValidator(name);
             try
             {
                 valueValidator.validate(expression.value);
@@ -609,7 +609,7 @@
                                                                                   me.getMessage()));
             }
 
-            isIndexed |= (expression.op == IndexOperator.EQ) && idxManager.indexes(expression.column_name);
+            isIndexed |= (expression.op == IndexOperator.EQ) && idxManager.indexes(name);
         }
 
         return isIndexed;
@@ -639,23 +639,28 @@
     {
         SliceRange sr = sp.slice_range;
         IDiskAtomFilter filter;
+
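+        // On a super CF, the predicate names refer to super columns when no super column is given, otherwise to its sub-columns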
+        CellNameType comparator = metadata.isSuper()
+                                ? new SimpleDenseCellNameType(metadata.comparator.subtype(superColumn == null ? 0 : 1))
+                                : metadata.comparator;
         if (sr == null)
         {
-            AbstractType<?> comparator = metadata.isSuper()
-                    ? ((CompositeType)metadata.comparator).types.get(superColumn == null ? 0 : 1)
-                    : metadata.comparator;
 
-            SortedSet<ByteBuffer> ss = new TreeSet<ByteBuffer>(comparator);
-            ss.addAll(sp.column_names);
+            SortedSet<CellName> ss = new TreeSet<CellName>(comparator);
+            for (ByteBuffer bb : sp.column_names)
+                ss.add(comparator.cellFromByteBuffer(bb));
             filter = new NamesQueryFilter(ss);
         }
         else
         {
-            filter = new SliceQueryFilter(sr.start, sr.finish, sr.reversed, sr.count);
+            filter = new SliceQueryFilter(comparator.fromByteBuffer(sr.start),
+                                          comparator.fromByteBuffer(sr.finish),
+                                          sr.reversed,
+                                          sr.count);
         }
 
         if (metadata.isSuper())
-            filter = SuperColumns.fromSCFilter((CompositeType)metadata.comparator, superColumn, filter);
+            filter = SuperColumns.fromSCFilter(metadata.comparator, superColumn, filter);
         return filter;
     }
 }
diff --git a/src/java/org/apache/cassandra/tools/BulkLoader.java b/src/java/org/apache/cassandra/tools/BulkLoader.java
index 4077722..50e340b 100644
--- a/src/java/org/apache/cassandra/tools/BulkLoader.java
+++ b/src/java/org/apache/cassandra/tools/BulkLoader.java
@@ -20,22 +20,22 @@
 import java.io.File;
 import java.net.*;
 import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeUnit;
 
 import com.google.common.base.Joiner;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
 import com.google.common.collect.Sets;
 
-import org.apache.cassandra.config.*;
-
 import org.apache.commons.cli.*;
 import org.apache.thrift.protocol.TBinaryProtocol;
 import org.apache.thrift.protocol.TProtocol;
 import org.apache.thrift.transport.TTransport;
 
 import org.apache.cassandra.auth.IAuthenticator;
+import org.apache.cassandra.config.*;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -49,7 +49,6 @@
 {
     private static final String TOOL_NAME = "sstableloader";
     private static final String VERBOSE_OPTION  = "verbose";
-    private static final String DEBUG_OPTION  = "debug";
     private static final String HELP_OPTION  = "help";
     private static final String NOPROGRESS_OPTION  = "no-progress";
     private static final String IGNORE_NODES_OPTION  = "ignore";
@@ -70,55 +69,67 @@
     private static final String SSL_ALGORITHM = "ssl-alg";
     private static final String SSL_STORE_TYPE = "store-type";
     private static final String SSL_CIPHER_SUITES = "ssl-ciphers";
+    private static final String CONNECTIONS_PER_HOST = "connections-per-host";
     private static final String CONFIG_PATH = "conf-path";
 
     public static void main(String args[])
     {
         LoaderOptions options = LoaderOptions.parseArgs(args);
         OutputHandler handler = new OutputHandler.SystemOutput(options.verbose, options.debug);
-        SSTableLoader loader = new SSTableLoader(options.directory, new ExternalClient(options.hosts,
-                                                                                       options.rpcPort,
-                                                                                       options.user,
-                                                                                       options.passwd,
-                                                                                       options.transportFactory,
-                                                                                       options.storagePort,
-                                                                                       options.sslStoragePort,
-                                                                                       options.serverEncOptions), handler);
+        SSTableLoader loader = new SSTableLoader(
+                options.directory,
+                new ExternalClient(
+                        options.hosts,
+                        options.rpcPort,
+                        options.user,
+                        options.passwd,
+                        options.transportFactory,
+                        options.storagePort,
+                        options.sslStoragePort,
+                        options.serverEncOptions),
+                handler,
+                options.connectionsPerHost);
         DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(options.throttle);
         StreamResultFuture future = null;
+
+        ProgressIndicator indicator = new ProgressIndicator();
         try
         {
             if (options.noProgress)
+            {
                 future = loader.stream(options.ignores);
+            }
             else
-                future = loader.stream(options.ignores, new ProgressIndicator());
+            {
+                future = loader.stream(options.ignores, indicator);
+            }
+
         }
         catch (Exception e)
         {
             System.err.println(e.getMessage());
             if (e.getCause() != null)
                 System.err.println(e.getCause());
-            if (options.debug)
-                e.printStackTrace(System.err);
-            else
-                System.err.println("Run with --debug to get full stack trace or --help to get help.");
+            e.printStackTrace(System.err);
             System.exit(1);
         }
 
-        handler.output(String.format("Streaming session ID: %s", future.planId));
-
         try
         {
             future.get();
+
+            if (!options.noProgress)
+                indicator.printSummary(options.connectionsPerHost);
+
+            // Give sockets time to gracefully close
+            Thread.sleep(1000);
             System.exit(0); // We need that to stop non daemonized threads
         }
         catch (Exception e)
         {
             System.err.println("Streaming to the following hosts failed:");
             System.err.println(loader.getFailedHosts());
-            System.err.println(e);
-            if (options.debug)
-                e.printStackTrace(System.err);
+            e.printStackTrace(System.err);
             System.exit(1);
         }
     }
@@ -126,13 +137,15 @@
     // Return true when everything is at 100%
     static class ProgressIndicator implements StreamEventHandler
     {
-        private final Map<InetAddress, SessionInfo> sessionsByHost = new ConcurrentHashMap<>();
-        private final Map<InetAddress, Set<ProgressInfo>> progressByHost = new ConcurrentHashMap<>();
-
         private long start;
         private long lastProgress;
         private long lastTime;
 
+        private int peak = 0;
+        private int totalFiles = 0;
+
+        private final Multimap<InetAddress, SessionInfo> sessionsByHost = HashMultimap.create();
+
         public ProgressIndicator()
         {
             start = lastTime = System.nanoTime();
@@ -141,70 +154,100 @@
         public void onSuccess(StreamState finalState) {}
         public void onFailure(Throwable t) {}
 
-        public void handleStreamEvent(StreamEvent event)
+        public synchronized void handleStreamEvent(StreamEvent event)
         {
             if (event.eventType == StreamEvent.Type.STREAM_PREPARED)
             {
                 SessionInfo session = ((StreamEvent.SessionPreparedEvent) event).session;
                 sessionsByHost.put(session.peer, session);
             }
-            else if (event.eventType == StreamEvent.Type.FILE_PROGRESS)
+            else if (event.eventType == StreamEvent.Type.FILE_PROGRESS || event.eventType == StreamEvent.Type.STREAM_COMPLETE)
             {
-                ProgressInfo progressInfo = ((StreamEvent.ProgressEvent) event).progress;
-
-                // update progress
-                Set<ProgressInfo> progresses = progressByHost.get(progressInfo.peer);
-                if (progresses == null)
+                ProgressInfo progressInfo = null;
+                if (event.eventType == StreamEvent.Type.FILE_PROGRESS)
                 {
-                    progresses = Sets.newSetFromMap(new ConcurrentHashMap<ProgressInfo, Boolean>());
-                    progressByHost.put(progressInfo.peer, progresses);
+                    progressInfo = ((StreamEvent.ProgressEvent) event).progress;
                 }
-                if (progresses.contains(progressInfo))
-                    progresses.remove(progressInfo);
-                progresses.add(progressInfo);
+
+                long time = System.nanoTime();
+                long deltaTime = time - lastTime;
 
                 StringBuilder sb = new StringBuilder();
                 sb.append("\rprogress: ");
 
                 long totalProgress = 0;
                 long totalSize = 0;
-                for (Map.Entry<InetAddress, Set<ProgressInfo>> entry : progressByHost.entrySet())
-                {
-                    SessionInfo session = sessionsByHost.get(entry.getKey());
 
-                    long size = session.getTotalSizeToSend();
-                    long current = 0;
-                    int completed = 0;
-                    for (ProgressInfo progress : entry.getValue())
+                boolean updateTotalFiles = totalFiles == 0;
+                // recalculate progress across all sessions in all hosts and display
+                for (InetAddress peer : sessionsByHost.keySet())
+                {
+                    sb.append("[").append(peer.toString()).append("]");
+
+                    for (SessionInfo session : sessionsByHost.get(peer))
                     {
-                        if (progress.currentBytes == progress.totalBytes)
-                            completed++;
-                        current += progress.currentBytes;
+                        long size = session.getTotalSizeToSend();
+                        long current = 0;
+                        int completed = 0;
+
+                        if (progressInfo != null && session.peer.equals(progressInfo.peer) && (session.sessionIndex == progressInfo.sessionIndex))
+                        {
+                            session.updateProgress(progressInfo);
+                        }
+                        for (ProgressInfo progress : session.getSendingFiles())
+                        {
+                            if (progress.isCompleted())
+                                completed++;
+                            current += progress.currentBytes;
+                        }
+                        totalProgress += current;
+
+                        totalSize += size;
+
+                        sb.append(session.sessionIndex).append(":");
+                        sb.append(completed).append("/").append(session.getTotalFilesToSend());
+                        sb.append(" ").append(String.format("%-3d", size == 0 ? 100L : current * 100L / size)).append("% ");
+
+                        if (updateTotalFiles)
+                            totalFiles += session.getTotalFilesToSend();
                     }
-                    totalProgress += current;
-                    totalSize += size;
-                    sb.append("[").append(entry.getKey());
-                    sb.append(" ").append(completed).append("/").append(session.getTotalFilesToSend());
-                    sb.append(" (").append(size == 0 ? 100L : current * 100L / size).append("%)] ");
                 }
-                long time = System.nanoTime();
-                long deltaTime = TimeUnit.NANOSECONDS.toMillis(time - lastTime);
+
                 lastTime = time;
                 long deltaProgress = totalProgress - lastProgress;
                 lastProgress = totalProgress;
 
-                sb.append("[total: ").append(totalSize == 0 ? 100L : totalProgress * 100L / totalSize).append("% - ");
-                sb.append(mbPerSec(deltaProgress, deltaTime)).append("MB/s");
-                sb.append(" (avg: ").append(mbPerSec(totalProgress, TimeUnit.NANOSECONDS.toMillis(time - start))).append("MB/s)]");
+                sb.append("total: ").append(totalSize == 0 ? 100L : totalProgress * 100L / totalSize).append("% ");
+                sb.append(String.format("%-3d", mbPerSec(deltaProgress, deltaTime))).append("MB/s");
+                int average = mbPerSec(totalProgress, (time - start));
+                if (average > peak)
+                    peak = average;
+                sb.append("(avg: ").append(average).append(" MB/s)");
 
-                System.out.print(sb.toString());
+                System.err.print(sb.toString());
             }
         }
 
-        private int mbPerSec(long bytes, long timeInMs)
+        private int mbPerSec(long bytes, long timeInNano)
         {
-            double bytesPerMs = ((double)bytes) / timeInMs;
-            return (int)((bytesPerMs * 1000) / (1024 * 2024));
+            double bytesPerNano = ((double)bytes) / timeInNano;
+            return (int)((bytesPerNano * 1000 * 1000 * 1000) / (1024 * 1024));
+        }
+
+        private void printSummary(int connectionsPerHost)
+        {
+            long end = System.nanoTime();
+            long durationMS = ((end - start) / (1000000));
+            int average = mbPerSec(lastProgress, (end - start));
+            StringBuilder sb = new StringBuilder();
+            sb.append("\nSummary statistics: \n");
+            sb.append(String.format("   %-30s: %-10d%n", "Connections per host: ", connectionsPerHost));
+            sb.append(String.format("   %-30s: %-10d%n", "Total files transferred: ", totalFiles));
+            sb.append(String.format("   %-30s: %-10d%n", "Total bytes transferred: ", lastProgress));
+            sb.append(String.format("   %-30s: %-10d%n", "Total duration (ms): ", durationMS));
+            sb.append(String.format("   %-30s: %-10d%n", "Average transfer rate (MB/s): ", + average));
+            sb.append(String.format("   %-30s: %-10d%n", "Peak transfer rate (MB/s): ", + peak));
+            System.err.println(sb.toString());
         }
     }
 
@@ -264,14 +307,24 @@
                         }
                     }
 
-                    String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = '%s'",
+                    String cfQuery = String.format("SELECT * FROM %s.%s WHERE keyspace_name = '%s'",
                                                  Keyspace.SYSTEM_KS,
                                                  SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF,
                                                  keyspace);
-                    CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE, ConsistencyLevel.ONE);
-                    for (CqlRow row : result.rows)
+                    CqlResult cfRes = client.execute_cql3_query(ByteBufferUtil.bytes(cfQuery), Compression.NONE, ConsistencyLevel.ONE);
+
+
+                    for (CqlRow row : cfRes.rows)
                     {
-                        CFMetaData metadata = CFMetaData.fromThriftCqlRow(row);
+                        String columnFamily = UTF8Type.instance.getString(row.columns.get(1).bufferForName());
+                        String columnsQuery = String.format("SELECT * FROM %s.%s WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
+                                                            Keyspace.SYSTEM_KS,
+                                                            SystemKeyspace.SCHEMA_COLUMNS_CF,
+                                                            keyspace,
+                                                            columnFamily);
+                        CqlResult columnsRes = client.execute_cql3_query(ByteBufferUtil.bytes(columnsQuery), Compression.NONE, ConsistencyLevel.ONE);
+
+                        CFMetaData metadata = CFMetaData.fromThriftCqlRow(row, columnsRes);
                         knownCfs.put(metadata.cfName, metadata);
                     }
                     break;
@@ -328,6 +381,7 @@
         public int sslStoragePort;
         public ITransportFactory transportFactory = new TFramedTransportFactory();
         public EncryptionOptions encOptions = new EncryptionOptions.ClientEncryptionOptions();
+        public int connectionsPerHost = 1;
         public EncryptionOptions.ServerEncryptionOptions serverEncOptions = new EncryptionOptions.ServerEncryptionOptions();
 
         public final Set<InetAddress> hosts = new HashSet<>();
@@ -378,7 +432,6 @@
 
                 LoaderOptions opts = new LoaderOptions(dir);
 
-                opts.debug = cmd.hasOption(DEBUG_OPTION);
                 opts.verbose = cmd.hasOption(VERBOSE_OPTION);
                 opts.noProgress = cmd.hasOption(NOPROGRESS_OPTION);
 
@@ -430,6 +483,9 @@
                     }
                 }
 
+                if (cmd.hasOption(CONNECTIONS_PER_HOST))
+                    opts.connectionsPerHost = Integer.parseInt(cmd.getOptionValue(CONNECTIONS_PER_HOST));
+
                 // try to load config file first, so that values can be rewritten with other option values.
                 // otherwise use default config.
                 Config config;
@@ -569,7 +625,6 @@
         private static CmdLineOptions getCmdLineOptions()
         {
             CmdLineOptions options = new CmdLineOptions();
-            options.addOption(null, DEBUG_OPTION,        "display stack traces");
             options.addOption("v",  VERBOSE_OPTION,      "verbose output");
             options.addOption("h",  HELP_OPTION,         "display this help message");
             options.addOption(null, NOPROGRESS_OPTION,   "don't display progress");
@@ -580,6 +635,7 @@
             options.addOption("u",  USER_OPTION, "username", "username for cassandra authentication");
             options.addOption("pw", PASSWD_OPTION, "password", "password for cassandra authentication");
             options.addOption("tf", TRANSPORT_FACTORY, "transport factory", "Fully-qualified ITransportFactory class name for creating a connection to cassandra");
+            options.addOption("cph", CONNECTIONS_PER_HOST, "connectionsPerHost", "number of concurrent connections-per-host.");
             // ssl connection-related options
             options.addOption("ts", SSL_TRUSTSTORE, "TRUSTSTORE", "Client SSL: full path to truststore");
             options.addOption("tspw", SSL_TRUSTSTORE_PW, "TRUSTSTORE-PASSWORD", "Client SSL: password of the truststore");
diff --git a/src/java/org/apache/cassandra/tools/NodeCmd.java b/src/java/org/apache/cassandra/tools/NodeCmd.java
deleted file mode 100644
index 27b50a7..0000000
--- a/src/java/org/apache/cassandra/tools/NodeCmd.java
+++ /dev/null
@@ -1,1877 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.tools;
-
-import java.io.*;
-import java.lang.management.MemoryUsage;
-import java.net.ConnectException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.text.DecimalFormat;
-import java.text.SimpleDateFormat;
-import java.util.*;
-import java.util.Map.Entry;
-import java.util.concurrent.ExecutionException;
-import javax.management.openmbean.TabularData;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.LinkedHashMultimap;
-import com.google.common.collect.Maps;
-
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.commons.cli.*;
-import org.yaml.snakeyaml.Yaml;
-import org.yaml.snakeyaml.constructor.Constructor;
-import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutorMBean;
-import org.apache.cassandra.db.ColumnFamilyStoreMBean;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.CompactionManagerMBean;
-import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.EndpointSnitchInfoMBean;
-import org.apache.cassandra.net.MessagingServiceMBean;
-import org.apache.cassandra.service.CacheServiceMBean;
-import org.apache.cassandra.service.StorageProxyMBean;
-import org.apache.cassandra.streaming.StreamState;
-import org.apache.cassandra.streaming.ProgressInfo;
-import org.apache.cassandra.streaming.SessionInfo;
-import org.apache.cassandra.utils.EstimatedHistogram;
-import org.apache.cassandra.utils.Pair;
-
-import static org.apache.commons.lang3.StringUtils.EMPTY;
-
-
-public class NodeCmd
-{
-    private static final String HISTORYFILE = "nodetool.history";
-    private static final Pair<String, String> SNAPSHOT_COLUMNFAMILY_OPT = Pair.create("cf", "column-family");
-    private static final Pair<String, String> HOST_OPT = Pair.create("h", "host");
-    private static final Pair<String, String> PORT_OPT = Pair.create("p", "port");
-    private static final Pair<String, String> USERNAME_OPT = Pair.create("u", "username");
-    private static final Pair<String, String> PASSWORD_OPT = Pair.create("pw", "password");
-    private static final Pair<String, String> TAG_OPT = Pair.create("t", "tag");
-    private static final Pair<String, String> TOKENS_OPT = Pair.create("T", "tokens");
-    private static final Pair<String, String> PRIMARY_RANGE_OPT = Pair.create("pr", "partitioner-range");
-    private static final Pair<String, String> PARALLEL_REPAIR_OPT = Pair.create("par", "parallel");
-    private static final Pair<String, String> LOCAL_DC_REPAIR_OPT = Pair.create("local", "in-local-dc");
-    private static final Pair<String, String> HOST_REPAIR_OPT = Pair.create("hosts", "in-host");
-    private static final Pair<String, String> DC_REPAIR_OPT = Pair.create("dc", "in-dc");
-    private static final Pair<String, String> START_TOKEN_OPT = Pair.create("st", "start-token");
-    private static final Pair<String, String> END_TOKEN_OPT = Pair.create("et", "end-token");
-    private static final Pair<String, String> UPGRADE_ALL_SSTABLE_OPT = Pair.create("a", "include-all-sstables");
-    private static final Pair<String, String> NO_SNAPSHOT = Pair.create("ns", "no-snapshot");
-    private static final Pair<String, String> CFSTATS_IGNORE_OPT = Pair.create("i", "ignore");
-    private static final Pair<String, String> RESOLVE_IP = Pair.create("r", "resolve-ip");
-    private static final Pair<String, String> SCRUB_SKIP_CORRUPTED_OPT = Pair.create("s", "skip-corrupted");
-    private static final Pair<String, String> COMPACT_OPT = Pair.create("c", "compact");
-
-    private static final String DEFAULT_HOST = "127.0.0.1";
-    private static final int DEFAULT_PORT = 7199;
-
-    private static final ToolOptions options = new ToolOptions();
-
-    private final NodeProbe probe;
-
-    static
-    {
-        options.addOption(SNAPSHOT_COLUMNFAMILY_OPT, true, "only take a snapshot of the specified table (column family)");
-        options.addOption(HOST_OPT,     true, "node hostname or ip address");
-        options.addOption(PORT_OPT,     true, "remote jmx agent port number");
-        options.addOption(USERNAME_OPT, true, "remote jmx agent username");
-        options.addOption(PASSWORD_OPT, true, "remote jmx agent password");
-        options.addOption(TAG_OPT,      true, "optional name to give a snapshot");
-        options.addOption(TOKENS_OPT,   false, "display all tokens");
-        options.addOption(PRIMARY_RANGE_OPT, false, "only repair the first range returned by the partitioner for the node");
-        options.addOption(PARALLEL_REPAIR_OPT, false, "repair nodes in parallel.");
-        options.addOption(LOCAL_DC_REPAIR_OPT, false, "only repair against nodes in the same datacenter");
-        options.addOption(DC_REPAIR_OPT, true, "only repair against nodes in the specified datacenters (comma separated)");
-        options.addOption(HOST_REPAIR_OPT, true, "only repair against specified nodes (comma separated)");
-        options.addOption(START_TOKEN_OPT, true, "token at which repair range starts");
-        options.addOption(END_TOKEN_OPT, true, "token at which repair range ends");
-        options.addOption(UPGRADE_ALL_SSTABLE_OPT, false, "includes sstables that are already on the most recent version during upgradesstables");
-        options.addOption(NO_SNAPSHOT, false, "disables snapshot creation for scrub");
-        options.addOption(CFSTATS_IGNORE_OPT, false, "ignore the supplied list of keyspace.columnfamiles in statistics");
-        options.addOption(RESOLVE_IP, false, "show node domain names instead of IPs");
-        options.addOption(SCRUB_SKIP_CORRUPTED_OPT, false, "when scrubbing counter tables, skip corrupted rows");
-        options.addOption(COMPACT_OPT, false, "print histograms in a more compact format");
-    }
-
-    public NodeCmd(NodeProbe probe)
-    {
-        this.probe = probe;
-    }
-
-    private enum NodeCommand
-    {
-        CFHISTOGRAMS,
-        CFSTATS,
-        CLEANUP,
-        CLEARSNAPSHOT,
-        COMPACT,
-        COMPACTIONSTATS,
-        COMPACTIONHISTORY,
-        DECOMMISSION,
-        DESCRIBECLUSTER,
-        DISABLEBINARY,
-        DISABLEGOSSIP,
-        DISABLEHANDOFF,
-        DISABLETHRIFT,
-        DRAIN,
-        ENABLEBINARY,
-        ENABLEGOSSIP,
-        ENABLEHANDOFF,
-        ENABLETHRIFT,
-        FLUSH,
-        GETCOMPACTIONTHRESHOLD,
-        DISABLEAUTOCOMPACTION,
-        ENABLEAUTOCOMPACTION,
-        GETCOMPACTIONTHROUGHPUT,
-        GETSTREAMTHROUGHPUT,
-        GETENDPOINTS,
-        GETSSTABLES,
-        GOSSIPINFO,
-        HELP,
-        INFO,
-        INVALIDATEKEYCACHE,
-        INVALIDATEROWCACHE,
-        JOIN,
-        MOVE,
-        NETSTATS,
-        PAUSEHANDOFF,
-        PROXYHISTOGRAMS,
-        REBUILD,
-        REFRESH,
-        REMOVETOKEN,
-        REMOVENODE,
-        REPAIR,
-        RESUMEHANDOFF,
-        RING,
-        SCRUB,
-        SETCACHECAPACITY,
-        SETCOMPACTIONTHRESHOLD,
-        SETCOMPACTIONTHROUGHPUT,
-        SETSTREAMTHROUGHPUT,
-        SETTRACEPROBABILITY,
-        SNAPSHOT,
-        STATUS,
-        STATUSBINARY,
-        STATUSTHRIFT,
-        STOP,
-        STOPDAEMON,
-        TPSTATS,
-        TRUNCATEHINTS,
-        UPGRADESSTABLES,
-        VERSION,
-        DESCRIBERING,
-        RANGEKEYSAMPLE,
-        REBUILD_INDEX,
-        RESETLOCALSCHEMA,
-        ENABLEBACKUP,
-        DISABLEBACKUP,
-        SETCACHEKEYSTOSAVE,
-        RELOADTRIGGERS,
-        SETLOGGINGLEVEL,
-        GETLOGGINGLEVELS,
-        SETHINTEDHANDOFFTHROTTLEKB
-    }
-
-
-    /**
-     * Prints usage information to stdout.
-     */
-    private static void printUsage()
-    {
-        HelpFormatter hf = new HelpFormatter();
-        StringBuilder header = new StringBuilder(512);
-        header.append("\nAvailable commands\n");
-        final NodeToolHelp ntHelp = loadHelp();
-        Collections.sort(ntHelp.commands, new Comparator<NodeToolHelp.NodeToolCommand>() 
-        {
-            @Override
-            public int compare(NodeToolHelp.NodeToolCommand o1, NodeToolHelp.NodeToolCommand o2) 
-            {
-                return o1.name.compareTo(o2.name);
-            }
-        });
-        for(NodeToolHelp.NodeToolCommand cmd : ntHelp.commands)
-            addCmdHelp(header, cmd);
-        String usage = String.format("java %s --host <arg> <command>%n", NodeCmd.class.getName());
-        hf.printHelp(usage, "", options, "");
-        System.out.println(header.toString());
-    }
-
-    private static NodeToolHelp loadHelp()
-    {
-        final InputStream is = NodeCmd.class.getClassLoader().getResourceAsStream("org/apache/cassandra/tools/NodeToolHelp.yaml");
-        assert is != null;
-
-        try
-        {
-            final Constructor constructor = new Constructor(NodeToolHelp.class);
-            final Yaml yaml = new Yaml(constructor);
-            return (NodeToolHelp)yaml.load(is);
-        }
-        finally
-        {
-            FileUtils.closeQuietly(is);
-        }
-    }
-
-    private static void addCmdHelp(StringBuilder sb, NodeToolHelp.NodeToolCommand cmd)
-    {
-        sb.append("  ").append(cmd.name);
-        // Ghetto indentation (trying, but not too hard, to not look too bad)
-        if (cmd.name.length() <= 20)
-            for (int i = cmd.name.length(); i < 22; ++i) sb.append(" ");
-        sb.append(" - ").append(cmd.help);
-  }
-
-
-    /**
-     * Write a textual representation of the Cassandra ring.
-     *
-     * @param outs
-     *            the stream to write to
-     */
-    public void printRing(PrintStream outs, String keyspace, boolean resolveIp)
-    {
-        Map<String, String> tokensToEndpoints = probe.getTokenToEndpointMap();
-        LinkedHashMultimap<String, String> endpointsToTokens = LinkedHashMultimap.create();
-        boolean haveVnodes = false;
-        for (Map.Entry<String, String> entry : tokensToEndpoints.entrySet())
-        {
-            haveVnodes |= endpointsToTokens.containsKey(entry.getValue());
-            endpointsToTokens.put(entry.getValue(), entry.getKey());
-        }
-
-        int maxAddressLength = Collections.max(endpointsToTokens.keys(), new Comparator<String>() {
-            @Override
-            public int compare(String first, String second)
-            {
-                return ((Integer)first.length()).compareTo((Integer)second.length());
-            }
-        }).length();
-
-        String formatPlaceholder = "%%-%ds  %%-12s%%-7s%%-8s%%-16s%%-20s%%-44s%%n";
-        String format = String.format(formatPlaceholder, maxAddressLength);
-
-        // Calculate per-token ownership of the ring
-        Map<InetAddress, Float> ownerships;
-        boolean keyspaceSelected;
-        try
-        {
-            ownerships = probe.effectiveOwnership(keyspace);
-            keyspaceSelected = true;
-        }
-        catch (IllegalStateException ex)
-        {
-            ownerships = probe.getOwnership();
-            outs.printf("Note: Ownership information does not include topology; for complete information, specify a keyspace%n");
-            keyspaceSelected = false;
-        }
-        try
-        {
-            outs.println();
-            for (Entry<String, SetHostStat> entry : getOwnershipByDc(resolveIp, tokensToEndpoints, ownerships).entrySet())
-                printDc(outs, format, entry.getKey(), endpointsToTokens, keyspaceSelected, entry.getValue());
-        }
-        catch (UnknownHostException e)
-        {
-            throw new RuntimeException(e);
-        }
-
-        if(haveVnodes)
-        {
-            outs.println("  Warning: \"nodetool ring\" is used to output all the tokens of a node.");
-            outs.println("  To view status related info of a node use \"nodetool status\" instead.\n");
-        }
-    }
-
-    private void printDc(PrintStream outs, String format, String dc, LinkedHashMultimap<String, String> endpointsToTokens,
-                         boolean keyspaceSelected, SetHostStat hoststats)
-    {
-        Collection<String> liveNodes = probe.getLiveNodes();
-        Collection<String> deadNodes = probe.getUnreachableNodes();
-        Collection<String> joiningNodes = probe.getJoiningNodes();
-        Collection<String> leavingNodes = probe.getLeavingNodes();
-        Collection<String> movingNodes = probe.getMovingNodes();
-        Map<String, String> loadMap = probe.getLoadMap();
-
-        outs.println("Datacenter: " + dc);
-        outs.println("==========");
-
-        // get the total amount of replicas for this dc and the last token in this dc's ring
-        List<String> tokens = new ArrayList<String>();
-        String lastToken = "";
-
-        for (HostStat stat : hoststats)
-        {
-            tokens.addAll(endpointsToTokens.get(stat.endpoint.getHostAddress()));
-            lastToken = tokens.get(tokens.size() - 1);
-        }
-
-        outs.printf(format, "Address", "Rack", "Status", "State", "Load", "Owns", "Token");
-
-        if (hoststats.size() > 1)
-            outs.printf(format, "", "", "", "", "", "", lastToken);
-        else
-            outs.println();
-
-        for (HostStat stat : hoststats)
-        {
-            String endpoint = stat.endpoint.getHostAddress();
-            String rack;
-            try
-            {
-                rack = probe.getEndpointSnitchInfoProxy().getRack(endpoint);
-            }
-            catch (UnknownHostException e)
-            {
-                rack = "Unknown";
-            }
-
-            String status = liveNodes.contains(endpoint)
-                    ? "Up"
-                    : deadNodes.contains(endpoint)
-                            ? "Down"
-                            : "?";
-
-            String state = "Normal";
-
-            if (joiningNodes.contains(endpoint))
-                state = "Joining";
-            else if (leavingNodes.contains(endpoint))
-                state = "Leaving";
-            else if (movingNodes.contains(endpoint))
-                state = "Moving";
-
-            String load = loadMap.containsKey(endpoint)
-                    ? loadMap.get(endpoint)
-                    : "?";
-            String owns = stat.owns != null ? new DecimalFormat("##0.00%").format(stat.owns) : "?";
-            outs.printf(format, stat.ipOrDns(), rack, status, state, load, owns, stat.token);
-        }
-        outs.println();
-    }
-
-    private class ClusterStatus
-    {
-        String kSpace = null, format = null;
-        int maxAddressLength;
-        Collection<String> joiningNodes, leavingNodes, movingNodes, liveNodes, unreachableNodes;
-        Map<String, String> loadMap, hostIDMap, tokensToEndpoints;
-        EndpointSnitchInfoMBean epSnitchInfo;
-        PrintStream outs;
-        private final boolean resolveIp;
-
-        ClusterStatus(PrintStream outs, String kSpace, boolean resolveIp)
-        {
-            this.kSpace = kSpace;
-            this.outs = outs;
-            this.resolveIp = resolveIp;
-            joiningNodes = probe.getJoiningNodes();
-            leavingNodes = probe.getLeavingNodes();
-            movingNodes = probe.getMovingNodes();
-            loadMap = probe.getLoadMap();
-            tokensToEndpoints = probe.getTokenToEndpointMap();
-            liveNodes = probe.getLiveNodes();
-            unreachableNodes = probe.getUnreachableNodes();
-            hostIDMap = probe.getHostIdMap();
-            epSnitchInfo = probe.getEndpointSnitchInfoProxy();
-        }
-
-        private void printStatusLegend()
-        {
-            outs.println("Status=Up/Down");
-            outs.println("|/ State=Normal/Leaving/Joining/Moving");
-        }
-
-        private String getFormat(boolean hasEffectiveOwns, boolean isTokenPerNode)
-        {
-            if (format == null)
-            {
-                StringBuilder buf = new StringBuilder();
-                String addressPlaceholder = String.format("%%-%ds  ", maxAddressLength);
-                buf.append("%s%s  ");                         // status
-                buf.append(addressPlaceholder);               // address
-                buf.append("%-9s  ");                         // load
-                if (!isTokenPerNode)  buf.append("%-6s  ");   // "Tokens"
-                if (hasEffectiveOwns) buf.append("%-16s  ");  // "Owns (effective)"
-                else                  buf.append("%-5s  ");   // "Owns
-                buf.append("%-36s  ");                        // Host ID
-                if (isTokenPerNode)   buf.append("%-39s  ");  // token
-                buf.append("%s%n");                           // "Rack"
-
-                format = buf.toString();
-            }
-
-            return format;
-        }
-
-        private void printNode(String endpoint, Float owns, List<HostStat> tokens, boolean hasEffectiveOwns, boolean isTokenPerNode) throws UnknownHostException
-        {
-            String status, state, load, strOwns, hostID, rack, fmt;
-            fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
-            if      (liveNodes.contains(endpoint))        status = "U";
-            else if (unreachableNodes.contains(endpoint)) status = "D";
-            else                                          status = "?";
-            if      (joiningNodes.contains(endpoint))     state = "J";
-            else if (leavingNodes.contains(endpoint))     state = "L";
-            else if (movingNodes.contains(endpoint))      state = "M";
-            else                                          state = "N";
-
-            load = loadMap.containsKey(endpoint) ? loadMap.get(endpoint) : "?";
-            strOwns = owns != null ? new DecimalFormat("##0.0%").format(owns) : "?";
-            hostID = hostIDMap.get(endpoint);
-            rack = epSnitchInfo.getRack(endpoint);
-
-            String endpointDns = tokens.get(0).ipOrDns();
-            if (isTokenPerNode)
-            {
-                outs.printf(fmt, status, state, endpointDns, load, strOwns, hostID, tokens.get(0).token, rack);
-            }
-            else
-            {
-                outs.printf(fmt, status, state, endpointDns, load, tokens.size(), strOwns, hostID, rack);
-            }
-        }
-
-        private void printNodesHeader(boolean hasEffectiveOwns, boolean isTokenPerNode)
-        {
-            String fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
-            String owns = hasEffectiveOwns ? "Owns (effective)" : "Owns";
-
-            if (isTokenPerNode)
-                outs.printf(fmt, "-", "-", "Address", "Load", owns, "Host ID", "Token", "Rack");
-            else
-                outs.printf(fmt, "-", "-", "Address", "Load", "Tokens", owns, "Host ID", "Rack");
-        }
-
-        void findMaxAddressLength(Map<String, SetHostStat> dcs) {
-            maxAddressLength = 0;
-            for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
-            {
-                for (HostStat stat : dc.getValue()) {
-                    maxAddressLength = Math.max(maxAddressLength, stat.ipOrDns().length());
-                }
-            }
-        }
-
-        void print() throws UnknownHostException
-        {
-            Map<InetAddress, Float> ownerships;
-            boolean hasEffectiveOwns = false, isTokenPerNode = true;
-
-            try
-            {
-                ownerships = probe.effectiveOwnership(kSpace);
-                hasEffectiveOwns = true;
-            }
-            catch (IllegalStateException e)
-            {
-                ownerships = probe.getOwnership();
-                outs.printf("Note: Ownership information does not include topology; for complete information, specify a keyspace%n");
-            }
-
-            Map<String, SetHostStat> dcs = getOwnershipByDc(resolveIp, tokensToEndpoints, ownerships);
-
-            // More tokens than nodes (aka vnodes)?
-            if (dcs.values().size() < tokensToEndpoints.keySet().size())
-                isTokenPerNode = false;
-
-            findMaxAddressLength(dcs);
-
-            // Datacenters
-            for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
-            {
-                String dcHeader = String.format("Datacenter: %s%n", dc.getKey());
-                outs.printf(dcHeader);
-                for (int i=0; i < (dcHeader.length() - 1); i++) outs.print('=');
-                outs.println();
-
-                printStatusLegend();
-                printNodesHeader(hasEffectiveOwns, isTokenPerNode);
-
-                ArrayListMultimap<InetAddress, HostStat> hostToTokens = ArrayListMultimap.create();
-                for (HostStat stat : dc.getValue())
-                    hostToTokens.put(stat.endpoint, stat);
-
-                // Nodes
-                for (InetAddress endpoint : hostToTokens.keySet())
-                {
-                    Float owns = ownerships.get(endpoint);
-                    List<HostStat> tokens = hostToTokens.get(endpoint);
-                    printNode(endpoint.getHostAddress(), owns, tokens, hasEffectiveOwns, isTokenPerNode);
-                }
-            }
-        }
-    }
-
-    private Map<String, SetHostStat> getOwnershipByDc(boolean resolveIp, Map<String, String> tokenToEndpoint, 
-                                                      Map<InetAddress, Float> ownerships) throws UnknownHostException
-    {
-        Map<String, SetHostStat> ownershipByDc = Maps.newLinkedHashMap();
-        EndpointSnitchInfoMBean epSnitchInfo = probe.getEndpointSnitchInfoProxy();
-
-        for (Entry<String, String> tokenAndEndPoint : tokenToEndpoint.entrySet())
-        {
-            String dc = epSnitchInfo.getDatacenter(tokenAndEndPoint.getValue());
-            if (!ownershipByDc.containsKey(dc))
-                ownershipByDc.put(dc, new SetHostStat(resolveIp));
-            ownershipByDc.get(dc).add(tokenAndEndPoint.getKey(), tokenAndEndPoint.getValue(), ownerships);
-        }
-
-        return ownershipByDc;
-    }
-
-    static class SetHostStat implements Iterable<HostStat> {
-        final List<HostStat> hostStats = new ArrayList<HostStat>();
-        final boolean resolveIp;
-
-        public SetHostStat(boolean resolveIp)
-        {
-            this.resolveIp = resolveIp;
-        }
-
-        public int size()
-        {
-            return hostStats.size();
-        }
-
-        @Override
-        public Iterator<HostStat> iterator() {
-            return hostStats.iterator();
-        }
-
-        public void add(String token, String host, Map<InetAddress, Float> ownerships) throws UnknownHostException {
-            InetAddress endpoint = InetAddress.getByName(host);
-            Float owns = ownerships.get(endpoint);
-            hostStats.add(new HostStat(token, endpoint, resolveIp, owns));
-        }
-    }
-
-    static class HostStat {
-        public final InetAddress endpoint;
-        public final boolean resolveIp;
-        public final Float owns;
-        public final String token;
-
-        public HostStat(String token, InetAddress endpoint, boolean resolveIp, Float owns) 
-        {
-            this.token = token;
-            this.endpoint = endpoint;
-            this.resolveIp = resolveIp;
-            this.owns = owns;
-        }
-
-        public String ipOrDns()
-        {
-            return (resolveIp) ? endpoint.getHostName() : endpoint.getHostAddress();
-        }
-    }
-
-    /** Writes a keyspaceName of cluster-wide node information to a PrintStream
-     * @throws UnknownHostException */
-    public void printClusterStatus(PrintStream outs, String keyspace, boolean resolveIp) throws UnknownHostException
-    {
-        new ClusterStatus(outs, keyspace, resolveIp).print();
-    }
-
-    public void printThreadPoolStats(PrintStream outs)
-    {
-        outs.printf("%-25s%10s%10s%15s%10s%18s%n", "Pool Name", "Active", "Pending", "Completed", "Blocked", "All time blocked");
-
-        Iterator<Map.Entry<String, JMXEnabledThreadPoolExecutorMBean>> threads = probe.getThreadPoolMBeanProxies();
-        while (threads.hasNext())
-        {
-            Entry<String, JMXEnabledThreadPoolExecutorMBean> thread = threads.next();
-            String poolName = thread.getKey();
-            JMXEnabledThreadPoolExecutorMBean threadPoolProxy = thread.getValue();
-            outs.printf("%-25s%10s%10s%15s%10s%18s%n",
-                        poolName,
-                        threadPoolProxy.getActiveCount(),
-                        threadPoolProxy.getPendingTasks(),
-                        threadPoolProxy.getCompletedTasks(),
-                        threadPoolProxy.getCurrentlyBlockedTasks(),
-                        threadPoolProxy.getTotalBlockedTasks());
-        }
-
-        outs.printf("%n%-20s%10s%n", "Message type", "Dropped");
-        for (Entry<String, Integer> entry : probe.getDroppedMessages().entrySet())
-            outs.printf("%-20s%10s%n", entry.getKey(), entry.getValue());
-    }
-
-    /**
-     * Write node information.
-     *
-     * @param outs the stream to write to
-     */
-    public void printInfo(PrintStream outs, ToolCommandLine cmd)
-    {
-        boolean gossipInitialized = probe.isInitialized();
-        List<String> toks = probe.getTokens();
-
-        // If there is just 1 token, print it now like we always have, otherwise,
-        // require that -T/--tokens be passed (that output is potentially verbose).
-        if (toks.size() == 1)
-            outs.printf("%-17s: %s%n", "Token", toks.get(0));
-        else if (!cmd.hasOption(TOKENS_OPT.left))
-            outs.printf("%-17s: (invoke with -T/--tokens to see all %d tokens)%n", "Token", toks.size());
-
-        outs.printf("%-17s: %s%n", "ID", probe.getLocalHostId());
-        outs.printf("%-17s: %s%n", "Gossip active", gossipInitialized);
-        outs.printf("%-17s: %s%n", "Thrift active", probe.isThriftServerRunning());
-        outs.printf("%-17s: %s%n", "Native Transport active", probe.isNativeTransportRunning());
-        outs.printf("%-17s: %s%n", "Load", probe.getLoadString());
-        if (gossipInitialized)
-            outs.printf("%-17s: %s%n", "Generation No", probe.getCurrentGenerationNumber());
-        else
-            outs.printf("%-17s: %s%n", "Generation No", 0);
-
-        // Uptime
-        long secondsUp = probe.getUptime() / 1000;
-        outs.printf("%-17s: %d%n", "Uptime (seconds)", secondsUp);
-
-        // Memory usage
-        MemoryUsage heapUsage = probe.getHeapMemoryUsage();
-        double memUsed = (double)heapUsage.getUsed() / (1024 * 1024);
-        double memMax = (double)heapUsage.getMax() / (1024 * 1024);
-        outs.printf("%-17s: %.2f / %.2f%n", "Heap Memory (MB)", memUsed, memMax);
-
-        // Data Center/Rack
-        outs.printf("%-17s: %s%n", "Data Center", probe.getDataCenter());
-        outs.printf("%-17s: %s%n", "Rack", probe.getRack());
-
-        // Exceptions
-        outs.printf("%-17s: %s%n", "Exceptions", probe.getExceptionCount());
-
-        CacheServiceMBean cacheService = probe.getCacheServiceMBean();
-
-        // Key Cache: Hits, Requests, RecentHitRate, SavePeriodInSeconds
-        outs.printf("%-17s: size %d (bytes), capacity %d (bytes), %d hits, %d requests, %.3f recent hit rate, %d save period in seconds%n",
-                    "Key Cache",
-                    cacheService.getKeyCacheSize(),
-                    cacheService.getKeyCacheCapacityInBytes(),
-                    cacheService.getKeyCacheHits(),
-                    cacheService.getKeyCacheRequests(),
-                    cacheService.getKeyCacheRecentHitRate(),
-                    cacheService.getKeyCacheSavePeriodInSeconds());
-
-        // Row Cache: Hits, Requests, RecentHitRate, SavePeriodInSeconds
-        outs.printf("%-17s: size %d (bytes), capacity %d (bytes), %d hits, %d requests, %.3f recent hit rate, %d save period in seconds%n",
-                    "Row Cache",
-                    cacheService.getRowCacheSize(),
-                    cacheService.getRowCacheCapacityInBytes(),
-                    cacheService.getRowCacheHits(),
-                    cacheService.getRowCacheRequests(),
-                    cacheService.getRowCacheRecentHitRate(),
-                    cacheService.getRowCacheSavePeriodInSeconds());
-
-        if (toks.size() > 1 && cmd.hasOption(TOKENS_OPT.left))
-        {
-            for (String tok : toks)
-                outs.printf("%-17s: %s%n", "Token", tok);
-        }
-    }
-
-    public void printReleaseVersion(PrintStream outs)
-    {
-        outs.println("ReleaseVersion: " + probe.getReleaseVersion());
-    }
-
-    public void printNetworkStats(final InetAddress addr, PrintStream outs)
-    {
-        outs.printf("Mode: %s%n", probe.getOperationMode());
-        Set<StreamState> statuses = probe.getStreamStatus();
-        if (statuses.isEmpty())
-            outs.println("Not sending any streams.");
-        for (StreamState status : statuses)
-        {
-            outs.printf("%s %s%n", status.description, status.planId.toString());
-            for (SessionInfo info : status.sessions)
-            {
-                outs.printf("    %s%n", info.peer.toString());
-                if (!info.receivingSummaries.isEmpty())
-                {
-                    outs.printf("        Receiving %d files, %d bytes total%n", info.getTotalFilesToReceive(), info.getTotalSizeToReceive());
-                    for (ProgressInfo progress : info.getReceivingFiles())
-                    {
-                        outs.printf("            %s%n", progress.toString());
-                    }
-                }
-                if (!info.sendingSummaries.isEmpty())
-                {
-                    outs.printf("        Sending %d files, %d bytes total%n", info.getTotalFilesToSend(), info.getTotalSizeToSend());
-                    for (ProgressInfo progress : info.getSendingFiles())
-                    {
-                        outs.printf("            %s%n", progress.toString());
-                    }
-                }
-            }
-        }
-
-        outs.printf("Read Repair Statistics:%nAttempted: %d%nMismatch (Blocking): %d%nMismatch (Background): %d%n", probe.getReadRepairAttempted(), probe.getReadRepairRepairedBlocking(), probe.getReadRepairRepairedBackground());
-
-        MessagingServiceMBean ms = probe.msProxy;
-        outs.printf("%-25s", "Pool Name");
-        outs.printf("%10s", "Active");
-        outs.printf("%10s", "Pending");
-        outs.printf("%15s%n", "Completed");
-
-        int pending;
-        long completed;
-
-        pending = 0;
-        for (int n : ms.getCommandPendingTasks().values())
-            pending += n;
-        completed = 0;
-        for (long n : ms.getCommandCompletedTasks().values())
-            completed += n;
-        outs.printf("%-25s%10s%10s%15s%n", "Commands", "n/a", pending, completed);
-
-        pending = 0;
-        for (int n : ms.getResponsePendingTasks().values())
-            pending += n;
-        completed = 0;
-        for (long n : ms.getResponseCompletedTasks().values())
-            completed += n;
-        outs.printf("%-25s%10s%10s%15s%n", "Responses", "n/a", pending, completed);
-    }
-
-    public void printCompactionStats(PrintStream outs)
-    {
-        int compactionThroughput = probe.getCompactionThroughput();
-        CompactionManagerMBean cm = probe.getCompactionManagerProxy();
-        outs.println("pending tasks: " + cm.getPendingTasks());
-        if (cm.getCompactions().size() > 0)
-            outs.printf("%25s%16s%16s%16s%16s%10s%10s%n", "compaction type", "keyspace", "table", "completed", "total", "unit", "progress");
-        long remainingBytes = 0;
-        for (Map<String, String> c : cm.getCompactions())
-        {
-            String percentComplete = new Long(c.get("total")) == 0
-                                   ? "n/a"
-                                   : new DecimalFormat("0.00").format((double) new Long(c.get("completed")) / new Long(c.get("total")) * 100) + "%";
-            outs.printf("%25s%16s%16s%16s%16s%10s%10s%n", c.get("taskType"), c.get("keyspace"), c.get("columnfamily"), c.get("completed"), c.get("total"), c.get("unit"), percentComplete);
-            if (c.get("taskType").equals(OperationType.COMPACTION.toString()))
-                remainingBytes += (new Long(c.get("total")) - new Long(c.get("completed")));
-        }
-        long remainingTimeInSecs = compactionThroughput == 0 || remainingBytes == 0
-                        ? -1
-                        : (remainingBytes) / (long) (1024L * 1024L * compactionThroughput);
-        String remainingTime = remainingTimeInSecs < 0
-                        ? "n/a"
-                        : String.format("%dh%02dm%02ds", remainingTimeInSecs / 3600, (remainingTimeInSecs % 3600) / 60, (remainingTimeInSecs % 60));
-
-        outs.printf("%25s%10s%n", "Active compaction remaining time : ", remainingTime);
-    }
-
-    /**
-     * Print the compaction threshold
-     *
-     * @param outs the stream to write to
-     */
-    public void printCompactionThreshold(PrintStream outs, String ks, String cf)
-    {
-        ColumnFamilyStoreMBean cfsProxy = probe.getCfsProxy(ks, cf);
-        outs.println("Current compaction thresholds for " + ks + "/" + cf + ": \n" +
-                     " min = " + cfsProxy.getMinimumCompactionThreshold() + ", " +
-                     " max = " + cfsProxy.getMaximumCompactionThreshold());
-    }
-
-    /**
-     * Print the compaction throughput
-     *
-     * @param outs the stream to write to
-     */
-    public void printCompactionThroughput(PrintStream outs)
-    {
-        outs.println("Current compaction throughput: " + probe.getCompactionThroughput() + " MB/s");
-    }
-
-    /**
-     * Print the stream throughput
-     *
-     * @param outs the stream to write to
-     */
-    public void printStreamThroughput(PrintStream outs)
-    {
-        outs.println("Current stream throughput: " + probe.getStreamThroughput() + " Mb/s");
-    }
-
-    /**
-     * Print the name, snitch, partitioner and schema version(s) of a cluster
-     *
-     * @param outs Output stream
-     * @param host Server address
-     */
-    public void printClusterDescription(PrintStream outs, String host)
-    {
-        // display cluster name, snitch and partitioner
-        outs.println("Cluster Information:");
-        outs.println("\tName: " + probe.getClusterName());
-        outs.println("\tSnitch: " + probe.getEndpointSnitchInfoProxy().getSnitchName());
-        outs.println("\tPartitioner: " + probe.getPartitioner());
-
-        // display schema version for each node
-        outs.println("\tSchema versions:");
-        Map<String, List<String>> schemaVersions = probe.getSpProxy().getSchemaVersions();
-        for (String version : schemaVersions.keySet())
-        {
-            outs.println(String.format("\t\t%s: %s%n", version, schemaVersions.get(version)));
-        }
-    }
-
-    public void printColumnFamilyStats(PrintStream outs, boolean ignoreMode, String [] filterList)
-    {
-        OptionFilter filter = new OptionFilter(ignoreMode, filterList);
-        Map <String, List <ColumnFamilyStoreMBean>> cfstoreMap = new HashMap <String, List <ColumnFamilyStoreMBean>>();
-
-        // get a list of column family stores
-        Iterator<Map.Entry<String, ColumnFamilyStoreMBean>> cfamilies = probe.getColumnFamilyStoreMBeanProxies();
-
-        while (cfamilies.hasNext())
-        {
-            Entry<String, ColumnFamilyStoreMBean> entry = cfamilies.next();
-            String keyspaceName = entry.getKey();
-            ColumnFamilyStoreMBean cfsProxy = entry.getValue();
-
-            if (!cfstoreMap.containsKey(keyspaceName) && filter.isColumnFamilyIncluded(entry.getKey(), cfsProxy.getColumnFamilyName()))
-            {
-                List<ColumnFamilyStoreMBean> columnFamilies = new ArrayList<ColumnFamilyStoreMBean>();
-                columnFamilies.add(cfsProxy);
-                cfstoreMap.put(keyspaceName, columnFamilies);
-            }
-            else if (filter.isColumnFamilyIncluded(entry.getKey(), cfsProxy.getColumnFamilyName()))
-            {
-                cfstoreMap.get(keyspaceName).add(cfsProxy);
-            }
-        }
-
-        // make sure all specified kss and cfs exist
-        filter.verifyKeyspaces(probe.getKeyspaces());
-        filter.verifyColumnFamilies();
-
-        // print out the table statistics
-        for (Entry<String, List<ColumnFamilyStoreMBean>> entry : cfstoreMap.entrySet())
-        {
-            String keyspaceName = entry.getKey();
-            List<ColumnFamilyStoreMBean> columnFamilies = entry.getValue();
-            long keyspaceReadCount = 0;
-            long keyspaceWriteCount = 0;
-            int keyspacePendingTasks = 0;
-            double keyspaceTotalReadTime = 0.0f;
-            double keyspaceTotalWriteTime = 0.0f;
-
-            outs.println("Keyspace: " + keyspaceName);
-            for (ColumnFamilyStoreMBean cfstore : columnFamilies)
-            {
-                long writeCount = cfstore.getWriteCount();
-                long readCount = cfstore.getReadCount();
-
-                if (readCount > 0)
-                {
-                    keyspaceReadCount += readCount;
-                    keyspaceTotalReadTime += cfstore.getTotalReadLatencyMicros();
-                }
-                if (writeCount > 0)
-                {
-                    keyspaceWriteCount += writeCount;
-                    keyspaceTotalWriteTime += cfstore.getTotalWriteLatencyMicros();
-                }
-                keyspacePendingTasks += cfstore.getPendingTasks();
-            }
-
-            double keyspaceReadLatency = keyspaceReadCount > 0 ? keyspaceTotalReadTime / keyspaceReadCount / 1000 : Double.NaN;
-            double keyspaceWriteLatency = keyspaceWriteCount > 0 ? keyspaceTotalWriteTime / keyspaceWriteCount / 1000 : Double.NaN;
-
-            outs.println("\tRead Count: " + keyspaceReadCount);
-            outs.println("\tRead Latency: " + String.format("%s", keyspaceReadLatency) + " ms.");
-            outs.println("\tWrite Count: " + keyspaceWriteCount);
-            outs.println("\tWrite Latency: " + String.format("%s", keyspaceWriteLatency) + " ms.");
-            outs.println("\tPending Tasks: " + keyspacePendingTasks);
-
-            // print out column family statistics for this keyspace
-            for (ColumnFamilyStoreMBean cfstore : columnFamilies)
-            {
-                String cfName = cfstore.getColumnFamilyName();
-                if(cfName.contains("."))
-                    outs.println("\t\tTable (index): " + cfName);
-                else
-                    outs.println("\t\tTable: " + cfName);
-
-                outs.println("\t\tSSTable count: " + cfstore.getLiveSSTableCount());
-                int[] leveledSStables = cfstore.getSSTableCountPerLevel();
-                if (leveledSStables != null)
-                {
-                    outs.print("\t\tSSTables in each level: [");
-                    for (int level = 0; level < leveledSStables.length; level++)
-                    {
-                        int count = leveledSStables[level];
-                        outs.print(count);
-                        long maxCount = 4L; // for L0
-                        if (level > 0)
-                            maxCount = (long) Math.pow(10, level);
-                        //  show max threshold for level when exceeded
-                        if (count > maxCount)
-                            outs.print("/" + maxCount);
-
-                        if (level < leveledSStables.length - 1)
-                            outs.print(", ");
-                        else
-                            outs.println("]");
-                    }
-                }
-                outs.println("\t\tSpace used (live), bytes: " + cfstore.getLiveDiskSpaceUsed());
-                outs.println("\t\tSpace used (total), bytes: " + cfstore.getTotalDiskSpaceUsed());
-                outs.println("\t\tSSTable Compression Ratio: " + cfstore.getCompressionRatio());
-                outs.println("\t\tNumber of keys (estimate): " + cfstore.estimateKeys());
-                outs.println("\t\tMemtable cell count: " + cfstore.getMemtableColumnsCount());
-                outs.println("\t\tMemtable data size, bytes: " + cfstore.getMemtableDataSize());
-                outs.println("\t\tMemtable switch count: " + cfstore.getMemtableSwitchCount());
-                outs.println("\t\tLocal read count: " + cfstore.getReadCount());
-                outs.printf("\t\tLocal read latency: %01.3f ms%n", cfstore.getRecentReadLatencyMicros() / 1000);
-                outs.println("\t\tLocal write count: " + cfstore.getWriteCount());
-                outs.printf("\t\tLocal write latency: %01.3f ms%n", cfstore.getRecentWriteLatencyMicros() / 1000);
-                outs.println("\t\tPending tasks: " + cfstore.getPendingTasks());
-                outs.println("\t\tBloom filter false positives: " + cfstore.getBloomFilterFalsePositives());
-                outs.println("\t\tBloom filter false ratio: " + String.format("%01.5f", cfstore.getRecentBloomFilterFalseRatio()));
-                outs.println("\t\tBloom filter space used, bytes: " + cfstore.getBloomFilterDiskSpaceUsed());
-                outs.println("\t\tCompacted partition minimum bytes: " + cfstore.getMinRowSize());
-                outs.println("\t\tCompacted partition maximum bytes: " + cfstore.getMaxRowSize());
-                outs.println("\t\tCompacted partition mean bytes: " + cfstore.getMeanRowSize());
-                outs.println("\t\tAverage live cells per slice (last five minutes): " + cfstore.getLiveCellsPerSlice());
-                outs.println("\t\tAverage tombstones per slice (last five minutes): " + cfstore.getTombstonesPerSlice());
-
-                outs.println("");
-            }
-            outs.println("----------------");
-        }
-    }
-
-    public void printRemovalStatus(PrintStream outs)
-    {
-        outs.println("RemovalStatus: " + probe.getRemovalStatus());
-    }
-
-    /**
-     * Returns a pair of the min and max indexes we actually have histogram data for.
-     * If there's no data, -1 will be returned for the min and max.
-     */
-    private Pair<Integer, Integer> getDataBounds(long[] data)
-    {
-        int lowestIndex = -1;
-        int highestIndex = -1;
-        for (int i = 0; i < data.length; i++)
-        {
-            if (data[i] > 0)
-            {
-                highestIndex = i;
-                if (lowestIndex == -1)
-                    lowestIndex = i;
-            }
-        }
-        return Pair.create(lowestIndex, highestIndex);
-    }
-
-    private void printHistogram(long[] data, long[] offsets, String unit, PrintStream output)
-    {
-        Pair<Integer, Integer> bounds = getDataBounds(data);
-        if (bounds.left == -1)
-        {
-            output.println("No Data");
-        }
-        else
-        {
-            long maxValue = -1;
-            for (int i = bounds.left; i <= bounds.right; i++)
-                maxValue = Math.max(maxValue, offsets[i]);
-
-            String format = "%" + new Long(maxValue).toString().length() + "d %s: %d";
-            for (int i = bounds.left; i <= bounds.right; i++)
-                output.println(String.format(format, offsets[i], unit, data[i]));
-        }
-        output.println("");
-    }
-
-    private void printCfHistograms(String keySpace, String columnFamily, PrintStream output, boolean compactFormat)
-    {
-        ColumnFamilyStoreMBean store = this.probe.getCfsProxy(keySpace, columnFamily);
-
-        // default is 90 offsets
-        long[] offsets = new EstimatedHistogram().getBucketOffsets();
-
-        long[] rrlh = store.getRecentReadLatencyHistogramMicros();
-        long[] rwlh = store.getRecentWriteLatencyHistogramMicros();
-        long[] sprh = store.getRecentSSTablesPerReadHistogram();
-        long[] ersh = store.getEstimatedRowSizeHistogram();
-        long[] ecch = store.getEstimatedColumnCountHistogram();
-
-        output.println(String.format("%s/%s histograms", keySpace, columnFamily));
-        output.println("");
-
-        if (compactFormat)
-        {
-            output.println(String.format("%-10s%10s%18s%18s%18s%18s",
-                    "Offset", "SSTables", "Write Latency", "Read Latency", "Partition Size", "Cell Count"));
-            output.println(String.format("%-10s%10s%18s%18s%18s%18s",
-                    "", "", "(micros)", "(micros)", "(bytes)", ""));
-
-            for (int i = 0; i < offsets.length; i++)
-            {
-                output.println(String.format("%-10d%10s%18s%18s%18s%18s",
-                        offsets[i],
-                        (i < sprh.length ? sprh[i] : "0"),
-                        (i < rwlh.length ? rwlh[i] : "0"),
-                        (i < rrlh.length ? rrlh[i] : "0"),
-                        (i < ersh.length ? ersh[i] : "0"),
-                        (i < ecch.length ? ecch[i] : "0")));
-            }
-        }
-        else
-        {
-            output.println("SSTables per Read");
-            printHistogram(sprh, offsets, "sstables", output);
-
-            output.println("Write Latency (microseconds)");
-            printHistogram(rwlh, offsets, "us", output);
-
-            output.println("Read Latency (microseconds)");
-            printHistogram(rrlh, offsets, "us", output);
-
-            output.println("Partition Size (bytes)");
-            printHistogram(ersh, offsets, "bytes", output);
-
-            output.println("Cell Count per Partition");
-            printHistogram(ecch, offsets, "cells", output);
-        }
-    }
-
-    private void printProxyHistograms(PrintStream output, boolean compactFormat)
-    {
-        StorageProxyMBean sp = this.probe.getSpProxy();
-        long[] offsets = new EstimatedHistogram().getBucketOffsets();
-        long[] rrlh = sp.getRecentReadLatencyHistogramMicros();
-        long[] rwlh = sp.getRecentWriteLatencyHistogramMicros();
-        long[] rrnglh = sp.getRecentRangeLatencyHistogramMicros();
-
-        output.println("proxy histograms");
-        output.println("");
-
-        if (compactFormat)
-        {
-            output.println(String.format("%-10s%18s%18s%18s",
-                    "Offset", "Read Latency", "Write Latency", "Range Latency"));
-            for (int i = 0; i < offsets.length; i++)
-            {
-                output.println(String.format("%-10d%18s%18s%18s",
-                        offsets[i],
-                        (i < rrlh.length ? rrlh[i] : "0"),
-                        (i < rwlh.length ? rwlh[i] : "0"),
-                        (i < rrnglh.length ? rrnglh[i] : "0")));
-            }
-        }
-        else
-        {
-            output.println("Read Latency (microseconds)");
-            printHistogram(rrlh, offsets, "us", output);
-
-            output.println("Write Latency (microseconds)");
-            printHistogram(rwlh, offsets, "us", output);
-
-            output.println("Range Latency (microseconds)");
-            printHistogram(rrnglh, offsets, "us", output);
-        }
-    }
-
-    private void printEndPoints(String keySpace, String cf, String key, PrintStream output)
-    {
-        List<InetAddress> endpoints = this.probe.getEndpoints(keySpace, cf, key);
-
-        for (InetAddress anEndpoint : endpoints)
-        {
-           output.println(anEndpoint.getHostAddress());
-        }
-    }
-
-    private void printSSTables(String keyspace, String cf, String key, PrintStream output)
-    {
-        List<String> sstables = this.probe.getSSTables(keyspace, cf, key);
-        for (String sstable : sstables)
-        {
-            output.println(sstable);
-        }
-    }
-
-    private void printIsNativeTransportRunning(PrintStream outs)
-    {
-        outs.println(probe.isNativeTransportRunning() ? "running" : "not running");
-    }
-
-    private void printIsThriftServerRunning(PrintStream outs)
-    {
-        outs.println(probe.isThriftServerRunning() ? "running" : "not running");
-    }
-
-    public static void main(String[] args) throws IOException, InterruptedException, ParseException
-    {
-        CommandLineParser parser = new PosixParser();
-        ToolCommandLine cmd = null;
-
-        try
-        {
-            cmd = new ToolCommandLine(parser.parse(options, args));
-        }
-        catch (ParseException p)
-        {
-            badUse(p.getMessage());
-        }
-
-        String host = cmd.hasOption(HOST_OPT.left) ? cmd.getOptionValue(HOST_OPT.left) : DEFAULT_HOST;
-
-        int port = DEFAULT_PORT;
-
-        String portNum = cmd.getOptionValue(PORT_OPT.left);
-        if (portNum != null)
-        {
-            try
-            {
-                port = Integer.parseInt(portNum);
-            }
-            catch (NumberFormatException e)
-            {
-                throw new ParseException("Port must be a number");
-            }
-        }
-
-        NodeCommand command = null;
-
-        try
-        {
-            command = cmd.getCommand();
-        }
-        catch (IllegalArgumentException e)
-        {
-            badUse(e.getMessage());
-        }
-
-        if(NodeCommand.HELP.equals(command))
-        {
-            printUsage();
-            System.exit(0);
-        }
-
-        NodeProbe probe = null;
-
-        try
-        {
-            String username = cmd.getOptionValue(USERNAME_OPT.left);
-            String password = cmd.getOptionValue(PASSWORD_OPT.left);
-
-            try
-            {
-                probe = username == null ? new NodeProbe(host, port) : new NodeProbe(host, port, username, password);
-            }
-            catch (IOException ioe)
-            {
-                Throwable inner = findInnermostThrowable(ioe);
-                if (inner instanceof ConnectException)
-                {
-                    System.err.printf("Failed to connect to '%s:%d': %s%n", host, port, inner.getMessage());
-                    System.exit(1);
-                }
-                else if (inner instanceof UnknownHostException)
-                {
-                    System.err.printf("Cannot resolve '%s': unknown host%n", host);
-                    System.exit(1);
-                }
-                else
-                {
-                    err(ioe, "Error connecting to remote JMX agent!");
-                }
-            }
-
-            NodeCmd nodeCmd = new NodeCmd(probe);
-
-            //print history here after we've already determined we can reasonably call cassandra
-            printHistory(args, cmd);
-
-            // Execute the requested command.
-            String[] arguments = cmd.getCommandArguments();
-            String tag;
-            String columnFamilyName = null;
-
-            switch (command)
-            {
-                case RING :
-                    boolean resolveIp = cmd.hasOption(RESOLVE_IP.left);
-                    if (arguments.length > 0) { nodeCmd.printRing(System.out, arguments[0], resolveIp); }
-                    else                      { nodeCmd.printRing(System.out, null, resolveIp); };
-                    break;
-
-                case INFO            : nodeCmd.printInfo(System.out, cmd); break;
-                case CFSTATS         :
-                    boolean ignoreMode = cmd.hasOption(CFSTATS_IGNORE_OPT.left);
-                    if (arguments.length > 0) { nodeCmd.printColumnFamilyStats(System.out, ignoreMode, arguments); }
-                    else                      { nodeCmd.printColumnFamilyStats(System.out, false, null); }
-                    break;
-                case TPSTATS         : nodeCmd.printThreadPoolStats(System.out); break;
-                case VERSION         : nodeCmd.printReleaseVersion(System.out); break;
-                case COMPACTIONSTATS : nodeCmd.printCompactionStats(System.out); break;
-                case COMPACTIONHISTORY:nodeCmd.printCompactionHistory(System.out); break;
-                case DESCRIBECLUSTER : nodeCmd.printClusterDescription(System.out, host); break;
-                case DISABLEBINARY   : probe.stopNativeTransport(); break;
-                case ENABLEBINARY    : probe.startNativeTransport(); break;
-                case STATUSBINARY    : nodeCmd.printIsNativeTransportRunning(System.out); break;
-                case DISABLEGOSSIP   : probe.stopGossiping(); break;
-                case ENABLEGOSSIP    : probe.startGossiping(); break;
-                case DISABLEHANDOFF  : probe.disableHintedHandoff(); break;
-                case ENABLEHANDOFF   :
-                    if (arguments.length > 0) { probe.enableHintedHandoff(arguments[0]); }
-                    else                      { probe.enableHintedHandoff(); }
-                    break;
-                case PAUSEHANDOFF    : probe.pauseHintsDelivery(); break;
-                case RESUMEHANDOFF   : probe.resumeHintsDelivery(); break;
-                case DISABLETHRIFT   : probe.stopThriftServer(); break;
-                case ENABLETHRIFT    : probe.startThriftServer(); break;
-                case STATUSTHRIFT    : nodeCmd.printIsThriftServerRunning(System.out); break;
-                case RESETLOCALSCHEMA: probe.resetLocalSchema(); break;
-                case ENABLEBACKUP    : probe.setIncrementalBackupsEnabled(true); break;
-                case DISABLEBACKUP   : probe.setIncrementalBackupsEnabled(false); break;
-
-                case SETHINTEDHANDOFFTHROTTLEKB:
-                    if (arguments.length != 1) { badUse("Missing argument for hinted handoff throttle."); }
-                    probe.setHintedHandoffThrottleInKB(Integer.parseInt(arguments[0]));
-                    break;
-
-                case TRUNCATEHINTS:
-                    if (arguments.length > 1) badUse("Too many arguments.");
-                    else if (arguments.length == 1) probe.truncateHints(arguments[0]);
-                    else probe.truncateHints();
-                    break;
-
-                case STATUS :
-                    resolveIp = cmd.hasOption(RESOLVE_IP.left);
-                    if (arguments.length > 0) nodeCmd.printClusterStatus(System.out, arguments[0], resolveIp);
-                    else                      nodeCmd.printClusterStatus(System.out, null, resolveIp);
-                    break;
-
-                case DECOMMISSION :
-                    if (arguments.length > 0)
-                    {
-                        System.err.println("Decommission will decommission the node you are connected to and does not take arguments!");
-                        System.exit(1);
-                    }
-                    probe.decommission();
-                    break;
-
-                case DRAIN :
-                    try { probe.drain(); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred during flushing"); }
-                    break;
-
-                case NETSTATS :
-                    if (arguments.length > 0) { nodeCmd.printNetworkStats(InetAddress.getByName(arguments[0]), System.out); }
-                    else                      { nodeCmd.printNetworkStats(null, System.out); }
-                    break;
-
-                case SNAPSHOT :
-                    columnFamilyName = cmd.getOptionValue(SNAPSHOT_COLUMNFAMILY_OPT.left);
-                    /* FALL THRU */
-                case CLEARSNAPSHOT :
-                    tag = cmd.getOptionValue(TAG_OPT.left);
-                    handleSnapshots(command, tag, arguments, columnFamilyName, probe);
-                    break;
-
-                case MOVE :
-                    if (arguments.length != 1) { badUse("Missing token argument for move."); }
-                    try
-                    {
-                        probe.move(arguments[0]);
-                    }
-                    catch (UnsupportedOperationException uoerror)
-                    {
-                        System.err.println(uoerror.getMessage());
-                        System.exit(1);
-                    }
-                    break;
-
-                case JOIN:
-                    if (probe.isJoined())
-                    {
-                        System.err.println("This node has already joined the ring.");
-                        System.exit(1);
-                    }
-
-                    probe.joinRing();
-                    break;
-
-                case SETCOMPACTIONTHROUGHPUT :
-                    if (arguments.length != 1) { badUse("Missing value argument."); }
-                    probe.setCompactionThroughput(Integer.parseInt(arguments[0]));
-                    break;
-
-                case SETSTREAMTHROUGHPUT :
-                    if (arguments.length != 1) { badUse("Missing value argument."); }
-                    probe.setStreamThroughput(Integer.parseInt(arguments[0]));
-                    break;
-
-                case SETTRACEPROBABILITY :
-                    if (arguments.length != 1) { badUse("Missing value argument."); }
-                    probe.setTraceProbability(Double.parseDouble(arguments[0]));
-                    break;
-
-                case REBUILD :
-                    if (arguments.length > 1) { badUse("Too many arguments."); }
-                    probe.rebuild(arguments.length == 1 ? arguments[0] : null);
-                    break;
-
-                case REMOVETOKEN :
-                    System.err.println("Warn: removetoken is deprecated, please use removenode instead");
-                case REMOVENODE  :
-                    if (arguments.length != 1) { badUse("Missing an argument for removenode (either status, force, or an ID)"); }
-                    else if (arguments[0].equals("status")) { nodeCmd.printRemovalStatus(System.out); }
-                    else if (arguments[0].equals("force"))  { nodeCmd.printRemovalStatus(System.out); probe.forceRemoveCompletion(); }
-                    else                                    { probe.removeNode(arguments[0]); }
-                    break;
-
-                case INVALIDATEKEYCACHE :
-                    probe.invalidateKeyCache();
-                    break;
-
-                case INVALIDATEROWCACHE :
-                    probe.invalidateRowCache();
-                    break;
-
-                case CLEANUP :
-                case COMPACT :
-                case REPAIR  :
-                case FLUSH   :
-                case SCRUB   :
-                case UPGRADESSTABLES   :
-                case DISABLEAUTOCOMPACTION:
-                case ENABLEAUTOCOMPACTION:
-                    optionalKSandCFs(command, cmd, arguments, probe);
-                    break;
-
-                case GETCOMPACTIONTHRESHOLD :
-                    if (arguments.length != 2) { badUse("getcompactionthreshold requires ks and cf args."); }
-                    nodeCmd.printCompactionThreshold(System.out, arguments[0], arguments[1]);
-                    break;
-
-                case GETCOMPACTIONTHROUGHPUT : nodeCmd.printCompactionThroughput(System.out); break;
-                case GETSTREAMTHROUGHPUT : nodeCmd.printStreamThroughput(System.out); break;
-
-                case CFHISTOGRAMS :
-                    if (arguments.length != 2) { badUse("cfhistograms requires ks and cf args"); }
-                    nodeCmd.printCfHistograms(arguments[0], arguments[1], System.out, cmd.hasOption(COMPACT_OPT.left));
-                    break;
-
-                case SETCACHECAPACITY :
-                    if (arguments.length != 2) { badUse("setcachecapacity requires key-cache-capacity, and row-cache-capacity args."); }
-                    probe.setCacheCapacities(Integer.parseInt(arguments[0]), Integer.parseInt(arguments[1]));
-                    break;
-
-                case SETCACHEKEYSTOSAVE :
-                    if (arguments.length != 2) { badUse("setcachekeystosave requires key-cache-keys-to-save, and row-cache-keys-to-save args."); }
-                    probe.setCacheKeysToSave(Integer.parseInt(arguments[0]), Integer.parseInt(arguments[1]));
-                    break;
-
-                case SETCOMPACTIONTHRESHOLD :
-                    if (arguments.length != 4) { badUse("setcompactionthreshold requires ks, cf, min, and max threshold args."); }
-                    int minthreshold = Integer.parseInt(arguments[2]);
-                    int maxthreshold = Integer.parseInt(arguments[3]);
-                    if ((minthreshold < 0) || (maxthreshold < 0)) { badUse("Thresholds must be positive integers"); }
-                    if (minthreshold > maxthreshold)              { badUse("Min threshold cannot be greater than max."); }
-                    if (minthreshold < 2 && maxthreshold != 0)    { badUse("Min threshold must be at least 2"); }
-                    probe.setCompactionThreshold(arguments[0], arguments[1], minthreshold, maxthreshold);
-                    break;
-                case GETENDPOINTS :
-                    if (arguments.length != 3) { badUse("getendpoints requires ks, cf and key args"); }
-                    nodeCmd.printEndPoints(arguments[0], arguments[1], arguments[2], System.out);
-                    break;
-
-                case PROXYHISTOGRAMS :
-                    if (arguments.length != 0) { badUse("proxyhistograms does not take arguments"); }
-                    nodeCmd.printProxyHistograms(System.out, cmd.hasOption(COMPACT_OPT.left));
-                    break;
-
-                case GETSSTABLES:
-                    if (arguments.length != 3) { badUse("getsstables requires ks, cf and key args"); }
-                    nodeCmd.printSSTables(arguments[0], arguments[1], arguments[2], System.out);
-                    break;
-
-                case REFRESH:
-                    if (arguments.length != 2) { badUse("load_new_sstables requires ks and cf args"); }
-                    probe.loadNewSSTables(arguments[0], arguments[1]);
-                    break;
-
-                case REBUILD_INDEX:
-                    if (arguments.length <= 2) { badUse("rebuild_index requires ks, cf and idx args"); }
-                        probe.rebuildIndex(arguments[0], arguments[1], arguments[2].split(","));
-
-                    break;
-
-                case GOSSIPINFO : nodeCmd.printGossipInfo(System.out); break;
-
-                case STOP:
-                    if (arguments.length != 1) { badUse("stop requires a type."); }
-                    probe.stop(arguments[0].toUpperCase());
-                    break;
-
-                case STOPDAEMON:
-                    if (arguments.length != 0) { badUse("stopdaemon does not take arguments."); }
-                    try { probe.stopCassandraDaemon(); }
-                    catch (Throwable t) { System.out.println("Cassandra has shut down.\n"); }
-                    break;
-
-                case DESCRIBERING :
-                    if (arguments.length != 1) { badUse("Missing keyspace argument for describering."); }
-                    nodeCmd.printDescribeRing(arguments[0], System.out);
-                    break;
-
-                case RANGEKEYSAMPLE :
-                    nodeCmd.printRangeKeySample(System.out);
-                    break;
-
-                case RELOADTRIGGERS :
-                    probe.reloadTriggers();
-                    break;
-
-                case SETLOGGINGLEVEL:
-                    String classQualifier = EMPTY;
-                    String level = EMPTY;
-                    if (arguments.length >= 1)
-                        classQualifier = arguments[0];
-                    if (arguments.length == 2)
-                        level = arguments[1];
-                    probe.setLoggingLevel(classQualifier, level);
-                    break;
-
-                case GETLOGGINGLEVELS :
-                    nodeCmd.getLoggingLevels(System.out);
-                    break;
-
-                default :
-                    throw new RuntimeException("Unreachable code.");
-            }
-        }
-        finally
-        {
-            if (probe != null)
-            {
-                try
-                {
-                    probe.close();
-                }
-                catch (IOException ex)
-                {
-                    // swallow the exception so the user will see the real one.
-                }
-            }
-        }
-        System.exit(probe.isFailed() ? 1 : 0);
-    }
-
-    private void getLoggingLevels(PrintStream out)
-    {
-        // what if someone set a very long logger name? 50 spaces may not be enough...
-        System.out.printf("%n%-50s%10s%n", "Logger Name", "Log Level");
-        for (Map.Entry<String, String> entry : this.probe.getLoggingLevels().entrySet())
-            System.out.printf("%-50s%10s%n", entry.getKey(), entry.getValue());
-    }
-
-    private void printCompactionHistory(PrintStream out)
-    {
-        out.println("Compaction History: ");
-
-        TabularData tabularData = this.probe.getCompactionHistory();
-        if (tabularData.isEmpty())
-        {
-            out.printf("There is no compaction history");
-            return;
-        }
-
-        String format = "%-41s%-19s%-29s%-26s%-15s%-15s%s%n";
-        List<String> indexNames = tabularData.getTabularType().getIndexNames();
-        out.printf(format, (Object[]) indexNames.toArray(new String[indexNames.size()]));
-
-        Set<?> values = tabularData.keySet();
-        for (Object eachValue : values)
-        {
-            List<?> value = (List<?>) eachValue;
-            out.printf(format, value.toArray(new Object[value.size()]));
-        }
-    }
-
-    private static void printHistory(String[] args, ToolCommandLine cmd)
-    {
-        //don't bother to print if no args passed (meaning, nodetool is just printing out the sub-commands list)
-        if (args.length == 0)
-            return;
-        String cmdLine = Joiner.on(" ").skipNulls().join(args);
-        final String password = cmd.getOptionValue(PASSWORD_OPT.left);
-        if (password != null)
-            cmdLine = cmdLine.replace(password, "<hidden>");
-
-        try (FileWriter writer = new FileWriter(new File(FBUtilities.getToolsOutputDirectory(), HISTORYFILE), true))
-        {
-            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
-            writer.append(sdf.format(new Date()) + ": " + cmdLine + "\n");
-        }
-        catch (IOException | IOError ioe)
-        {
-            //quietly ignore any errors about not being able to write out history
-        }
-    }
-
-    private static Throwable findInnermostThrowable(Throwable ex)
-    {
-        Throwable inner = ex.getCause();
-        return inner == null ? ex : findInnermostThrowable(inner);
-    }
-
-    private void printDescribeRing(String keyspaceName, PrintStream out)
-    {
-        out.println("Schema Version:" + probe.getSchemaVersion());
-        out.println("TokenRange: ");
-        try
-        {
-            for (String tokenRangeString : probe.describeRing(keyspaceName))
-            {
-                out.println("\t" + tokenRangeString);
-            }
-        }
-        catch (IOException e)
-        {
-            err(e, e.getMessage());
-        }
-    }
-
-    private void printRangeKeySample(PrintStream outs)
-    {
-        outs.println("RangeKeySample: ");
-        List<String> tokenStrings = this.probe.sampleKeyRange();
-        for (String tokenString : tokenStrings)
-        {
-            outs.println("\t" + tokenString);
-        }
-    }
-
-    private void printGossipInfo(PrintStream out) {
-        out.println(probe.getGossipInfo());
-    }
-
-    private static void badUse(String useStr)
-    {
-        System.err.println(useStr);
-        printUsage();
-        System.exit(1);
-    }
-
-    private static void err(Exception e, String errStr)
-    {
-        System.err.println(errStr);
-        e.printStackTrace();
-        System.exit(3);
-    }
-
-    private static void complainNonzeroArgs(String[] args, NodeCommand cmd)
-    {
-        if (args.length > 0) {
-            System.err.println("Too many arguments for command '"+cmd.toString()+"'.");
-            printUsage();
-            System.exit(1);
-        }
-    }
-
-    private static void handleSnapshots(NodeCommand nc, String tag, String[] cmdArgs, String columnFamily, NodeProbe probe) throws IOException
-    {
-        String[] keyspaces = Arrays.copyOfRange(cmdArgs, 0, cmdArgs.length);
-        System.out.print("Requested " + ((nc == NodeCommand.SNAPSHOT) ? "creating" : "clearing") + " snapshot for: ");
-        if ( keyspaces.length > 0 )
-        {
-          for (int i = 0; i < keyspaces.length; i++)
-              System.out.print(keyspaces[i] + " ");
-        }
-        else
-        {
-            System.out.print("all keyspaces ");
-        }
-
-        if (columnFamily != null)
-        {
-            System.out.print("and table: " + columnFamily);
-        }
-        System.out.println();
-
-        switch (nc)
-        {
-            case SNAPSHOT :
-                if (tag == null || tag.equals(""))
-                    tag = new Long(System.currentTimeMillis()).toString();
-                probe.takeSnapshot(tag, columnFamily, keyspaces);
-                System.out.println("Snapshot directory: " + tag);
-                break;
-            case CLEARSNAPSHOT :
-                probe.clearSnapshot(tag, keyspaces);
-                break;
-        }
-    }
-
-    private static void optionalKSandCFs(NodeCommand nc, ToolCommandLine cmd, String[] cmdArgs, NodeProbe probe) throws InterruptedException, IOException
-    {
-        // if there is one additional arg, it's the keyspace; more are columnfamilies
-        List<String> keyspaces = cmdArgs.length == 0 ? probe.getKeyspaces() : Arrays.asList(cmdArgs[0]);
-        for (String keyspace : keyspaces)
-        {
-            if (!probe.getKeyspaces().contains(keyspace))
-            {
-                System.err.println("Keyspace [" + keyspace + "] does not exist.");
-                System.exit(1);
-            }
-        }
-
-        // second loop so we're less likely to die halfway through due to invalid keyspace
-        for (String keyspace : keyspaces)
-        {
-            String[] columnFamilies = cmdArgs.length <= 1 ? new String[0] : Arrays.copyOfRange(cmdArgs, 1, cmdArgs.length);
-            switch (nc)
-            {
-                case REPAIR  :
-                    boolean sequential = !cmd.hasOption(PARALLEL_REPAIR_OPT.left);
-                    boolean localDC = cmd.hasOption(LOCAL_DC_REPAIR_OPT.left);
-                    boolean specificDC = cmd.hasOption(DC_REPAIR_OPT.left);
-                    boolean specificHosts = cmd.hasOption(HOST_REPAIR_OPT.left);
-                    boolean primaryRange = cmd.hasOption(PRIMARY_RANGE_OPT.left);
-                    Collection<String> dataCenters = null;
-                    Collection<String> hosts = null;
-
-                    if (primaryRange && (localDC || specificDC || specificHosts))
-                        throw new RuntimeException("Primary range repair should be performed on all nodes in the cluster.");
-
-                    if (specificDC)
-                        dataCenters = Arrays.asList(cmd.getOptionValue(DC_REPAIR_OPT.left).split(","));
-                    else if (localDC)
-                        dataCenters = Arrays.asList(probe.getDataCenter());
-                    else if(specificHosts)
-                        hosts  = Arrays.asList(cmd.getOptionValue(HOST_REPAIR_OPT.left).split(","));
-                    if (cmd.hasOption(START_TOKEN_OPT.left) || cmd.hasOption(END_TOKEN_OPT.left))
-                        probe.forceRepairRangeAsync(System.out, keyspace, sequential, dataCenters, hosts, cmd.getOptionValue(START_TOKEN_OPT.left), cmd.getOptionValue(END_TOKEN_OPT.left), columnFamilies);
-                    else
-                        probe.forceRepairAsync(System.out, keyspace, sequential, dataCenters, hosts, primaryRange, columnFamilies);
-                    break;
-                case FLUSH   :
-                    try { probe.forceKeyspaceFlush(keyspace, columnFamilies); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred during flushing"); }
-                    break;
-                case COMPACT :
-                    try { probe.forceKeyspaceCompaction(keyspace, columnFamilies); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred during compaction"); }
-                    break;
-                case CLEANUP :
-                    if (keyspace.equals(Keyspace.SYSTEM_KS)) { break; } // Skip cleanup on system cfs.
-                    try { probe.forceKeyspaceCleanup(keyspace, columnFamilies); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred during cleanup"); }
-                    break;
-                case SCRUB :
-                    boolean disableSnapshot = cmd.hasOption(NO_SNAPSHOT.left);
-                    boolean skipCorrupted = cmd.hasOption(SCRUB_SKIP_CORRUPTED_OPT.left);
-                    try { probe.scrub(disableSnapshot, skipCorrupted, keyspace, columnFamilies); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred while scrubbing keyspace " + keyspace); }
-                    break;
-                case UPGRADESSTABLES :
-                    boolean excludeCurrentVersion = !cmd.hasOption(UPGRADE_ALL_SSTABLE_OPT.left);
-                    try { probe.upgradeSSTables(keyspace, excludeCurrentVersion, columnFamilies); }
-                    catch (ExecutionException ee) { err(ee, "Error occurred while upgrading the sstables for keyspace " + keyspace); }
-                    break;
-                case ENABLEAUTOCOMPACTION:
-                    probe.enableAutoCompaction(keyspace, columnFamilies);
-                    break;
-                case DISABLEAUTOCOMPACTION:
-                    probe.disableAutoCompaction(keyspace, columnFamilies);
-                    break;
-                default:
-                    throw new RuntimeException("Unreachable code.");
-            }
-        }
-    }
-
-    /**
-     * Used for filtering keyspaces and columnfamilies to be displayed using the cfstats command.
-     */
-    private static class OptionFilter
-    {
-        private Map<String, List<String>> filter = new HashMap<String, List<String>>();
-        private Map<String, List<String>> verifier = new HashMap<String, List<String>>();
-        private String [] filterList;
-        private boolean ignoreMode;
-
-        public OptionFilter(boolean ignoreMode, String... filterList)
-        {
-            this.filterList = filterList;
-            this.ignoreMode = ignoreMode;
-
-            if(filterList == null)
-                return;
-
-            for(String s : filterList)
-            {
-                String [] keyValues = s.split("\\.", 2);
-
-                // build the map that stores the ks' and cfs to use
-                if(!filter.containsKey(keyValues[0]))
-                {
-                    filter.put(keyValues[0], new ArrayList<String>());
-                    verifier.put(keyValues[0], new ArrayList<String>());
-
-                    if(keyValues.length == 2)
-                    {
-                        filter.get(keyValues[0]).add(keyValues[1]);
-                        verifier.get(keyValues[0]).add(keyValues[1]);
-                    }
-                }
-                else
-                {
-                    if(keyValues.length == 2)
-                    {
-                        filter.get(keyValues[0]).add(keyValues[1]);
-                        verifier.get(keyValues[0]).add(keyValues[1]);
-                    }
-                }
-            }
-        }
-
-        public boolean isColumnFamilyIncluded(String keyspace, String columnFamily)
-        {
-            // supplying empty params list is treated as wanting to display all kss & cfs
-            if(filterList == null)
-                return !ignoreMode;
-
-            List<String> cfs = filter.get(keyspace);
-
-            // no such keyspace is in the map
-            if (cfs == null)
-                return ignoreMode;
-                // only a keyspace with no cfs was supplied
-                // so ignore or include (based on the flag) every column family in specified keyspace
-            else if (cfs.size() == 0)
-                return !ignoreMode;
-
-            // keyspace exists, and it contains specific cfs
-            verifier.get(keyspace).remove(columnFamily);
-            return ignoreMode ^ cfs.contains(columnFamily);
-        }
-
-        public void verifyKeyspaces(List<String> keyspaces)
-        {
-            for(String ks : verifier.keySet())
-                if(!keyspaces.contains(ks))
-                    throw new RuntimeException("Unknown keyspace: " + ks);
-        }
-
-        public void verifyColumnFamilies()
-        {
-            for(String ks : filter.keySet())
-                if(verifier.get(ks).size() > 0)
-                    throw new RuntimeException("Unknown column families: " + verifier.get(ks).toString() + " in keyspace: " + ks);
-        }
-    }
-
-    private static class ToolOptions extends Options
-    {
-        public void addOption(Pair<String, String> opts, boolean hasArgument, String description)
-        {
-            addOption(opts, hasArgument, description, false);
-        }
-
-        public void addOption(Pair<String, String> opts, boolean hasArgument, String description, boolean required)
-        {
-            addOption(opts.left, opts.right, hasArgument, description, required);
-        }
-
-        public void addOption(String opt, String longOpt, boolean hasArgument, String description, boolean required)
-        {
-            Option option = new Option(opt, longOpt, hasArgument, description);
-            option.setRequired(required);
-            addOption(option);
-        }
-    }
-
-    private static class ToolCommandLine
-    {
-        private final CommandLine commandLine;
-
-        public ToolCommandLine(CommandLine commands)
-        {
-            commandLine = commands;
-        }
-
-        public Option[] getOptions()
-        {
-            return commandLine.getOptions();
-        }
-
-        public boolean hasOption(String opt)
-        {
-            return commandLine.hasOption(opt);
-        }
-
-        public String getOptionValue(String opt)
-        {
-            return commandLine.getOptionValue(opt);
-        }
-
-        public NodeCommand getCommand()
-        {
-            if (commandLine.getArgs().length == 0)
-                throw new IllegalArgumentException("Command was not specified.");
-
-            String command = commandLine.getArgs()[0];
-
-            try
-            {
-                return NodeCommand.valueOf(command.toUpperCase());
-            }
-            catch (IllegalArgumentException e)
-            {
-                throw new IllegalArgumentException("Unrecognized command: " + command);
-            }
-        }
-
-        public String[] getCommandArguments()
-        {
-            List params = commandLine.getArgList();
-
-            if (params.size() < 2) // command parameters are empty
-                return new String[0];
-
-            String[] toReturn = new String[params.size() - 1];
-
-            for (int i = 1; i < params.size(); i++)
-            {
-                String parm = (String) params.get(i);
-                // why? look at CASSANDRA-4808
-                if (parm.startsWith("\\"))
-                    parm = parm.substring(1);
-                toReturn[i - 1] = parm;
-            }
-            return toReturn;
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java
index 78dce76..7faec72 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -43,6 +43,7 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 
+import com.yammer.metrics.reporting.JmxReporter;
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutorMBean;
 import org.apache.cassandra.db.ColumnFamilyStoreMBean;
 import org.apache.cassandra.db.HintedHandOffManager;
@@ -58,12 +59,13 @@
 import org.apache.cassandra.streaming.StreamState;
 import org.apache.cassandra.streaming.StreamManagerMBean;
 import org.apache.cassandra.streaming.management.StreamStateCompositeData;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * JMX client operations for Cassandra.
  */
-public class NodeProbe
+public class NodeProbe implements AutoCloseable
 {
     private static final String fmtUrl = "service:jmx:rmi:///jndi/rmi://[%s]:%d/jmxrmi";
     private static final String ssObjName = "org.apache.cassandra.db:type=StorageService";
@@ -78,6 +80,7 @@
     private CompactionManagerMBean compactionProxy;
     private StorageServiceMBean ssProxy;
     private MemoryMXBean memProxy;
+    private GCInspectorMXBean gcProxy;
     private RuntimeMXBean runtimeProxy;
     private StreamManagerMBean streamProxy;
     public MessagingServiceMBean msProxy;
@@ -168,7 +171,10 @@
             spProxy = JMX.newMBeanProxy(mbeanServerConn, name, StorageProxyMBean.class);
             name = new ObjectName(HintedHandOffManager.MBEAN_NAME);
             hhProxy = JMX.newMBeanProxy(mbeanServerConn, name, HintedHandOffManagerMBean.class);
-        } catch (MalformedObjectNameException e)
+            name = new ObjectName(GCInspector.MBEAN_NAME);
+            gcProxy = JMX.newMBeanProxy(mbeanServerConn, name, GCInspectorMXBean.class);
+        }
+        catch (MalformedObjectNameException e)
         {
             throw new RuntimeException(
                     "Invalid ObjectName? Please report this as a bug.", e);
@@ -185,21 +191,49 @@
         jmxc.close();
     }
 
-    public void forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
-        ssProxy.forceKeyspaceCleanup(keyspaceName, columnFamilies);
+        return ssProxy.forceKeyspaceCleanup(keyspaceName, columnFamilies);
     }
 
-    public void scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
-        ssProxy.scrub(disableSnapshot, skipCorrupted, keyspaceName, columnFamilies);
+        return ssProxy.scrub(disableSnapshot, skipCorrupted, keyspaceName, columnFamilies);
     }
 
-    public void upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
-        ssProxy.upgradeSSTables(keyspaceName, excludeCurrentVersion, columnFamilies);
+        return ssProxy.upgradeSSTables(keyspaceName, excludeCurrentVersion, columnFamilies);
     }
 
+    public void forceKeyspaceCleanup(PrintStream out, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    {
+        if (forceKeyspaceCleanup(keyspaceName, columnFamilies) != 0)
+        {
+            failed = true;
+            out.println("Aborted cleaning up atleast one column family in keyspace "+keyspaceName+", check server logs for more information.");
+        }
+    }
+
+    public void scrub(PrintStream out, boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    {
+        if (scrub(disableSnapshot, skipCorrupted, keyspaceName, columnFamilies) != 0)
+        {
+            failed = true;
+            out.println("Aborted scrubbing atleast one column family in keyspace "+keyspaceName+", check server logs for more information.");
+        }
+    }
+
+    public void upgradeSSTables(PrintStream out, String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    {
+        if (upgradeSSTables(keyspaceName, excludeCurrentVersion, columnFamilies) != 0)
+        {
+            failed = true;
+            out.println("Aborted upgrading sstables for atleast one column family in keyspace "+keyspaceName+", check server logs for more information.");
+        }
+    }
+
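The three wrappers above turn the int status codes returned by the underlying JMX operations into a printed warning plus the probe-wide failed flag. A hedged caller-side sketch follows; it is editorial rather than part of this patch, and the host, port, and keyspace name are placeholder assumptions.

// Editorial sketch, not part of this patch: a minimal standalone caller for the
// new status-reporting cleanup wrapper. "127.0.0.1", 7199 and "my_keyspace" are
// placeholder assumptions.
import org.apache.cassandra.tools.NodeProbe;

public class CleanupExample
{
    public static void main(String[] args) throws Exception
    {
        boolean failed;
        try (NodeProbe probe = new NodeProbe("127.0.0.1", 7199)) // NodeProbe is AutoCloseable as of this change
        {
            // Prints a warning and marks the probe as failed if the server-side
            // cleanup reports a non-zero status for any column family.
            probe.forceKeyspaceCleanup(System.out, "my_keyspace");
            failed = probe.isFailed();
        }
        System.exit(failed ? 1 : 0);
    }
}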
     public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
     {
         ssProxy.forceKeyspaceCompaction(keyspaceName, columnFamilies);
@@ -210,19 +244,14 @@
         ssProxy.forceKeyspaceFlush(keyspaceName, columnFamilies);
     }
 
-    public void forceKeyspaceRepair(String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException
-    {
-        ssProxy.forceKeyspaceRepair(keyspaceName, isSequential, isLocal, columnFamilies);
-    }
-
-    public void forceRepairAsync(final PrintStream out, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts,  boolean primaryRange, String... columnFamilies) throws IOException
+    public void forceRepairAsync(final PrintStream out, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean fullRepair, String... columnFamilies) throws IOException
     {
         RepairRunner runner = new RepairRunner(out, keyspaceName, columnFamilies);
         try
         {
             jmxc.addConnectionNotificationListener(runner, null, null);
             ssProxy.addNotificationListener(runner, null, null);
-            if (!runner.repairAndWait(ssProxy, isSequential, dataCenters, hosts, primaryRange))
+            if (!runner.repairAndWait(ssProxy, isSequential, dataCenters, hosts, primaryRange, fullRepair))
                 failed = true;
         }
         catch (Exception e)
@@ -236,18 +265,22 @@
                 ssProxy.removeNotificationListener(runner);
                 jmxc.removeConnectionNotificationListener(runner);
             }
-            catch (Throwable ignored) {}
+            catch (Throwable t)
+            {
+                JVMStabilityInspector.inspectThrowable(t);
+                out.println("Exception occurred during clean-up. " + t);
+            }
         }
     }
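Both repair entry points now take a fullRepair flag that is forwarded to the StorageService MBean, selecting between full and incremental repair. A hedged caller sketch, with placeholder keyspace name and connection details:

// Editorial sketch, not part of this patch: invoking the extended repair entry point.
// Keyspace name and connection details are placeholder assumptions; null dataCenters
// and hosts mean "no restriction", mirroring how the nodetool command passes them.
import org.apache.cassandra.tools.NodeProbe;

public class RepairExample
{
    public static void main(String[] args) throws Exception
    {
        try (NodeProbe probe = new NodeProbe("127.0.0.1", 7199))
        {
            // sequential, unrestricted, non-primary-range, full repair of one keyspace
            probe.forceRepairAsync(System.out, "my_keyspace", true, null, null, false, true);
        }
    }
}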
 
-    public void forceRepairRangeAsync(final PrintStream out, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts, final String startToken, final String endToken, String... columnFamilies) throws IOException
+    public void forceRepairRangeAsync(final PrintStream out, final String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, final String startToken, final String endToken, boolean fullRepair, String... columnFamilies) throws IOException
     {
         RepairRunner runner = new RepairRunner(out, keyspaceName, columnFamilies);
         try
         {
             jmxc.addConnectionNotificationListener(runner, null, null);
             ssProxy.addNotificationListener(runner, null, null);
-            if (!runner.repairRangeAndWait(ssProxy,  isSequential, dataCenters, hosts, startToken, endToken))
+            if (!runner.repairRangeAndWait(ssProxy,  isSequential, dataCenters, hosts, startToken, endToken, fullRepair))
                 failed = true;
         }
         catch (Exception e)
@@ -261,18 +294,16 @@
                 ssProxy.removeNotificationListener(runner);
                 jmxc.removeConnectionNotificationListener(runner);
             }
-            catch (Throwable ignored) {}
+            catch (Throwable e)
+            {
+                out.println("Exception occurred during clean-up. " + e);
+            }
         }
     }
 
-    public void forceKeyspaceRepairPrimaryRange(String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException
+    public void invalidateCounterCache()
     {
-        ssProxy.forceKeyspaceRepairPrimaryRange(keyspaceName, isSequential, isLocal, columnFamilies);
-    }
-
-    public void forceKeyspaceRepairRange(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, String... columnFamilies) throws IOException
-    {
-        ssProxy.forceKeyspaceRepairRange(beginToken, endToken, keyspaceName, isSequential, isLocal, columnFamilies);
+        cacheService.invalidateCounterCache();
     }
 
     public void invalidateKeyCache()
@@ -349,6 +380,11 @@
         }
     }
 
+    public double[] getAndResetGCStats()
+    {
+        return gcProxy.getAndResetStats();
+    }
+
     public Iterator<Map.Entry<String, ColumnFamilyStoreMBean>> getColumnFamilyStoreMBeanProxies()
     {
         try
@@ -451,6 +487,16 @@
         ssProxy.clearSnapshot(tag, keyspaces);
     }
 
+    public Map<String, TabularData> getSnapshotDetails()
+    {
+        return ssProxy.getSnapshotDetails();
+    }
+
+    public long trueSnapshotsSize()
+    {
+        return ssProxy.trueSnapshotsSize();
+    }
+
     public boolean isJoined()
     {
         return ssProxy.isJoined();
@@ -529,12 +575,7 @@
         ssProxy.setIncrementalBackupsEnabled(enabled);
     }
 
-    public void setHintedHandoffThrottleInKB(int throttleInKb)
-    {
-        ssProxy.setHintedHandoffThrottleInKB(throttleInKb);
-    }
-
-    public void setCacheCapacities(int keyCacheCapacity, int rowCacheCapacity)
+    public void setCacheCapacities(int keyCacheCapacity, int rowCacheCapacity, int counterCacheCapacity)
     {
         try
         {
@@ -542,6 +583,7 @@
             CacheServiceMBean cacheMBean = JMX.newMBeanProxy(mbeanServerConn, new ObjectName(keyCachePath), CacheServiceMBean.class);
             cacheMBean.setKeyCacheCapacityInMB(keyCacheCapacity);
             cacheMBean.setRowCacheCapacityInMB(rowCacheCapacity);
+            cacheMBean.setCounterCacheCapacityInMB(counterCacheCapacity);
         }
         catch (MalformedObjectNameException e)
         {
@@ -549,7 +591,7 @@
         }
     }
 
-    public void setCacheKeysToSave(int keyCacheKeysToSave, int rowCacheKeysToSave)
+    public void setCacheKeysToSave(int keyCacheKeysToSave, int rowCacheKeysToSave, int counterCacheKeysToSave)
     {
         try
         {
@@ -557,6 +599,7 @@
             CacheServiceMBean cacheMBean = JMX.newMBeanProxy(mbeanServerConn, new ObjectName(keyCachePath), CacheServiceMBean.class);
             cacheMBean.setKeyCacheKeysToSave(keyCacheKeysToSave);
             cacheMBean.setRowCacheKeysToSave(rowCacheKeysToSave);
+            cacheMBean.setCounterCacheKeysToSave(counterCacheKeysToSave);
         }
         catch (MalformedObjectNameException e)
         {
@@ -564,6 +607,10 @@
         }
     }
 
+    public void setHintedHandoffThrottleInKB(int throttleInKB)
+    {
+        ssProxy.setHintedHandoffThrottleInKB(throttleInKB);
+    }
 
     public List<InetAddress> getEndpoints(String keyspace, String cf, String key)
     {
@@ -663,7 +710,7 @@
 
         for (Map.Entry<String, String> pair : tokenToEndpoint.entrySet())
         {
-            if (pair.getKey().toString().equals(stringToken))
+            if (pair.getKey().equals(stringToken))
             {
                 return pair.getValue();
             }
@@ -907,6 +954,179 @@
         return spProxy.getReadRepairRepairedBackground();
     }
 
+    // JMX getters for the o.a.c.metrics API below.
+    /**
+     * Retrieve cache metrics based on the cache type (KeyCache, RowCache, or CounterCache)
+     * @param cacheType KeyCache, RowCache, or CounterCache
+     * @param metricName Capacity, Entries, HitRate, Size, Requests or Hits.
+     */
+    public Object getCacheMetric(String cacheType, String metricName)
+    {
+        try
+        {
+            switch(metricName)
+            {
+                case "Capacity":
+                case "Entries":
+                case "HitRate":
+                case "Size":
+                    return JMX.newMBeanProxy(mbeanServerConn,
+                            new ObjectName("org.apache.cassandra.metrics:type=Cache,scope=" + cacheType + ",name=" + metricName),
+                            JmxReporter.GaugeMBean.class).getValue();
+                case "Requests":
+                case "Hits":
+                    return JMX.newMBeanProxy(mbeanServerConn,
+                            new ObjectName("org.apache.cassandra.metrics:type=Cache,scope=" + cacheType + ",name=" + metricName),
+                            JmxReporter.MeterMBean.class).getCount();
+                default:
+                    throw new RuntimeException("Unknown cache metric name.");
+
+            }
+        }
+        catch (MalformedObjectNameException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
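Since getCacheMetric returns either a gauge value or a meter count as a plain Object, a caller typically just formats it. A hedged sketch, assuming a node reachable over JMX at 127.0.0.1:7199:

// Editorial sketch, not part of this patch: reading key-cache statistics through
// the new metrics-based getter. Connection details are placeholder assumptions.
import org.apache.cassandra.tools.NodeProbe;

public class CacheMetricExample
{
    public static void main(String[] args) throws Exception
    {
        try (NodeProbe probe = new NodeProbe("127.0.0.1", 7199))
        {
            Object hitRate  = probe.getCacheMetric("KeyCache", "HitRate");  // gauge value
            Object requests = probe.getCacheMetric("KeyCache", "Requests"); // meter count
            System.out.println("Key cache hit rate: " + hitRate + " over " + requests + " requests");
        }
    }
}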
+    /**
+     * Retrieve ColumnFamily metrics
+     * @param ks Keyspace for which stats are to be displayed.
+     * @param cf ColumnFamily for which stats are to be displayed.
+     * @param metricName Name of the metric; see {@link org.apache.cassandra.metrics.ColumnFamilyMetrics}.
+     */
+    public Object getColumnFamilyMetric(String ks, String cf, String metricName)
+    {
+        try
+        {
+            String type = cf.contains(".") ? "IndexColumnFamily": "ColumnFamily";
+            ObjectName oName = new ObjectName(String.format("org.apache.cassandra.metrics:type=%s,keyspace=%s,scope=%s,name=%s", type, ks, cf, metricName));
+            switch(metricName)
+            {
+                case "BloomFilterDiskSpaceUsed":
+                case "BloomFilterFalsePositives":
+                case "BloomFilterFalseRatio":
+                case "CompressionRatio":
+                case "EstimatedColumnCountHistogram":
+                case "EstimatedRowSizeHistogram":
+                case "KeyCacheHitRate":
+                case "LiveSSTableCount":
+                case "MaxRowSize":
+                case "MeanRowSize":
+                case "MemtableColumnsCount":
+                case "MemtableLiveDataSize":
+                case "MinRowSize":
+                case "RecentBloomFilterFalsePositives":
+                case "RecentBloomFilterFalseRatio":
+                case "SnapshotsSize":
+                    return JMX.newMBeanProxy(mbeanServerConn, oName, JmxReporter.GaugeMBean.class).getValue();
+                case "LiveDiskSpaceUsed":
+                case "MemtableSwitchCount":
+                case "SpeculativeRetries":
+                case "TotalDiskSpaceUsed":
+                case "WriteTotalLatency":
+                case "ReadTotalLatency":
+                case "PendingFlushes":
+                    return JMX.newMBeanProxy(mbeanServerConn, oName, JmxReporter.CounterMBean.class).getCount();
+                case "ReadLatency":
+                case "CoordinatorReadLatency":
+                case "CoordinatorScanLatency":
+                case "WriteLatency":
+                    return JMX.newMBeanProxy(mbeanServerConn, oName, JmxReporter.TimerMBean.class);
+                case "LiveScannedHistogram":
+                case "SSTablesPerReadHistogram":
+                case "TombstoneScannedHistogram":
+                    return JMX.newMBeanProxy(mbeanServerConn, oName, JmxReporter.HistogramMBean.class);
+                default:
+                    throw new RuntimeException("Unknown column family metric.");
+            }
+        }
+        catch (MalformedObjectNameException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Retrieve Proxy metrics
+     * @param scope RangeSlice, Read or Write
+     */
+    public JmxReporter.TimerMBean getProxyMetric(String scope)
+    {
+        try
+        {
+            return JMX.newMBeanProxy(mbeanServerConn,
+                    new ObjectName("org.apache.cassandra.metrics:type=ClientRequest,scope=" + scope + ",name=Latency"),
+                    JmxReporter.TimerMBean.class);
+        }
+        catch (MalformedObjectNameException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Retrieve Compaction metrics
+     * @param metricName CompletedTasks, PendingTasks, BytesCompacted or TotalCompactionsCompleted.
+     */
+    public Object getCompactionMetric(String metricName)
+    {
+        try
+        {
+            switch(metricName)
+            {
+                case "BytesCompacted":
+                    return JMX.newMBeanProxy(mbeanServerConn,
+                            new ObjectName("org.apache.cassandra.metrics:type=Compaction,name=" + metricName),
+                            JmxReporter.CounterMBean.class);
+                case "CompletedTasks":
+                case "PendingTasks":
+                    return JMX.newMBeanProxy(mbeanServerConn,
+                            new ObjectName("org.apache.cassandra.metrics:type=Compaction,name=" + metricName),
+                            JmxReporter.GaugeMBean.class).getValue();
+                case "TotalCompactionsCompleted":
+                    return JMX.newMBeanProxy(mbeanServerConn,
+                            new ObjectName("org.apache.cassandra.metrics:type=Compaction,name=" + metricName),
+                            JmxReporter.MeterMBean.class);
+                default:
+                    throw new RuntimeException("Unknown compaction metric.");
+            }
+        }
+        catch (MalformedObjectNameException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
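+
+    // Note the mixed return types: "CompletedTasks" and "PendingTasks" come back as plain gauge
+    // values (so "nodetool compactionstats" can print them directly), while "BytesCompacted" and
+    // "TotalCompactionsCompleted" come back as CounterMBean/MeterMBean proxies.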
+
+    /**
+     * Retrieve Storage metrics
+     * @param metricName Exceptions, Load, TotalHints or TotalHintsInProgress.
+     */
+    public long getStorageMetric(String metricName)
+    {
+        try
+        {
+            return JMX.newMBeanProxy(mbeanServerConn,
+                    new ObjectName("org.apache.cassandra.metrics:type=Storage,name=" + metricName),
+                    JmxReporter.CounterMBean.class).getCount();
+        }
+        catch (MalformedObjectNameException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
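+
+    // e.g. getStorageMetric("Exceptions") is what "nodetool info" prints as the exception count.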
+
+    public double[] metricPercentilesAsArray(JmxReporter.HistogramMBean metric)
+    {
+        return new double[]{ metric.get50thPercentile(),
+                metric.get75thPercentile(),
+                metric.get95thPercentile(),
+                metric.get98thPercentile(),
+                metric.get99thPercentile(),
+                metric.getMin(),
+                metric.getMax()};
+    }
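+
+    // Timer-backed metrics ("ReadLatency", "WriteLatency", the proxy latencies) are returned as
+    // JmxReporter.TimerMBean proxies above, so a matching accessor is sketched here under the
+    // assumption that TimerMBean exposes the same percentile/min/max getters as HistogramMBean.
+    public double[] metricPercentilesAsArray(JmxReporter.TimerMBean metric)
+    {
+        return new double[]{ metric.get50thPercentile(),
+                metric.get75thPercentile(),
+                metric.get95thPercentile(),
+                metric.get98thPercentile(),
+                metric.get99thPercentile(),
+                metric.getMin(),
+                metric.getMax()};
+    }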
+
     public TabularData getCompactionHistory()
     {
         return compactionProxy.getCompactionHistory();
@@ -919,12 +1139,19 @@
 
     public void setLoggingLevel(String classQualifier, String level)
     {
-        ssProxy.setLog4jLevel(classQualifier, level);
+        try
+        {
+            ssProxy.setLoggingLevel(classQualifier, level);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException("Error setting log level to " + level + " for " + classQualifier + ". Please check the logback configuration and ensure <jmxConfigurator /> is set", e);
+        }
     }
 
     public Map<String, String> getLoggingLevels()
     {
-         return ssProxy.getLoggingLevels();
+        return ssProxy.getLoggingLevels();
     }
 }
 
@@ -1055,16 +1282,16 @@
         this.columnFamilies = columnFamilies;
     }
 
-    public boolean repairAndWait(StorageServiceMBean ssProxy, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts, boolean primaryRangeOnly) throws Exception
+    public boolean repairAndWait(StorageServiceMBean ssProxy, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRangeOnly, boolean fullRepair) throws Exception
     {
-        cmd = ssProxy.forceRepairAsync(keyspace, isSequential, dataCenters, hosts, primaryRangeOnly, columnFamilies);
+        cmd = ssProxy.forceRepairAsync(keyspace, isSequential, dataCenters, hosts, primaryRangeOnly, fullRepair, columnFamilies);
         waitForRepair();
         return success;
     }
 
-    public boolean repairRangeAndWait(StorageServiceMBean ssProxy, boolean isSequential, Collection<String> dataCenters, final Collection<String> hosts, String startToken, String endToken) throws Exception
+    public boolean repairRangeAndWait(StorageServiceMBean ssProxy, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, String startToken, String endToken, boolean fullRepair) throws Exception
     {
-        cmd = ssProxy.forceRepairRangeAsync(startToken, endToken, keyspace, isSequential, dataCenters, hosts, columnFamilies);
+        cmd = ssProxy.forceRepairRangeAsync(startToken, endToken, keyspace, isSequential, dataCenters, hosts, fullRepair, columnFamilies);
         waitForRepair();
         return success;
     }
diff --git a/src/java/org/apache/cassandra/tools/NodeTool.java b/src/java/org/apache/cassandra/tools/NodeTool.java
new file mode 100644
index 0000000..55cd869
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/NodeTool.java
@@ -0,0 +1,2424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.tools;
+
+import java.io.*;
+import java.lang.management.MemoryUsage;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.text.DecimalFormat;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.concurrent.ExecutionException;
+
+import javax.management.openmbean.TabularData;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Throwables;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Maps;
+import com.yammer.metrics.reporting.JmxReporter;
+
+import io.airlift.command.*;
+
+import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutorMBean;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStoreMBean;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.CompactionManagerMBean;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.locator.EndpointSnitchInfoMBean;
+import org.apache.cassandra.locator.LocalStrategy;
+import org.apache.cassandra.net.MessagingServiceMBean;
+import org.apache.cassandra.service.CacheServiceMBean;
+import org.apache.cassandra.streaming.ProgressInfo;
+import org.apache.cassandra.streaming.SessionInfo;
+import org.apache.cassandra.streaming.StreamState;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.base.Throwables.getStackTraceAsString;
+import static com.google.common.collect.Iterables.toArray;
+import static com.google.common.collect.Lists.newArrayList;
+import static java.lang.Integer.parseInt;
+import static java.lang.String.format;
+import static org.apache.commons.lang3.ArrayUtils.EMPTY_STRING_ARRAY;
+import static org.apache.commons.lang3.StringUtils.*;
+
+public class NodeTool
+{
+    private static final String HISTORYFILE = "nodetool.history";
+
+    public static void main(String... args)
+    {
+        List<Class<? extends Runnable>> commands = newArrayList(
+                Help.class,
+                Info.class,
+                Ring.class,
+                NetStats.class,
+                CfStats.class,
+                CfHistograms.class,
+                Cleanup.class,
+                ClearSnapshot.class,
+                Compact.class,
+                Scrub.class,
+                Flush.class,
+                UpgradeSSTable.class,
+                DisableAutoCompaction.class,
+                EnableAutoCompaction.class,
+                CompactionStats.class,
+                CompactionHistory.class,
+                Decommission.class,
+                DescribeCluster.class,
+                DisableBinary.class,
+                EnableBinary.class,
+                EnableGossip.class,
+                DisableGossip.class,
+                EnableHandoff.class,
+                EnableThrift.class,
+                GcStats.class,
+                GetCompactionThreshold.class,
+                GetCompactionThroughput.class,
+                GetStreamThroughput.class,
+                GetEndpoints.class,
+                GetSSTables.class,
+                GossipInfo.class,
+                InvalidateKeyCache.class,
+                InvalidateRowCache.class,
+                InvalidateCounterCache.class,
+                Join.class,
+                Move.class,
+                PauseHandoff.class,
+                ResumeHandoff.class,
+                ProxyHistograms.class,
+                Rebuild.class,
+                Refresh.class,
+                RemoveToken.class,
+                RemoveNode.class,
+                Repair.class,
+                SetCacheCapacity.class,
+                SetHintedHandoffThrottleInKB.class,
+                SetCompactionThreshold.class,
+                SetCompactionThroughput.class,
+                SetStreamThroughput.class,
+                SetTraceProbability.class,
+                Snapshot.class,
+                ListSnapshots.class,
+                Status.class,
+                StatusBinary.class,
+                StatusThrift.class,
+                Stop.class,
+                StopDaemon.class,
+                Version.class,
+                DescribeRing.class,
+                RebuildIndex.class,
+                RangeKeySample.class,
+                EnableBackup.class,
+                DisableBackup.class,
+                ResetLocalSchema.class,
+                ReloadTriggers.class,
+                SetCacheKeysToSave.class,
+                DisableThrift.class,
+                DisableHandoff.class,
+                Drain.class,
+                TruncateHints.class,
+                TpStats.class,
+                SetLoggingLevel.class,
+                GetLoggingLevels.class
+        );
+
+        Cli<Runnable> parser = Cli.<Runnable>builder("nodetool")
+                .withDescription("Manage your Cassandra cluster")
+                .withDefaultCommand(Help.class)
+                .withCommands(commands)
+                .build();
+
+        int status = 0;
+        try
+        {
+            Runnable parse = parser.parse(args);
+            printHistory(args);
+            parse.run();
+        } catch (IllegalArgumentException |
+                IllegalStateException |
+                ParseArgumentsMissingException |
+                ParseArgumentsUnexpectedException |
+                ParseOptionConversionException |
+                ParseOptionMissingException |
+                ParseOptionMissingValueException |
+                ParseCommandMissingException |
+                ParseCommandUnrecognizedException e)
+        {
+            badUse(e);
+            status = 1;
+        } catch (Throwable throwable)
+        {
+            err(Throwables.getRootCause(throwable));
+            status = 2;
+        }
+
+        System.exit(status);
+    }
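+
+    // e.g. "nodetool info", "nodetool -h 10.0.0.1 -p 7199 status", or "nodetool -u <user> -pw <password> repair";
+    // -h defaults to 127.0.0.1 and -p to 7199 (see the global options on NodeToolCmd below).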
+
+    private static void printHistory(String... args)
+    {
+        //don't bother to print if no args passed (meaning, nodetool is just printing out the sub-commands list)
+        if (args.length == 0)
+            return;
+
+        String cmdLine = Joiner.on(" ").skipNulls().join(args);
+        cmdLine = cmdLine.replaceFirst("(?<=(-pw|--password))\\s+\\S+", " <hidden>");
+
+        try (FileWriter writer = new FileWriter(new File(FBUtilities.getToolsOutputDirectory(), HISTORYFILE), true))
+        {
+            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
+            writer.append(sdf.format(new Date())).append(": ").append(cmdLine).append(System.lineSeparator());
+        }
+        catch (IOException | IOError ioe)
+        {
+            //quietly ignore any errors about not being able to write out history
+        }
+    }
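+
+    // e.g. the args "-u cassandra -pw secret repair" are recorded as "-u cassandra -pw <hidden> repair":
+    // only the value following -pw/--password is masked before the line is appended to nodetool.history.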
+
+    private static void badUse(Exception e)
+    {
+        System.out.println("nodetool: " + e.getMessage());
+        System.out.println("See 'nodetool help' or 'nodetool help <command>'.");
+    }
+
+    private static void err(Throwable e)
+    {
+        System.err.println("error: " + e.getMessage());
+        System.err.println("-- StackTrace --");
+        System.err.println(getStackTraceAsString(e));
+    }
+
+    public static abstract class NodeToolCmd implements Runnable
+    {
+
+        @Option(type = OptionType.GLOBAL, name = {"-h", "--host"}, description = "Node hostname or ip address")
+        private String host = "127.0.0.1";
+
+        @Option(type = OptionType.GLOBAL, name = {"-p", "--port"}, description = "Remote jmx agent port number")
+        private String port = "7199";
+
+        @Option(type = OptionType.GLOBAL, name = {"-u", "--username"}, description = "Remote jmx agent username")
+        private String username = EMPTY;
+
+        @Option(type = OptionType.GLOBAL, name = {"-pw", "--password"}, description = "Remote jmx agent password")
+        private String password = EMPTY;
+
+        @Option(type = OptionType.GLOBAL, name = {"-pwf", "--password-file"}, description = "Path to the JMX password file")
+        private String passwordFilePath = EMPTY;
+
+        @Override
+        public void run()
+        {
+            if (isNotEmpty(username)) {
+                if (isNotEmpty(passwordFilePath))
+                    password = readUserPasswordFromFile(username, passwordFilePath);
+
+                if (isEmpty(password))
+                    password = promptAndReadPassword();
+            }
+
+            try (NodeProbe probe = connect())
+            {
+                execute(probe);
+            } 
+            catch (IOException e)
+            {
+                throw new RuntimeException("Error while closing JMX connection", e);
+            }
+
+        }
+
+        private String readUserPasswordFromFile(String username, String passwordFilePath) {
+            String password = EMPTY;
+
+            File passwordFile = new File(passwordFilePath);
+            try (Scanner scanner = new Scanner(passwordFile).useDelimiter("\\s+"))
+            {
+                while (scanner.hasNextLine())
+                {
+                    if (scanner.hasNext())
+                    {
+                        String jmxRole = scanner.next();
+                        if (jmxRole.equals(username) && scanner.hasNext())
+                        {
+                            password = scanner.next();
+                            break;
+                        }
+                    }
+                    scanner.nextLine();
+                }
+            } catch (FileNotFoundException e)
+            {
+                throw new RuntimeException(e);
+            }
+
+            return password;
+        }
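+
+        // The password file is expected to contain whitespace-separated "<role> <password>" pairs,
+        // one per line (the same layout as a JMX password file), e.g.:
+        //   cassandra cassandrapassword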
+
+        private String promptAndReadPassword()
+        {
+            String password = EMPTY;
+
+            Console console = System.console();
+            if (console != null)
+                password = String.valueOf(console.readPassword("Password:"));
+
+            return password;
+        }
+
+        protected abstract void execute(NodeProbe probe);
+
+        private NodeProbe connect()
+        {
+            NodeProbe nodeClient = null;
+
+            try
+            {
+                if (username.isEmpty())
+                    nodeClient = new NodeProbe(host, parseInt(port));
+                else
+                    nodeClient = new NodeProbe(host, parseInt(port), username, password);
+            } catch (IOException e)
+            {
+                Throwable rootCause = Throwables.getRootCause(e);
+                System.err.println(format("nodetool: Failed to connect to '%s:%s' - %s: '%s'.", host, port, rootCause.getClass().getSimpleName(), rootCause.getMessage()));
+                System.exit(1);
+            }
+
+            return nodeClient;
+        }
+
+        protected List<String> parseOptionalKeyspace(List<String> cmdArgs, NodeProbe nodeProbe)
+        {
+            List<String> keyspaces = new ArrayList<>();
+
+            if (cmdArgs == null || cmdArgs.isEmpty())
+                keyspaces.addAll(nodeProbe.getKeyspaces());
+            else
+                keyspaces.add(cmdArgs.get(0));
+
+            for (String keyspace : keyspaces)
+            {
+                if (!nodeProbe.getKeyspaces().contains(keyspace))
+                    throw new IllegalArgumentException("Keyspace [" + keyspace + "] does not exist.");
+            }
+
+            return Collections.unmodifiableList(keyspaces);
+        }
+
+        protected String[] parseOptionalColumnFamilies(List<String> cmdArgs)
+        {
+            return cmdArgs.size() <= 1 ? EMPTY_STRING_ARRAY : toArray(cmdArgs.subList(1, cmdArgs.size()), String.class);
+        }
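+
+        // e.g. for "nodetool cleanup ks1 cf1 cf2" (placeholder names): parseOptionalKeyspace yields ["ks1"]
+        // and parseOptionalColumnFamilies yields ["cf1", "cf2"]; with no args every keyspace is used and
+        // the column family array is empty, meaning "all column families".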
+    }
+
+    @Command(name = "info", description = "Print node information (uptime, load, ...)")
+    public static class Info extends NodeToolCmd
+    {
+        @Option(name = {"-T", "--tokens"}, description = "Display all tokens")
+        private boolean tokens = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            boolean gossipInitialized = probe.isInitialized();
+
+            System.out.printf("%-17s: %s%n", "ID", probe.getLocalHostId());
+            System.out.printf("%-17s: %s%n", "Gossip active", gossipInitialized);
+            System.out.printf("%-17s: %s%n", "Thrift active", probe.isThriftServerRunning());
+            System.out.printf("%-17s: %s%n", "Native Transport active", probe.isNativeTransportRunning());
+            System.out.printf("%-17s: %s%n", "Load", probe.getLoadString());
+            if (gossipInitialized)
+                System.out.printf("%-17s: %s%n", "Generation No", probe.getCurrentGenerationNumber());
+            else
+                System.out.printf("%-17s: %s%n", "Generation No", 0);
+
+            // Uptime
+            long secondsUp = probe.getUptime() / 1000;
+            System.out.printf("%-17s: %d%n", "Uptime (seconds)", secondsUp);
+
+            // Memory usage
+            MemoryUsage heapUsage = probe.getHeapMemoryUsage();
+            double memUsed = (double) heapUsage.getUsed() / (1024 * 1024);
+            double memMax = (double) heapUsage.getMax() / (1024 * 1024);
+            System.out.printf("%-17s: %.2f / %.2f%n", "Heap Memory (MB)", memUsed, memMax);
+
+            // Data Center/Rack
+            System.out.printf("%-17s: %s%n", "Data Center", probe.getDataCenter());
+            System.out.printf("%-17s: %s%n", "Rack", probe.getRack());
+
+            // Exceptions
+            System.out.printf("%-17s: %s%n", "Exceptions", probe.getStorageMetric("Exceptions"));
+
+            CacheServiceMBean cacheService = probe.getCacheServiceMBean();
+
+            // Key Cache: Hits, Requests, RecentHitRate, SavePeriodInSeconds
+            System.out.printf("%-17s: entries %d, size %s, capacity %s, %d hits, %d requests, %.3f recent hit rate, %d save period in seconds%n",
+                    "Key Cache",
+                    probe.getCacheMetric("KeyCache", "Entries"),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("KeyCache", "Size")),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("KeyCache", "Capacity")),
+                    probe.getCacheMetric("KeyCache", "Hits"),
+                    probe.getCacheMetric("KeyCache", "Requests"),
+                    probe.getCacheMetric("KeyCache", "HitRate"),
+                    cacheService.getKeyCacheSavePeriodInSeconds());
+
+            // Row Cache: Hits, Requests, RecentHitRate, SavePeriodInSeconds
+            System.out.printf("%-17s: entries %d, size %s, capacity %s, %d hits, %d requests, %.3f recent hit rate, %d save period in seconds%n",
+                    "Row Cache",
+                    probe.getCacheMetric("RowCache", "Entries"),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("RowCache", "Size")),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("RowCache", "Capacity")),
+                    probe.getCacheMetric("RowCache", "Hits"),
+                    probe.getCacheMetric("RowCache", "Requests"),
+                    probe.getCacheMetric("RowCache", "HitRate"),
+                    cacheService.getRowCacheSavePeriodInSeconds());
+
+            // Counter Cache: Hits, Requests, RecentHitRate, SavePeriodInSeconds
+            System.out.printf("%-17s: entries %d, size %s, capacity %s, %d hits, %d requests, %.3f recent hit rate, %d save period in seconds%n",
+                    "Counter Cache",
+                    probe.getCacheMetric("CounterCache", "Entries"),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("CounterCache", "Size")),
+                    FileUtils.stringifyFileSize((long) probe.getCacheMetric("CounterCache", "Capacity")),
+                    probe.getCacheMetric("CounterCache", "Hits"),
+                    probe.getCacheMetric("CounterCache", "Requests"),
+                    probe.getCacheMetric("CounterCache", "HitRate"),
+                    cacheService.getCounterCacheSavePeriodInSeconds());
+
+            // Tokens
+            List<String> tokens = probe.getTokens();
+            if (tokens.size() == 1 || this.tokens)
+                for (String token : tokens)
+                    System.out.printf("%-17s: %s%n", "Token", token);
+            else
+                System.out.printf("%-17s: (invoke with -T/--tokens to see all %d tokens)%n", "Token", tokens.size());
+        }
+    }
+
+    @Command(name = "ring", description = "Print information about the token ring")
+    public static class Ring extends NodeToolCmd
+    {
+        @Arguments(description = "Specify a keyspace for accurate ownership information (topology awareness)")
+        private String keyspace = null;
+
+        @Option(title = "resolve_ip", name = {"-r", "--resolve-ip"}, description = "Show node domain names instead of IPs")
+        private boolean resolveIp = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            Map<String, String> tokensToEndpoints = probe.getTokenToEndpointMap();
+            LinkedHashMultimap<String, String> endpointsToTokens = LinkedHashMultimap.create();
+            boolean haveVnodes = false;
+            for (Map.Entry<String, String> entry : tokensToEndpoints.entrySet())
+            {
+                haveVnodes |= endpointsToTokens.containsKey(entry.getValue());
+                endpointsToTokens.put(entry.getValue(), entry.getKey());
+            }
+
+            int maxAddressLength = Collections.max(endpointsToTokens.keys(), new Comparator<String>()
+            {
+                @Override
+                public int compare(String first, String second)
+                {
+                    return ((Integer) first.length()).compareTo(second.length());
+                }
+            }).length();
+
+            String formatPlaceholder = "%%-%ds  %%-12s%%-7s%%-8s%%-16s%%-20s%%-44s%%n";
+            String format = format(formatPlaceholder, maxAddressLength);
+
+            StringBuffer errors = new StringBuffer();
+            boolean showEffectiveOwnership = true;
+            // Calculate per-token ownership of the ring
+            Map<InetAddress, Float> ownerships;
+            try
+            {
+                ownerships = probe.effectiveOwnership(keyspace);
+            } 
+            catch (IllegalStateException ex)
+            {
+                ownerships = probe.getOwnership();
+                errors.append("Note: " + ex.getMessage() + "%n");
+                showEffectiveOwnership = false;
+            } 
+            catch (IllegalArgumentException ex)
+            {
+                System.out.printf("%nError: " + ex.getMessage() + "%n");
+                return;
+            }
+
+            
+            System.out.println();
+            for (Entry<String, SetHostStat> entry : getOwnershipByDc(probe, resolveIp, tokensToEndpoints, ownerships).entrySet())
+                printDc(probe, format, entry.getKey(), endpointsToTokens, entry.getValue(), showEffectiveOwnership);
+
+            if (haveVnodes)
+            {
+                System.out.println("  Warning: \"nodetool ring\" is used to output all the tokens of a node.");
+                System.out.println("  To view status related info of a node use \"nodetool status\" instead.\n");
+            }
+
+            System.out.printf("%n  " + errors.toString());
+        }
+
+        private void printDc(NodeProbe probe, String format,
+                             String dc,
+                             LinkedHashMultimap<String, String> endpointsToTokens,
+                             SetHostStat hoststats, boolean showEffectiveOwnership)
+        {
+            Collection<String> liveNodes = probe.getLiveNodes();
+            Collection<String> deadNodes = probe.getUnreachableNodes();
+            Collection<String> joiningNodes = probe.getJoiningNodes();
+            Collection<String> leavingNodes = probe.getLeavingNodes();
+            Collection<String> movingNodes = probe.getMovingNodes();
+            Map<String, String> loadMap = probe.getLoadMap();
+
+            System.out.println("Datacenter: " + dc);
+            System.out.println("==========");
+
+            // get the total amount of replicas for this dc and the last token in this dc's ring
+            List<String> tokens = new ArrayList<>();
+            String lastToken = "";
+
+            for (HostStat stat : hoststats)
+            {
+                tokens.addAll(endpointsToTokens.get(stat.endpoint.getHostAddress()));
+                lastToken = tokens.get(tokens.size() - 1);
+            }
+
+            System.out.printf(format, "Address", "Rack", "Status", "State", "Load", "Owns", "Token");
+
+            if (hoststats.size() > 1)
+                System.out.printf(format, "", "", "", "", "", "", lastToken);
+            else
+                System.out.println();
+
+            for (HostStat stat : hoststats)
+            {
+                String endpoint = stat.endpoint.getHostAddress();
+                String rack;
+                try
+                {
+                    rack = probe.getEndpointSnitchInfoProxy().getRack(endpoint);
+                }
+                catch (UnknownHostException e)
+                {
+                    rack = "Unknown";
+                }
+
+                String status = liveNodes.contains(endpoint)
+                        ? "Up"
+                        : deadNodes.contains(endpoint)
+                                ? "Down"
+                                : "?";
+
+                String state = "Normal";
+
+                if (joiningNodes.contains(endpoint))
+                    state = "Joining";
+                else if (leavingNodes.contains(endpoint))
+                    state = "Leaving";
+                else if (movingNodes.contains(endpoint))
+                    state = "Moving";
+
+                String load = loadMap.containsKey(endpoint)
+                        ? loadMap.get(endpoint)
+                        : "?";
+                String owns = stat.owns != null && showEffectiveOwnership ? new DecimalFormat("##0.00%").format(stat.owns) : "?";
+                System.out.printf(format, stat.ipOrDns(), rack, status, state, load, owns, stat.token);
+            }
+            System.out.println();
+        }
+    }
+
+    @Command(name = "netstats", description = "Print network information on provided host (connecting node by default)")
+    public static class NetStats extends NodeToolCmd
+    {
+        @Option(title = "human_readable",
+                name = {"-H", "--human-readable"},
+                description = "Display bytes in human readable form, i.e. KB, MB, GB, TB")
+        private boolean humanReadable = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.printf("Mode: %s%n", probe.getOperationMode());
+            Set<StreamState> statuses = probe.getStreamStatus();
+            if (statuses.isEmpty())
+                System.out.println("Not sending any streams.");
+            for (StreamState status : statuses)
+            {
+                System.out.printf("%s %s%n", status.description, status.planId.toString());
+                for (SessionInfo info : status.sessions)
+                {
+                    System.out.printf("    %s%n", info.peer.toString());
+                    if (!info.receivingSummaries.isEmpty())
+                    {
+                        if (humanReadable)
+                            System.out.printf("        Receiving %d files, %s total%n", info.getTotalFilesToReceive(), FileUtils.stringifyFileSize(info.getTotalSizeToReceive()));
+                        else
+                            System.out.printf("        Receiving %d files, %d bytes total%n", info.getTotalFilesToReceive(), info.getTotalSizeToReceive());
+                        for (ProgressInfo progress : info.getReceivingFiles())
+                        {
+                            System.out.printf("            %s%n", progress.toString());
+                        }
+                    }
+                    if (!info.sendingSummaries.isEmpty())
+                    {
+                        if (humanReadable)
+                            System.out.printf("        Sending %d files, %s total%n", info.getTotalFilesToSend(), FileUtils.stringifyFileSize(info.getTotalSizeToSend()));
+                        else
+                            System.out.printf("        Sending %d files, %d bytes total%n", info.getTotalFilesToSend(), info.getTotalSizeToSend());
+                        for (ProgressInfo progress : info.getSendingFiles())
+                        {
+                            System.out.printf("            %s%n", progress.toString());
+                        }
+                    }
+                }
+            }
+
+            System.out.printf("Read Repair Statistics:%nAttempted: %d%nMismatch (Blocking): %d%nMismatch (Background): %d%n", probe.getReadRepairAttempted(), probe.getReadRepairRepairedBlocking(), probe.getReadRepairRepairedBackground());
+
+            MessagingServiceMBean ms = probe.msProxy;
+            System.out.printf("%-25s", "Pool Name");
+            System.out.printf("%10s", "Active");
+            System.out.printf("%10s", "Pending");
+            System.out.printf("%15s%n", "Completed");
+
+            int pending;
+            long completed;
+
+            pending = 0;
+            for (int n : ms.getCommandPendingTasks().values())
+                pending += n;
+            completed = 0;
+            for (long n : ms.getCommandCompletedTasks().values())
+                completed += n;
+            System.out.printf("%-25s%10s%10s%15s%n", "Commands", "n/a", pending, completed);
+
+            pending = 0;
+            for (int n : ms.getResponsePendingTasks().values())
+                pending += n;
+            completed = 0;
+            for (long n : ms.getResponseCompletedTasks().values())
+                completed += n;
+            System.out.printf("%-25s%10s%10s%15s%n", "Responses", "n/a", pending, completed);
+        }
+    }
+
+    @Command(name = "cfstats", description = "Print statistics on column families")
+    public static class CfStats extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace.cfname>...]", description = "List of column families (or keyspace) names")
+        private List<String> cfnames = new ArrayList<>();
+
+        @Option(name = "-i", description = "Ignore the list of column families and display the remaining cfs")
+        private boolean ignore = false;
+
+        @Option(title = "human_readable",
+                name = {"-H", "--human-readable"},
+                description = "Display bytes in human readable form, i.e. KB, MB, GB, TB")
+        private boolean humanReadable = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            OptionFilter filter = new OptionFilter(ignore, cfnames);
+            Map<String, List<ColumnFamilyStoreMBean>> cfstoreMap = new HashMap<>();
+
+            // get a list of column family stores
+            Iterator<Map.Entry<String, ColumnFamilyStoreMBean>> cfamilies = probe.getColumnFamilyStoreMBeanProxies();
+
+            while (cfamilies.hasNext())
+            {
+                Map.Entry<String, ColumnFamilyStoreMBean> entry = cfamilies.next();
+                String keyspaceName = entry.getKey();
+                ColumnFamilyStoreMBean cfsProxy = entry.getValue();
+
+                if (!cfstoreMap.containsKey(keyspaceName) && filter.isColumnFamilyIncluded(entry.getKey(), cfsProxy.getColumnFamilyName()))
+                {
+                    List<ColumnFamilyStoreMBean> columnFamilies = new ArrayList<>();
+                    columnFamilies.add(cfsProxy);
+                    cfstoreMap.put(keyspaceName, columnFamilies);
+                } else if (filter.isColumnFamilyIncluded(entry.getKey(), cfsProxy.getColumnFamilyName()))
+                {
+                    cfstoreMap.get(keyspaceName).add(cfsProxy);
+                }
+            }
+
+            // make sure all specified kss and cfs exist
+            filter.verifyKeyspaces(probe.getKeyspaces());
+            filter.verifyColumnFamilies();
+
+            // print out the table statistics
+            for (Map.Entry<String, List<ColumnFamilyStoreMBean>> entry : cfstoreMap.entrySet())
+            {
+                String keyspaceName = entry.getKey();
+                List<ColumnFamilyStoreMBean> columnFamilies = entry.getValue();
+                long keyspaceReadCount = 0;
+                long keyspaceWriteCount = 0;
+                int keyspacePendingFlushes = 0;
+                double keyspaceTotalReadTime = 0.0f;
+                double keyspaceTotalWriteTime = 0.0f;
+
+                System.out.println("Keyspace: " + keyspaceName);
+                for (ColumnFamilyStoreMBean cfstore : columnFamilies)
+                {
+                    String cfName = cfstore.getColumnFamilyName();
+                    long writeCount = ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "WriteLatency")).getCount();
+                    long readCount = ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "ReadLatency")).getCount();
+
+                    if (readCount > 0)
+                    {
+                        keyspaceReadCount += readCount;
+                        keyspaceTotalReadTime += (long) probe.getColumnFamilyMetric(keyspaceName, cfName, "ReadTotalLatency");
+                    }
+                    if (writeCount > 0)
+                    {
+                        keyspaceWriteCount += writeCount;
+                        keyspaceTotalWriteTime += (long) probe.getColumnFamilyMetric(keyspaceName, cfName, "WriteTotalLatency");
+                    }
+                    keyspacePendingFlushes += (long) probe.getColumnFamilyMetric(keyspaceName, cfName, "PendingFlushes");
+                }
+
+                double keyspaceReadLatency = keyspaceReadCount > 0
+                                             ? keyspaceTotalReadTime / keyspaceReadCount / 1000
+                                             : Double.NaN;
+                double keyspaceWriteLatency = keyspaceWriteCount > 0
+                                              ? keyspaceTotalWriteTime / keyspaceWriteCount / 1000
+                                              : Double.NaN;
+
+                System.out.println("\tRead Count: " + keyspaceReadCount);
+                System.out.println("\tRead Latency: " + String.format("%s", keyspaceReadLatency) + " ms.");
+                System.out.println("\tWrite Count: " + keyspaceWriteCount);
+                System.out.println("\tWrite Latency: " + String.format("%s", keyspaceWriteLatency) + " ms.");
+                System.out.println("\tPending Flushes: " + keyspacePendingFlushes);
+
+                // print out column family statistics for this keyspace
+                for (ColumnFamilyStoreMBean cfstore : columnFamilies)
+                {
+                    String cfName = cfstore.getColumnFamilyName();
+                    if (cfName.contains("."))
+                        System.out.println("\t\tTable (index): " + cfName);
+                    else
+                        System.out.println("\t\tTable: " + cfName);
+
+                    System.out.println("\t\tSSTable count: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "LiveSSTableCount"));
+
+                    int[] leveledSStables = cfstore.getSSTableCountPerLevel();
+                    if (leveledSStables != null)
+                    {
+                        System.out.print("\t\tSSTables in each level: [");
+                        for (int level = 0; level < leveledSStables.length; level++)
+                        {
+                            int count = leveledSStables[level];
+                            System.out.print(count);
+                            long maxCount = 4L; // for L0
+                            if (level > 0)
+                                maxCount = (long) Math.pow(10, level);
+                            //  show max threshold for level when exceeded
+                            if (count > maxCount)
+                                System.out.print("/" + maxCount);
+
+                            if (level < leveledSStables.length - 1)
+                                System.out.print(", ");
+                            else
+                                System.out.println("]");
+                        }
+                    }
+                    System.out.println("\t\tSpace used (live): " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "LiveDiskSpaceUsed"), humanReadable));
+                    System.out.println("\t\tSpace used (total): " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "TotalDiskSpaceUsed"), humanReadable));
+                    System.out.println("\t\tSpace used by snapshots (total): " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "SnapshotsSize"), humanReadable));
+                    System.out.println("\t\tSSTable Compression Ratio: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "CompressionRatio"));
+                    System.out.println("\t\tMemtable cell count: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "MemtableColumnsCount"));
+                    System.out.println("\t\tMemtable data size: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "MemtableLiveDataSize"), humanReadable));
+                    System.out.println("\t\tMemtable switch count: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "MemtableSwitchCount"));
+                    System.out.println("\t\tLocal read count: " + ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "ReadLatency")).getCount());
+                    double localReadLatency = ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "ReadLatency")).getMean() / 1000;
+                    double localRLatency = localReadLatency > 0 ? localReadLatency : Double.NaN;
+                    System.out.printf("\t\tLocal read latency: %01.3f ms%n", localRLatency);
+                    System.out.println("\t\tLocal write count: " + ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "WriteLatency")).getCount());
+                    double localWriteLatency = ((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "WriteLatency")).getMean() / 1000;
+                    double localWLatency = localWriteLatency > 0 ? localWriteLatency : Double.NaN;
+                    System.out.printf("\t\tLocal write latency: %01.3f ms%n", localWLatency);
+                    System.out.println("\t\tPending flushes: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "PendingFlushes"));
+                    System.out.println("\t\tBloom filter false positives: " + probe.getColumnFamilyMetric(keyspaceName, cfName, "BloomFilterFalsePositives"));
+                    System.out.printf("\t\tBloom filter false ratio: %s%n", String.format("%01.5f", probe.getColumnFamilyMetric(keyspaceName, cfName, "RecentBloomFilterFalseRatio")));
+                    System.out.println("\t\tBloom filter space used: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "BloomFilterDiskSpaceUsed"), humanReadable));
+                    System.out.println("\t\tCompacted partition minimum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "MinRowSize"), humanReadable));
+                    System.out.println("\t\tCompacted partition maximum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "MaxRowSize"), humanReadable));
+                    System.out.println("\t\tCompacted partition mean bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, cfName, "MeanRowSize"), humanReadable));
+                    JmxReporter.HistogramMBean histogram = (JmxReporter.HistogramMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "LiveScannedHistogram");
+                    System.out.println("\t\tAverage live cells per slice (last five minutes): " + histogram.getMean());
+                    System.out.println("\t\tMaximum live cells per slice (last five minutes): " + histogram.getMax());
+                    histogram = (JmxReporter.HistogramMBean) probe.getColumnFamilyMetric(keyspaceName, cfName, "TombstoneScannedHistogram");
+                    System.out.println("\t\tAverage tombstones per slice (last five minutes): " + histogram.getMean());
+                    System.out.println("\t\tMaximum tombstones per slice (last five minutes): " + histogram.getMax());
+
+                    System.out.println("");
+                }
+                System.out.println("----------------");
+            }
+        }
+
+        private String format(long bytes, boolean humanReadable) {
+            return humanReadable ? FileUtils.stringifyFileSize(bytes) : Long.toString(bytes);
+        }
+
+        /**
+         * Used for filtering keyspaces and columnfamilies to be displayed using the cfstats command.
+         */
+        private static class OptionFilter
+        {
+            private Map<String, List<String>> filter = new HashMap<>();
+            private Map<String, List<String>> verifier = new HashMap<>();
+            private List<String> filterList = new ArrayList<>();
+            private boolean ignoreMode;
+
+            public OptionFilter(boolean ignoreMode, List<String> filterList)
+            {
+                this.filterList.addAll(filterList);
+                this.ignoreMode = ignoreMode;
+
+                for (String s : filterList)
+                {
+                    String[] keyValues = s.split("\\.", 2);
+
+                    // build the map that stores the ks' and cfs to use
+                    if (!filter.containsKey(keyValues[0]))
+                    {
+                        filter.put(keyValues[0], new ArrayList<String>());
+                        verifier.put(keyValues[0], new ArrayList<String>());
+                    }
+
+                    if (keyValues.length == 2)
+                    {
+                        filter.get(keyValues[0]).add(keyValues[1]);
+                        verifier.get(keyValues[0]).add(keyValues[1]);
+                    }
+                }
+            }
+
+            public boolean isColumnFamilyIncluded(String keyspace, String columnFamily)
+            {
+                // supplying empty params list is treated as wanting to display all kss & cfs
+                if (filterList.isEmpty())
+                    return !ignoreMode;
+
+                List<String> cfs = filter.get(keyspace);
+
+                // no such keyspace is in the map
+                if (cfs == null)
+                    return ignoreMode;
+                // only a keyspace with no cfs was supplied,
+                // so ignore or include (based on the flag) every column family in the specified keyspace
+                else if (cfs.size() == 0)
+                    return !ignoreMode;
+
+                // keyspace exists, and it contains specific cfs
+                verifier.get(keyspace).remove(columnFamily);
+                return ignoreMode ^ cfs.contains(columnFamily);
+            }
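+
+            // e.g. with the filter ["ks1.cf1"] only ks1.cf1 is displayed; with -i (ignoreMode) the same
+            // filter displays everything except ks1.cf1 (the XOR above flips the decision per column family).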
+
+            public void verifyKeyspaces(List<String> keyspaces)
+            {
+                for (String ks : verifier.keySet())
+                    if (!keyspaces.contains(ks))
+                        throw new IllegalArgumentException("Unknown keyspace: " + ks);
+            }
+
+            public void verifyColumnFamilies()
+            {
+                for (String ks : filter.keySet())
+                    if (verifier.get(ks).size() > 0)
+                        throw new IllegalArgumentException("Unknown column families: " + verifier.get(ks).toString() + " in keyspace: " + ks);
+            }
+        }
+    }
+
+    @Command(name = "cfhistograms", description = "Print statistic histograms for a given column family")
+    public static class CfHistograms extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname>", description = "The keyspace and column family name")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 2, "cfhistograms requires ks and cf args");
+
+            String keyspace = args.get(0);
+            String cfname = args.get(1);
+
+            // calculate percentile of row size and column count
+            long[] estimatedRowSize = (long[]) probe.getColumnFamilyMetric(keyspace, cfname, "EstimatedRowSizeHistogram");
+            long[] estimatedColumnCount = (long[]) probe.getColumnFamilyMetric(keyspace, cfname, "EstimatedColumnCountHistogram");
+
+            long[] bucketOffsets = new EstimatedHistogram().getBucketOffsets();
+            EstimatedHistogram rowSizeHist = new EstimatedHistogram(bucketOffsets, estimatedRowSize);
+            EstimatedHistogram columnCountHist = new EstimatedHistogram(bucketOffsets, estimatedColumnCount);
+
+            // build arrays to store percentile values
+            double[] estimatedRowSizePercentiles = new double[7];
+            double[] estimatedColumnCountPercentiles = new double[7];
+            double[] offsetPercentiles = new double[]{0.5, 0.75, 0.95, 0.98, 0.99};
+            for (int i = 0; i < offsetPercentiles.length; i++)
+            {
+                estimatedRowSizePercentiles[i] = rowSizeHist.percentile(offsetPercentiles[i]);
+                estimatedColumnCountPercentiles[i] = columnCountHist.percentile(offsetPercentiles[i]);
+            }
+
+            // min value
+            estimatedRowSizePercentiles[5] = rowSizeHist.min();
+            estimatedColumnCountPercentiles[5] = columnCountHist.min();
+            // max value
+            estimatedRowSizePercentiles[6] = rowSizeHist.max();
+            estimatedColumnCountPercentiles[6] = columnCountHist.max();
+
+            String[] percentiles = new String[]{"50%", "75%", "95%", "98%", "99%", "Min", "Max"};
+            double[] readLatency = probe.metricPercentilesAsArray((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspace, cfname, "ReadLatency"));
+            double[] writeLatency = probe.metricPercentilesAsArray((JmxReporter.TimerMBean) probe.getColumnFamilyMetric(keyspace, cfname, "WriteLatency"));
+            double[] sstablesPerRead = probe.metricPercentilesAsArray((JmxReporter.HistogramMBean) probe.getColumnFamilyMetric(keyspace, cfname, "SSTablesPerReadHistogram"));
+
+            System.out.println(format("%s/%s histograms", keyspace, cfname));
+            System.out.println(format("%-10s%10s%18s%18s%18s%18s",
+                    "Percentile", "SSTables", "Write Latency", "Read Latency", "Partition Size", "Cell Count"));
+            System.out.println(format("%-10s%10s%18s%18s%18s%18s",
+                    "", "", "(micros)", "(micros)", "(bytes)", ""));
+
+            for (int i = 0; i < percentiles.length; i++)
+            {
+                System.out.println(format("%-10s%10.2f%18.2f%18.2f%18.0f%18.0f",
+                        percentiles[i],
+                        sstablesPerRead[i],
+                        writeLatency[i],
+                        readLatency[i],
+                        estimatedRowSizePercentiles[i],
+                        estimatedColumnCountPercentiles[i]));
+            }
+            System.out.println();
+        }
+    }
+
+    @Command(name = "cleanup", description = "Triggers the immediate cleanup of keys no longer belonging to a node. By default, clean all keyspaces")
+    public static class Cleanup extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                if (Keyspace.SYSTEM_KS.equals(keyspace))
+                    continue;
+
+                try
+                {
+                    probe.forceKeyspaceCleanup(System.out, keyspace, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during cleanup", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "clearsnapshot", description = "Remove the snapshot with the given name from the given keyspaces. If no snapshotName is specified we will remove all snapshots")
+    public static class ClearSnapshot extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspaces>...] ", description = "Remove snapshots from the given keyspaces")
+        private List<String> keyspaces = new ArrayList<>();
+
+        @Option(title = "snapshot_name", name = "-t", description = "Remove the snapshot with a given name")
+        private String snapshotName = EMPTY;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            StringBuilder sb = new StringBuilder();
+
+            sb.append("Requested clearing snapshot(s) for ");
+
+            if (keyspaces.isEmpty())
+                sb.append("[all keyspaces]");
+            else
+                sb.append("[").append(join(keyspaces, ", ")).append("]");
+
+            if (!snapshotName.isEmpty())
+                sb.append(" with snapshot name [").append(snapshotName).append("]");
+
+            System.out.println(sb.toString());
+
+            try
+            {
+                probe.clearSnapshot(snapshotName, toArray(keyspaces, String.class));
+            } catch (IOException e)
+            {
+                throw new RuntimeException("Error during clearing snapshots", e);
+            }
+        }
+    }
+
+    @Command(name = "compact", description = "Force a (major) compaction on one or more column families")
+    public static class Compact extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.forceKeyspaceCompaction(keyspace, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during compaction", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "flush", description = "Flush one or more column families")
+    public static class Flush extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.forceKeyspaceFlush(keyspace, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during flushing", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "scrub", description = "Scrub (rebuild sstables for) one or more column families")
+    public static class Scrub extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Option(title = "disable_snapshot",
+                name = {"-ns", "--no-snapshot"},
+                description = "Scrubbed CFs will be snapshotted first, if disableSnapshot is false. (default false)")
+        private boolean disableSnapshot = false;
+
+        @Option(title = "skip_corrupted",
+                name = {"-s", "--skip-corrupted"},
+                description = "Skip corrupted partitions even when scrubbing counter tables. (default false)")
+        private boolean skipCorrupted = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.scrub(System.out, disableSnapshot, skipCorrupted, keyspace, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during flushing", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "disableautocompaction", description = "Disable autocompaction for the given keyspace and column family")
+    public static class DisableAutoCompaction extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.disableAutoCompaction(keyspace, cfnames);
+                } catch (IOException e)
+                {
+                    throw new RuntimeException("Error occurred during disabling auto-compaction", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "enableautocompaction", description = "Enable autocompaction for the given keyspace and column family")
+    public static class EnableAutoCompaction extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.enableAutoCompaction(keyspace, cfnames);
+                } catch (IOException e)
+                {
+                    throw new RuntimeException("Error occurred during enabling auto-compaction", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "upgradesstables", description = "Rewrite sstables (for the requested column families) that are not on the current version (thus upgrading them to said current version)")
+    public static class UpgradeSSTable extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Option(title = "include_all", name = {"-a", "--include-all-sstables"}, description = "Use -a to include all sstables, even those already on the current version")
+        private boolean includeAll = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
+                    probe.upgradeSSTables(System.out, keyspace, !includeAll, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during enabling auto-compaction", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "compactionstats", description = "Print statistics on compactions")
+    public static class CompactionStats extends NodeToolCmd
+    {
+        @Option(title = "human_readable",
+                name = {"-H", "--human-readable"},
+                description = "Display bytes in human readable form, i.e. KB, MB, GB, TB")
+        private boolean humanReadable = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            int compactionThroughput = probe.getCompactionThroughput();
+            CompactionManagerMBean cm = probe.getCompactionManagerProxy();
+            System.out.println("pending tasks: " + probe.getCompactionMetric("PendingTasks"));
+            long remainingBytes = 0;
+            List<Map<String, String>> compactions = cm.getCompactions();
+            if (!compactions.isEmpty())
+            {
+                List<String[]> lines = new ArrayList<>();
+                int[] columnSizes = new int[] { 0, 0, 0, 0, 0, 0, 0 };
+
+                addLine(lines, columnSizes, "compaction type", "keyspace", "table", "completed", "total", "unit", "progress");
+                for (Map<String, String> c : compactions)
+                {
+                    long total = Long.parseLong(c.get("total"));
+                    long completed = Long.parseLong(c.get("completed"));
+                    String taskType = c.get("taskType");
+                    String keyspace = c.get("keyspace");
+                    String columnFamily = c.get("columnfamily");
+                    String completedStr = humanReadable ? FileUtils.stringifyFileSize(completed) : Long.toString(completed);
+                    String totalStr = humanReadable ? FileUtils.stringifyFileSize(total) : Long.toString(total);
+                    String unit = c.get("unit");
+                    String percentComplete = total == 0 ? "n/a" : new DecimalFormat("0.00").format((double) completed / total * 100) + "%";
+                    addLine(lines, columnSizes, taskType, keyspace, columnFamily, completedStr, totalStr, unit, percentComplete);
+                    if (taskType.equals(OperationType.COMPACTION.toString()))
+                        remainingBytes += total - completed;
+                }
+
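+                // Size each column to the widest value recorded while collecting rows, then build a matching printf format (3 spaces of padding per column).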
+                StringBuilder buffer = new StringBuilder();
+                for (int columnSize : columnSizes) {
+                    buffer.append("%");
+                    buffer.append(columnSize + 3);
+                    buffer.append("s");
+                }
+                buffer.append("%n");
+                String format = buffer.toString();
+
+                for (String[] line : lines)
+                {
+                    System.out.printf(format, line[0], line[1], line[2], line[3], line[4], line[5], line[6]);
+                }
+
+                String remainingTime = "n/a";
+                if (compactionThroughput != 0)
+                {
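+                    // Rough ETA: bytes left in active compactions divided by the throughput cap (MB/s converted to bytes/s).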
+                    long remainingTimeInSecs = remainingBytes / (1024L * 1024L * compactionThroughput);
+                    remainingTime = format("%dh%02dm%02ds", remainingTimeInSecs / 3600, (remainingTimeInSecs % 3600) / 60, (remainingTimeInSecs % 60));
+                }
+                System.out.printf("%25s%10s%n", "Active compaction remaining time : ", remainingTime);
+            }
+        }
+
+        private void addLine(List<String[]> lines, int[] columnSizes, String... columns) {
+            lines.add(columns);
+            for (int i = 0; i < columns.length; i++) {
+                columnSizes[i] = Math.max(columnSizes[i], columns[i].length());
+            }
+        }
+    }
+
+    @Command(name = "compactionhistory", description = "Print history of compaction")
+    public static class CompactionHistory extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("Compaction History: ");
+
+            TabularData tabularData = probe.getCompactionHistory();
+            if (tabularData.isEmpty())
+            {
+                System.out.printf("There is no compaction history");
+                return;
+            }
+
+            String format = "%-41s%-19s%-29s%-26s%-15s%-15s%s%n";
+            List<String> indexNames = tabularData.getTabularType().getIndexNames();
+            System.out.printf(format, toArray(indexNames, Object.class));
+
+            Set<?> values = tabularData.keySet();
+            for (Object eachValue : values)
+            {
+                List<?> value = (List<?>) eachValue;
+                System.out.printf(format, toArray(value, Object.class));
+            }
+        }
+    }
+
+    @Command(name = "decommission", description = "Decommission the *node I am connecting to*")
+    public static class Decommission extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                probe.decommission();
+            } catch (InterruptedException e)
+            {
+                throw new RuntimeException("Error decommissioning node", e);
+            }
+        }
+    }
+
+    @Command(name = "describecluster", description = "Print the name, snitch, partitioner and schema version of a cluster")
+    public static class DescribeCluster extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            // display cluster name, snitch and partitioner
+            System.out.println("Cluster Information:");
+            System.out.println("\tName: " + probe.getClusterName());
+            System.out.println("\tSnitch: " + probe.getEndpointSnitchInfoProxy().getSnitchName());
+            System.out.println("\tPartitioner: " + probe.getPartitioner());
+
+            // display schema version for each node
+            System.out.println("\tSchema versions:");
+            Map<String, List<String>> schemaVersions = probe.getSpProxy().getSchemaVersions();
+            for (String version : schemaVersions.keySet())
+            {
+                System.out.println(format("\t\t%s: %s%n", version, schemaVersions.get(version)));
+            }
+        }
+    }
+
+    @Command(name = "disablebinary", description = "Disable native transport (binary protocol)")
+    public static class DisableBinary extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.stopNativeTransport();
+        }
+    }
+
+    @Command(name = "enablebinary", description = "Reenable native transport (binary protocol)")
+    public static class EnableBinary extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.startNativeTransport();
+        }
+    }
+
+    @Command(name = "enablegossip", description = "Reenable gossip")
+    public static class EnableGossip extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.startGossiping();
+        }
+    }
+
+    @Command(name = "disablegossip", description = "Disable gossip (effectively marking the node down)")
+    public static class DisableGossip extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.stopGossiping();
+        }
+    }
+
+    @Command(name = "enablehandoff", description = "Reenable the future hints storing on the current node")
+    public static class EnableHandoff extends NodeToolCmd
+    {
+        @Arguments(usage = "<dc-name>,<dc-name>", description = "Enable hinted handoff only for these DCs")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() <= 1, "enablehandoff does not accept two args");
+            if(args.size() == 1)
+                probe.enableHintedHandoff(args.get(0));
+            else
+                probe.enableHintedHandoff();
+        }
+    }
+
+    @Command(name = "enablethrift", description = "Reenable thrift server")
+    public static class EnableThrift extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.startThriftServer();
+        }
+    }
+
+    @Command(name = "getcompactionthreshold", description = "Print min and max compaction thresholds for a given column family")
+    public static class GetCompactionThreshold extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname>", description = "The keyspace with a column family")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 2, "getcompactionthreshold requires ks and cf args");
+            String ks = args.get(0);
+            String cf = args.get(1);
+
+            ColumnFamilyStoreMBean cfsProxy = probe.getCfsProxy(ks, cf);
+            System.out.println("Current compaction thresholds for " + ks + "/" + cf + ": \n" +
+                    " min = " + cfsProxy.getMinimumCompactionThreshold() + ", " +
+                    " max = " + cfsProxy.getMaximumCompactionThreshold());
+        }
+    }
+
+    @Command(name = "getcompactionthroughput", description = "Print the MB/s throughput cap for compaction in the system")
+    public static class GetCompactionThroughput extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("Current compaction throughput: " + probe.getCompactionThroughput() + " MB/s");
+        }
+    }
+
+    @Command(name = "getstreamthroughput", description = "Print the Mb/s throughput cap for streaming in the system")
+    public static class GetStreamThroughput extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("Current stream throughput: " + probe.getStreamThroughput() + " Mb/s");
+        }
+    }
+
+    @Command(name = "getendpoints", description = "Print the end points that owns the key")
+    public static class GetEndpoints extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname> <key>", description = "The keyspace, the column family, and the key for which we need to find the endpoint")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 3, "getendpoints requires ks, cf and key args");
+            String ks = args.get(0);
+            String cf = args.get(1);
+            String key = args.get(2);
+
+            List<InetAddress> endpoints = probe.getEndpoints(ks, cf, key);
+            for (InetAddress endpoint : endpoints)
+            {
+                System.out.println(endpoint.getHostAddress());
+            }
+        }
+    }
+
+    @Command(name = "getsstables", description = "Print the sstable filenames that own the key")
+    public static class GetSSTables extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname> <key>", description = "The keyspace, the column family, and the key")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 3, "getsstables requires ks, cf and key args");
+            String ks = args.get(0);
+            String cf = args.get(1);
+            String key = args.get(2);
+
+            List<String> sstables = probe.getSSTables(ks, cf, key);
+            for (String sstable : sstables)
+            {
+                System.out.println(sstable);
+            }
+        }
+    }
+
+    @Command(name = "gossipinfo", description = "Shows the gossip information for the cluster")
+    public static class GossipInfo extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println(probe.getGossipInfo());
+        }
+    }
+
+    @Command(name = "invalidatekeycache", description = "Invalidate the key cache")
+    public static class InvalidateKeyCache extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.invalidateKeyCache();
+        }
+    }
+
+    @Command(name = "invalidaterowcache", description = "Invalidate the row cache")
+    public static class InvalidateRowCache extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.invalidateRowCache();
+        }
+    }
+
+    @Command(name = "invalidatecountercache", description = "Invalidate the counter cache")
+    public static class InvalidateCounterCache extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.invalidateCounterCache();
+        }
+    }
+
+    @Command(name = "join", description = "Join the ring")
+    public static class Join extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkState(!probe.isJoined(), "This node has already joined the ring.");
+
+            try
+            {
+                probe.joinRing();
+            } catch (IOException e)
+            {
+                throw new RuntimeException("Error during joining the ring", e);
+            }
+        }
+    }
+
+    @Command(name = "move", description = "Move node on the token ring to a new token")
+    public static class Move extends NodeToolCmd
+    {
+        @Arguments(usage = "<new token>", description = "The new token.", required = true)
+        private String newToken = EMPTY;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                probe.move(newToken);
+            } catch (IOException e)
+            {
+                throw new RuntimeException("Error during moving node", e);
+            }
+        }
+    }
+
+
+    @Command(name = "pausehandoff", description = "Pause hints delivery process")
+    public static class PauseHandoff extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.pauseHintsDelivery();
+        }
+    }
+
+    @Command(name = "resumehandoff", description = "Resume hints delivery process")
+    public static class ResumeHandoff extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.resumeHintsDelivery();
+        }
+    }
+
+
+    @Command(name = "proxyhistograms", description = "Print statistic histograms for network operations")
+    public static class ProxyHistograms extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            String[] percentiles = new String[]{"50%", "75%", "95%", "98%", "99%", "Min", "Max"};
+            double[] readLatency = probe.metricPercentilesAsArray(probe.getProxyMetric("Read"));
+            double[] writeLatency = probe.metricPercentilesAsArray(probe.getProxyMetric("Write"));
+            double[] rangeLatency = probe.metricPercentilesAsArray(probe.getProxyMetric("RangeSlice"));
+
+            System.out.println("proxy histograms");
+            System.out.println(format("%-10s%18s%18s%18s",
+                    "Percentile", "Read Latency", "Write Latency", "Range Latency"));
+            System.out.println(format("%-10s%18s%18s%18s",
+                    "", "(micros)", "(micros)", "(micros)"));
+            for (int i = 0; i < percentiles.length; i++)
+            {
+                System.out.println(format("%-10s%18.2f%18.2f%18.2f",
+                        percentiles[i],
+                        readLatency[i],
+                        writeLatency[i],
+                        rangeLatency[i]));
+            }
+            System.out.println();
+        }
+    }
+
+    @Command(name = "rebuild", description = "Rebuild data by streaming from other nodes (similarly to bootstrap)")
+    public static class Rebuild extends NodeToolCmd
+    {
+        @Arguments(usage = "<src-dc-name>", description = "Name of DC from which to select sources for streaming. By default, pick any DC")
+        private String sourceDataCenterName = null;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.rebuild(sourceDataCenterName);
+        }
+    }
+
+    @Command(name = "refresh", description = "Load newly placed SSTables to the system without restart")
+    public static class Refresh extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname>", description = "The keyspace and column family name")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 2, "refresh requires ks and cf args");
+            probe.loadNewSSTables(args.get(0), args.get(1));
+        }
+    }
+
+    @Deprecated
+    @Command(name = "removetoken", description = "DEPRECATED (see removenode)", hidden = true)
+    public static class RemoveToken extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.err.println("Warn: removetoken is deprecated, please use removenode instead");
+        }
+    }
+
+    @Command(name = "removenode", description = "Show status of current node removal, force completion of pending removal or remove provided ID")
+    public static class RemoveNode extends NodeToolCmd
+    {
+        @Arguments(title = "remove_operation", usage = "<status>|<force>|<ID>", description = "Show status of current node removal, force completion of pending removal, or remove provided ID", required = true)
+        private String removeOperation = EMPTY;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            switch (removeOperation)
+            {
+                case "status":
+                    System.out.println("RemovalStatus: " + probe.getRemovalStatus());
+                    break;
+                case "force":
+                    System.out.println("RemovalStatus: " + probe.getRemovalStatus());
+                    probe.forceRemoveCompletion();
+                    break;
+                default:
+                    probe.removeNode(removeOperation);
+                    break;
+            }
+        }
+    }
+
+    @Command(name = "repair", description = "Repair one or more column families")
+    public static class Repair extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace> <cfnames>...]", description = "The keyspace followed by one or many column families")
+        private List<String> args = new ArrayList<>();
+
+        @Option(title = "parallel", name = {"-par", "--parallel"}, description = "Use -par to carry out a parallel repair")
+        private boolean parallel = false;
+
+        @Option(title = "local_dc", name = {"-local", "--in-local-dc"}, description = "Use -local to only repair against nodes in the same datacenter")
+        private boolean localDC = false;
+
+        @Option(title = "specific_dc", name = {"-dc", "--in-dc"}, description = "Use -dc to repair specific datacenters")
+        private List<String> specificDataCenters = new ArrayList<>();
+
+        @Option(title = "specific_host", name = {"-hosts", "--in-hosts"}, description = "Use -hosts to repair specific hosts")
+        private List<String> specificHosts = new ArrayList<>();
+
+        @Option(title = "start_token", name = {"-st", "--start-token"}, description = "Use -st to specify a token at which the repair range starts")
+        private String startToken = EMPTY;
+
+        @Option(title = "end_token", name = {"-et", "--end-token"}, description = "Use -et to specify a token at which repair range ends")
+        private String endToken = EMPTY;
+
+        @Option(title = "primary_range", name = {"-pr", "--partitioner-range"}, description = "Use -pr to repair only the first range returned by the partitioner")
+        private boolean primaryRange = false;
+
+        @Option(title = "incremental_repair", name = {"-inc", "--incremental"}, description = "Use -inc to use the new incremental repair")
+        private boolean incrementalRepair = false;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            List<String> keyspaces = parseOptionalKeyspace(args, probe);
+            String[] cfnames = parseOptionalColumnFamilies(args);
+
+            if (primaryRange && (!specificDataCenters.isEmpty() || !specificHosts.isEmpty()))
+                throw new RuntimeException("Primary range repair should be performed on all nodes in the cluster.");
+
+            for (String keyspace : keyspaces)
+            {
+                try
+                {
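+                    // -dc takes precedence over -local; -hosts is only honoured when no datacenter filter was given.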
+                    Collection<String> dataCenters = null;
+                    Collection<String> hosts = null;
+                    if (!specificDataCenters.isEmpty())
+                        dataCenters = newArrayList(specificDataCenters);
+                    else if (localDC)
+                        dataCenters = newArrayList(probe.getDataCenter());
+                    else if (!specificHosts.isEmpty())
+                        hosts = newArrayList(specificHosts);
+                    if (!startToken.isEmpty() || !endToken.isEmpty())
+                        probe.forceRepairRangeAsync(System.out, keyspace, !parallel, dataCenters, hosts, startToken, endToken, !incrementalRepair);
+                    else
+                        probe.forceRepairAsync(System.out, keyspace, !parallel, dataCenters, hosts, primaryRange, !incrementalRepair, cfnames);
+                } catch (Exception e)
+                {
+                    throw new RuntimeException("Error occurred during repair", e);
+                }
+            }
+        }
+    }
+
+    @Command(name = "setcachecapacity", description = "Set global key, row, and counter cache capacities (in MB units)")
+    public static class SetCacheCapacity extends NodeToolCmd
+    {
+        @Arguments(title = "<key-cache-capacity> <row-cache-capacity> <counter-cache-capacity>",
+                   usage = "<key-cache-capacity> <row-cache-capacity> <counter-cache-capacity>",
+                   description = "Key cache, row cache, and counter cache (in MB)",
+                   required = true)
+        private List<Integer> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 3, "setcachecapacity requires key-cache-capacity, row-cache-capacity, and counter-cache-capacity args.");
+            probe.setCacheCapacities(args.get(0), args.get(1), args.get(2));
+        }
+    }
+
+    @Command(name = "setcompactionthreshold", description = "Set min and max compaction thresholds for a given column family")
+    public static class SetCompactionThreshold extends NodeToolCmd
+    {
+        @Arguments(title = "<keyspace> <cfname> <minthreshold> <maxthreshold>", usage = "<keyspace> <cfname> <minthreshold> <maxthreshold>", description = "The keyspace, the column family, min and max threshold", required = true)
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 4, "setcompactionthreshold requires ks, cf, min, and max threshold args.");
+
+            int minthreshold = parseInt(args.get(2));
+            int maxthreshold = parseInt(args.get(3));
+            checkArgument(minthreshold >= 0 && maxthreshold >= 0, "Thresholds must be positive integers");
+            checkArgument(minthreshold <= maxthreshold, "Min threshold cannot be greater than max.");
+            checkArgument(minthreshold >= 2 || maxthreshold == 0, "Min threshold must be at least 2");
+
+            probe.setCompactionThreshold(args.get(0), args.get(1), minthreshold, maxthreshold);
+        }
+    }
+
+    @Command(name = "setcompactionthroughput", description = "Set the MB/s throughput cap for compaction in the system, or 0 to disable throttling")
+    public static class SetCompactionThroughput extends NodeToolCmd
+    {
+        @Arguments(title = "compaction_throughput", usage = "<value_in_mb>", description = "Value in MB, 0 to disable throttling", required = true)
+        private Integer compactionThroughput = null;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.setCompactionThroughput(compactionThroughput);
+        }
+    }
+
+    @Command(name = "sethintedhandoffthrottlekb", description =  "Set hinted handoff throttle in kb per second, per delivery thread.")
+    public static class SetHintedHandoffThrottleInKB extends NodeToolCmd
+    {
+        @Arguments(title = "throttle_in_kb", usage = "<value_in_kb_per_sec>", description = "Value in KB per second", required = true)
+        private Integer throttleInKB = null;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.setHintedHandoffThrottleInKB(throttleInKB);
+        }
+    }
+
+    @Command(name = "setstreamthroughput", description = "Set the Mb/s throughput cap for streaming in the system, or 0 to disable throttling")
+    public static class SetStreamThroughput extends NodeToolCmd
+    {
+        @Arguments(title = "stream_throughput", usage = "<value_in_mb>", description = "Value in Mb, 0 to disable throttling", required = true)
+        private Integer streamThroughput = null;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.setStreamThroughput(streamThroughput);
+        }
+    }
+
+    @Command(name = "settraceprobability", description = "Sets the probability for tracing any given request to value. 0 disables, 1 enables for all requests, 0 is the default")
+    public static class SetTraceProbability extends NodeToolCmd
+    {
+        @Arguments(title = "trace_probability", usage = "<value>", description = "Trace probability between 0 and 1 (ex: 0.2)", required = true)
+        private Double traceProbability = null;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(traceProbability >= 0 && traceProbability <= 1, "Trace probability must be between 0 and 1");
+            probe.setTraceProbability(traceProbability);
+        }
+    }
+
+    @Command(name = "snapshot", description = "Take a snapshot of specified keyspaces or a snapshot of the specified column family")
+    public static class Snapshot extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspaces...>]", description = "List of keyspaces. By default, all keyspaces")
+        private List<String> keyspaces = new ArrayList<>();
+
+        @Option(title = "cfname", name = {"-cf", "--column-family"}, description = "The column family name (you must specify one and only one keyspace for using this option)")
+        private String columnFamily = null;
+
+        @Option(title = "tag", name = {"-t", "--tag"}, description = "The name of the snapshot")
+        private String snapshotName = Long.toString(System.currentTimeMillis());
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                StringBuilder sb = new StringBuilder();
+
+                sb.append("Requested creating snapshot(s) for ");
+
+                if (keyspaces.isEmpty())
+                    sb.append("[all keyspaces]");
+                else
+                    sb.append("[").append(join(keyspaces, ", ")).append("]");
+
+                if (!snapshotName.isEmpty())
+                    sb.append(" with snapshot name [").append(snapshotName).append("]");
+
+                System.out.println(sb.toString());
+
+                probe.takeSnapshot(snapshotName, columnFamily, toArray(keyspaces, String.class));
+                System.out.println("Snapshot directory: " + snapshotName);
+            } catch (IOException e)
+            {
+                throw new RuntimeException("Error during taking a snapshot", e);
+            }
+        }
+    }
+
+    @Command(name = "listsnapshots", description = "Lists all the snapshots along with the size on disk and true size.")
+    public static class ListSnapshots extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                System.out.println("Snapshot Details: ");
+
+                final Map<String,TabularData> snapshotDetails = probe.getSnapshotDetails();
+                if (snapshotDetails.isEmpty())
+                {
+                    System.out.printf("There are no snapshots");
+                    return;
+                }
+
+                final long trueSnapshotsSize = probe.trueSnapshotsSize();
+                final String format = "%-20s%-29s%-29s%-19s%-19s%n";
+                // display column names only once
+                final List<String> indexNames = snapshotDetails.entrySet().iterator().next().getValue().getTabularType().getIndexNames();
+                System.out.printf(format, (Object[]) indexNames.toArray(new String[indexNames.size()]));
+
+                for (final Map.Entry<String, TabularData> snapshotDetail : snapshotDetails.entrySet())
+                {
+                    Set<?> values = snapshotDetail.getValue().keySet();
+                    for (Object eachValue : values)
+                    {
+                        final List<?> value = (List<?>) eachValue;
+                        System.out.printf(format, value.toArray(new Object[value.size()]));
+                    }
+                }
+
+                System.out.println("\nTotal TrueDiskSpaceUsed: " + FileUtils.stringifyFileSize(trueSnapshotsSize) + "\n");
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException("Error during list snapshot", e);
+            }
+        }
+    }
+
+    @Command(name = "status", description = "Print cluster information (state, load, IDs, ...)")
+    public static class Status extends NodeToolCmd
+    {
+        @Arguments(usage = "[<keyspace>]", description = "The keyspace name")
+        private String keyspace = null;
+
+        @Option(title = "resolve_ip", name = {"-r", "--resolve-ip"}, description = "Show node domain names instead of IPs")
+        private boolean resolveIp = false;
+
+        private boolean hasEffectiveOwns = false;
+        private boolean isTokenPerNode = true;
+        private int maxAddressLength = 0;
+        private String format = null;
+        private Collection<String> joiningNodes, leavingNodes, movingNodes, liveNodes, unreachableNodes;
+        private Map<String, String> loadMap, hostIDMap, tokensToEndpoints;
+        private EndpointSnitchInfoMBean epSnitchInfo;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            joiningNodes = probe.getJoiningNodes();
+            leavingNodes = probe.getLeavingNodes();
+            movingNodes = probe.getMovingNodes();
+            loadMap = probe.getLoadMap();
+            tokensToEndpoints = probe.getTokenToEndpointMap();
+            liveNodes = probe.getLiveNodes();
+            unreachableNodes = probe.getUnreachableNodes();
+            hostIDMap = probe.getHostIdMap();
+            epSnitchInfo = probe.getEndpointSnitchInfoProxy();
+            
+            StringBuilder errors = new StringBuilder();
+
+            Map<InetAddress, Float> ownerships;
+            try
+            {
+                ownerships = probe.effectiveOwnership(keyspace);
+                hasEffectiveOwns = true;
+            } catch (IllegalStateException e)
+            {
+                ownerships = probe.getOwnership();
+                errors.append("Note: " + e.getMessage() + "%n");
+            }
+            catch (IllegalArgumentException ex)
+            {
+                System.out.printf("%nError: " + ex.getMessage() + "%n");
+                return;
+            }
+
+            Map<String, SetHostStat> dcs = getOwnershipByDc(probe, resolveIp, tokensToEndpoints, ownerships);
+
+            // More tokens than nodes (aka vnodes)?
+            if (dcs.values().size() < tokensToEndpoints.keySet().size())
+                isTokenPerNode = false;
+
+            findMaxAddressLength(dcs);
+
+            // Datacenters
+            for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
+            {
+                String dcHeader = String.format("Datacenter: %s%n", dc.getKey());
+                System.out.printf(dcHeader);
+                for (int i = 0; i < (dcHeader.length() - 1); i++) System.out.print('=');
+                System.out.println();
+
+                // Legend
+                System.out.println("Status=Up/Down");
+                System.out.println("|/ State=Normal/Leaving/Joining/Moving");
+
+                printNodesHeader(hasEffectiveOwns, isTokenPerNode);
+
+                ArrayListMultimap<InetAddress, HostStat> hostToTokens = ArrayListMultimap.create();
+                for (HostStat stat : dc.getValue())
+                    hostToTokens.put(stat.endpoint, stat);
+
+                for (InetAddress endpoint : hostToTokens.keySet())
+                {
+                    Float owns = ownerships.get(endpoint);
+                    List<HostStat> tokens = hostToTokens.get(endpoint);
+                    printNode(endpoint.getHostAddress(), owns, tokens, hasEffectiveOwns, isTokenPerNode);
+                }
+            }
+            
+            System.out.printf("%n" + errors.toString());
+            
+        }
+
+        private void findMaxAddressLength(Map<String, SetHostStat> dcs)
+        {
+            maxAddressLength = 0;
+            for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
+            {
+                for (HostStat stat : dc.getValue())
+                {
+                    maxAddressLength = Math.max(maxAddressLength, stat.ipOrDns().length());
+                }
+            }
+        }
+
+        private void printNodesHeader(boolean hasEffectiveOwns, boolean isTokenPerNode)
+        {
+            String fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
+            String owns = hasEffectiveOwns ? "Owns (effective)" : "Owns";
+
+            if (isTokenPerNode)
+                System.out.printf(fmt, "-", "-", "Address", "Load", owns, "Host ID", "Token", "Rack");
+            else
+                System.out.printf(fmt, "-", "-", "Address", "Load", "Tokens", owns, "Host ID", "Rack");
+        }
+
+        private void printNode(String endpoint, Float owns, List<HostStat> tokens, boolean hasEffectiveOwns, boolean isTokenPerNode)
+        {
+            String status, state, load, strOwns, hostID, rack, fmt;
+            fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
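+            // Single-letter codes matching the legend printed above: status U/D/?, state J/L/M/N.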
+            if (liveNodes.contains(endpoint)) status = "U";
+            else if (unreachableNodes.contains(endpoint)) status = "D";
+            else status = "?";
+            if (joiningNodes.contains(endpoint)) state = "J";
+            else if (leavingNodes.contains(endpoint)) state = "L";
+            else if (movingNodes.contains(endpoint)) state = "M";
+            else state = "N";
+
+            load = loadMap.containsKey(endpoint) ? loadMap.get(endpoint) : "?";
+            strOwns = owns != null && hasEffectiveOwns ? new DecimalFormat("##0.0%").format(owns) : "?";
+            hostID = hostIDMap.get(endpoint);
+
+            try
+            {
+                rack = epSnitchInfo.getRack(endpoint);
+            } catch (UnknownHostException e)
+            {
+                throw new RuntimeException(e);
+            }
+
+            String endpointDns = tokens.get(0).ipOrDns();
+            if (isTokenPerNode)
+                System.out.printf(fmt, status, state, endpointDns, load, strOwns, hostID, tokens.get(0).token, rack);
+            else
+                System.out.printf(fmt, status, state, endpointDns, load, tokens.size(), strOwns, hostID, rack);
+        }
+
+        private String getFormat(
+                boolean hasEffectiveOwns,
+                boolean isTokenPerNode)
+        {
+            if (format == null)
+            {
+                StringBuilder buf = new StringBuilder();
+                String addressPlaceholder = String.format("%%-%ds  ", maxAddressLength);
+                buf.append("%s%s  ");                         // status
+                buf.append(addressPlaceholder);               // address
+                buf.append("%-9s  ");                         // load
+                if (!isTokenPerNode)
+                    buf.append("%-6s  ");                     // "Tokens"
+                if (hasEffectiveOwns)
+                    buf.append("%-16s  ");                    // "Owns (effective)"
+                else
+                    buf.append("%-6s  ");                     // "Owns
+                buf.append("%-36s  ");                        // Host ID
+                if (isTokenPerNode)
+                    buf.append("%-39s  ");                    // token
+                buf.append("%s%n");                           // "Rack"
+
+                format = buf.toString();
+            }
+
+            return format;
+        }
+    }
+
+    private static Map<String, SetHostStat> getOwnershipByDc(NodeProbe probe, boolean resolveIp, 
+                                                             Map<String, String> tokenToEndpoint,
+                                                             Map<InetAddress, Float> ownerships)
+    {
+        Map<String, SetHostStat> ownershipByDc = Maps.newLinkedHashMap();
+        EndpointSnitchInfoMBean epSnitchInfo = probe.getEndpointSnitchInfoProxy();
+        try
+        {
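+            // Group each token's endpoint by datacenter, accumulating per-host ownership stats.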
+            for (Entry<String, String> tokenAndEndPoint : tokenToEndpoint.entrySet())
+            {
+                String dc = epSnitchInfo.getDatacenter(tokenAndEndPoint.getValue());
+                if (!ownershipByDc.containsKey(dc))
+                    ownershipByDc.put(dc, new SetHostStat(resolveIp));
+                ownershipByDc.get(dc).add(tokenAndEndPoint.getKey(), tokenAndEndPoint.getValue(), ownerships);
+            }
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException(e);
+        }
+        return ownershipByDc;
+    }
+
+    static class SetHostStat implements Iterable<HostStat>
+    {
+        final List<HostStat> hostStats = new ArrayList<HostStat>();
+        final boolean resolveIp;
+
+        public SetHostStat(boolean resolveIp)
+        {
+            this.resolveIp = resolveIp;
+        }
+
+        public int size()
+        {
+            return hostStats.size();
+        }
+
+        @Override
+        public Iterator<HostStat> iterator()
+        {
+            return hostStats.iterator();
+        }
+
+        public void add(String token, String host, Map<InetAddress, Float> ownerships) throws UnknownHostException
+        {
+            InetAddress endpoint = InetAddress.getByName(host);
+            Float owns = ownerships.get(endpoint);
+            hostStats.add(new HostStat(token, endpoint, resolveIp, owns));
+        }
+    }
+
+    static class HostStat
+    {
+        public final InetAddress endpoint;
+        public final boolean resolveIp;
+        public final Float owns;
+        public final String token;
+
+        public HostStat(String token, InetAddress endpoint, boolean resolveIp, Float owns)
+        {
+            this.token = token;
+            this.endpoint = endpoint;
+            this.resolveIp = resolveIp;
+            this.owns = owns;
+        }
+
+        public String ipOrDns()
+        {
+            return resolveIp ? endpoint.getHostName() : endpoint.getHostAddress();
+        }
+    }
+
+    @Command(name = "statusbinary", description = "Status of native transport (binary protocol)")
+    public static class StatusBinary extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println(
+                    probe.isNativeTransportRunning()
+                    ? "running"
+                    : "not running");
+        }
+    }
+
+    @Command(name = "statusthrift", description = "Status of thrift server")
+    public static class StatusThrift extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println(
+                    probe.isThriftServerRunning()
+                    ? "running"
+                    : "not running");
+        }
+    }
+
+    @Command(name = "stop", description = "Stop compaction")
+    public static class Stop extends NodeToolCmd
+    {
+        @Arguments(title = "compaction_type", usage = "<compaction type>", description = "Supported types are COMPACTION, VALIDATION, CLEANUP, SCRUB, INDEX_BUILD", required = true)
+        private OperationType compactionType = OperationType.UNKNOWN;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.stop(compactionType.name());
+        }
+    }
+
+    @Command(name = "stopdaemon", description = "Stop cassandra daemon")
+    public static class StopDaemon extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                probe.stopCassandraDaemon();
+            } catch (Exception ignored)
+            {
+                // ignored
+            }
+            System.out.println("Cassandra has shutdown.");
+        }
+    }
+
+    @Command(name = "version", description = "Print cassandra version")
+    public static class Version extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("ReleaseVersion: " + probe.getReleaseVersion());
+        }
+    }
+
+    @Command(name = "describering", description = "Shows the token ranges info of a given keyspace")
+    public static class DescribeRing extends NodeToolCmd
+    {
+        @Arguments(description = "The keyspace name", required = true)
+        String keyspace = EMPTY;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("Schema Version:" + probe.getSchemaVersion());
+            System.out.println("TokenRange: ");
+            try
+            {
+                for (String tokenRangeString : probe.describeRing(keyspace))
+                {
+                    System.out.println("\t" + tokenRangeString);
+                }
+            } catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Command(name = "rangekeysample", description = "Shows the sampled keys held across all keyspaces")
+    public static class RangeKeySample extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.println("RangeKeySample: ");
+            List<String> tokenStrings = probe.sampleKeyRange();
+            for (String tokenString : tokenStrings)
+            {
+                System.out.println("\t" + tokenString);
+            }
+        }
+    }
+
+    @Command(name = "rebuild_index", description = "A full rebuild of native secondary indexes for a given column family")
+    public static class RebuildIndex extends NodeToolCmd
+    {
+        @Arguments(usage = "<keyspace> <cfname> <indexName...>", description = "The keyspace and column family name followed by a list of index names (IndexNameExample: Standard3.IdxName Standard3.IdxName1)")
+        List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() >= 3, "rebuild_index requires ks, cf and idx args");
+            probe.rebuildIndex(args.get(0), args.get(1), toArray(args.subList(2, args.size()), String.class));
+        }
+    }
+
+    @Command(name = "resetlocalschema", description = "Reset node's local schema and resync")
+    public static class ResetLocalSchema extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                probe.resetLocalSchema();
+            } catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Command(name = "enablebackup", description = "Enable incremental backup")
+    public static class EnableBackup extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.setIncrementalBackupsEnabled(true);
+        }
+    }
+
+    @Command(name = "disablebackup", description = "Disable incremental backup")
+    public static class DisableBackup extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.setIncrementalBackupsEnabled(false);
+        }
+    }
+
+    @Command(name = "setcachekeystosave", description = "Set number of keys saved by each cache for faster post-restart warmup. 0 to disable")
+    public static class SetCacheKeysToSave extends NodeToolCmd
+    {
+        @Arguments(title = "<key-cache-keys-to-save> <row-cache-keys-to-save> <counter-cache-keys-to-save>",
+                   usage = "<key-cache-keys-to-save> <row-cache-keys-to-save> <counter-cache-keys-to-save>",
+                   description = "The number of keys saved by each cache. 0 to disable",
+                   required = true)
+        private List<Integer> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            checkArgument(args.size() == 3, "setcachekeystosave requires key-cache-keys-to-save, row-cache-keys-to-save, and counter-cache-keys-to-save args.");
+            probe.setCacheKeysToSave(args.get(0), args.get(1), args.get(2));
+        }
+    }
+
+    @Command(name = "reloadtriggers", description = "Reload trigger classes")
+    public static class ReloadTriggers extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.reloadTriggers();
+        }
+    }
+
+    @Command(name = "disablehandoff", description = "Disable gossip (effectively marking the node down)")
+    public static class DisableHandoff extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.disableHintedHandoff();
+        }
+    }
+
+    @Command(name = "disablethrift", description = "Disable thrift server")
+    public static class DisableThrift extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            probe.stopThriftServer();
+        }
+    }
+
+    @Command(name = "drain", description = "Drain the node (stop accepting writes and flush all column families)")
+    public static class Drain extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            try
+            {
+                probe.drain();
+            } catch (IOException | InterruptedException | ExecutionException e)
+            {
+                throw new RuntimeException("Error occured during flushing", e);
+            }
+        }
+    }
+
+    @Command(name = "tpstats", description = "Print usage statistics of thread pools")
+    public static class TpStats extends NodeTool.NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            System.out.printf("%-25s%10s%10s%15s%10s%18s%n", "Pool Name", "Active", "Pending", "Completed", "Blocked", "All time blocked");
+
+            Iterator<Map.Entry<String, JMXEnabledThreadPoolExecutorMBean>> threads = probe.getThreadPoolMBeanProxies();
+            while (threads.hasNext())
+            {
+                Map.Entry<String, JMXEnabledThreadPoolExecutorMBean> thread = threads.next();
+                String poolName = thread.getKey();
+                JMXEnabledThreadPoolExecutorMBean threadPoolProxy = thread.getValue();
+                System.out.printf("%-25s%10s%10s%15s%10s%18s%n",
+                        poolName,
+                        threadPoolProxy.getActiveCount(),
+                        threadPoolProxy.getPendingTasks(),
+                        threadPoolProxy.getCompletedTasks(),
+                        threadPoolProxy.getCurrentlyBlockedTasks(),
+                        threadPoolProxy.getTotalBlockedTasks());
+            }
+
+            System.out.printf("%n%-20s%10s%n", "Message type", "Dropped");
+            for (Map.Entry<String, Integer> entry : probe.getDroppedMessages().entrySet())
+                System.out.printf("%-20s%10s%n", entry.getKey(), entry.getValue());
+        }
+    }
+
+    @Command(name = "gcstats", description = "Print GC Statistics")
+    public static class GcStats extends NodeTool.NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
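+            // Mean pause is total GC elapsed / collections; the stdev term treats stats[3] as the sum of squared pause times (sqrt(E[x^2] - mean^2)).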
+            double[] stats = probe.getAndResetGCStats();
+            double mean = stats[2] / stats[5];
+            double stdev = Math.sqrt((stats[3] / stats[5]) - (mean * mean));
+            System.out.printf("%20s%20s%20s%20s%20s%20s%n", "Interval (ms)", "Max GC Elapsed (ms)", "Total GC Elapsed (ms)", "Stdev GC Elapsed (ms)", "GC Reclaimed (MB)", "Collections");
+            System.out.printf("%20.0f%20.0f%20.0f%20.0f%20.0f%20.0f%n", stats[0], stats[1], stats[2], stdev, stats[4], stats[5]);
+        }
+    }
+
+    @Command(name = "truncatehints", description = "Truncate all hints on the local node, or truncate hints for the endpoint(s) specified.")
+    public static class TruncateHints extends NodeToolCmd
+    {
+        @Arguments(usage = "[endpoint ... ]", description = "Endpoint address(es) to delete hints for, either ip address (\"127.0.0.1\") or hostname")
+        private String endpoint = EMPTY;
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            if (endpoint.isEmpty())
+                probe.truncateHints();
+            else
+                probe.truncateHints(endpoint);
+        }
+    }
+    
+    @Command(name = "setlogginglevel", description = "Set the log level threshold for a given class. If both class and level are empty/null, it will reset to the initial configuration")
+    public static class SetLoggingLevel extends NodeToolCmd
+    {
+        @Arguments(usage = "<class> <level>", description = "The class to change the level for and the log level threshold to set (can be empty)")
+        private List<String> args = new ArrayList<>();
+
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            String classQualifier = args.size() >= 1 ? args.get(0) : EMPTY;
+            String level = args.size() == 2 ? args.get(1) : EMPTY;
+            probe.setLoggingLevel(classQualifier, level);
+        }
+    }
+    
+    @Command(name = "getlogginglevels", description = "Get the runtime logging levels")
+    public static class GetLoggingLevels extends NodeToolCmd
+    {
+        @Override
+        public void execute(NodeProbe probe)
+        {
+            // If someone sets a very long logger name, 50 spaces may not be enough...
+            System.out.printf("%n%-50s%10s%n", "Logger Name", "Log Level");
+            for (Map.Entry<String, String> entry : probe.getLoggingLevels().entrySet())
+                System.out.printf("%-50s%10s%n", entry.getKey(), entry.getValue());
+        }
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/tools/SSTableExport.java b/src/java/org/apache/cassandra/tools/SSTableExport.java
index f8b85c3..cc725a4 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExport.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExport.java
@@ -20,7 +20,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import org.apache.commons.cli.*;
@@ -29,6 +28,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -38,9 +38,6 @@
 import org.codehaus.jackson.JsonGenerator;
 import org.codehaus.jackson.map.ObjectMapper;
 
-import static org.apache.cassandra.utils.ByteBufferUtil.bytesToHex;
-import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
-
 /**
  * Export SSTables to JSON format.
  */
@@ -97,67 +94,19 @@
         out.print(": ");
     }
 
-    /**
-     * JSON ColumnFamily metadata serializer.</br> Serializes:
-     * <ul>
-     * <li>column family deletion info (if present)</li>
-     * </ul>
-     *
-     * @param out The output steam to write data
-     * @param deletionInfo
-     */
-    private static void writeMeta(PrintStream out, DeletionInfo deletionInfo)
-    {
-        if (!deletionInfo.isLive())
-        {
-            // begin meta
-            writeKey(out, "metadata");
-            writeDeletionInfo(out, deletionInfo.getTopLevelDeletion());
-            out.print(",");
-        }
-    }
-
-    private static void writeDeletionInfo(PrintStream out, DeletionTime deletionTime)
-    {
-        out.print("{");
-        writeKey(out, "deletionInfo");
-        // only store topLevelDeletion (serializeForSSTable only uses this)
-        writeJSON(out, deletionTime);
-        out.print("}");
-    }
-
-    /**
-     * Serialize columns using given column iterator
-     *
-     * @param atoms      column iterator
-     * @param out        output stream
-     * @param cfMetaData Column Family metadata (to get validator)
-     */
-    private static void serializeAtoms(Iterator<OnDiskAtom> atoms, PrintStream out, CFMetaData cfMetaData)
-    {
-        while (atoms.hasNext())
-        {
-            writeJSON(out, serializeAtom(atoms.next(), cfMetaData));
-
-            if (atoms.hasNext())
-                out.print(", ");
-        }
-    }
-
     private static List<Object> serializeAtom(OnDiskAtom atom, CFMetaData cfMetaData)
     {
-        AbstractType<?> comparator = cfMetaData.comparator;
-        if (atom instanceof Column)
+        if (atom instanceof Cell)
         {
-            return serializeColumn((Column) atom, comparator, cfMetaData);
+            return serializeColumn((Cell) atom, cfMetaData);
         }
         else
         {
             assert atom instanceof RangeTombstone;
             RangeTombstone rt = (RangeTombstone) atom;
             ArrayList<Object> serializedColumn = new ArrayList<Object>();
-            serializedColumn.add(comparator.getString(rt.min));
-            serializedColumn.add(comparator.getString(rt.max));
+            serializedColumn.add(cfMetaData.comparator.getString(rt.min));
+            serializedColumn.add(cfMetaData.comparator.getString(rt.max));
             serializedColumn.add(rt.data.markedForDeleteAt);
             serializedColumn.add("t");
             serializedColumn.add(rt.data.localDeletionTime);
@@ -166,46 +115,55 @@
     }
 
     /**
-     * Serialize a given column to the JSON format
+     * Serialize a given cell to a List of Objects that jsonMapper knows how to turn into strings.  Format is
      *
-     * @param column     column presentation
-     * @param comparator columns comparator
+     * human_readable_name, value, timestamp, [flag, [options]]
+     *
+     * Value is normally the human readable value as rendered by the validator, but for deleted cells we
+     * give the local deletion time instead.
+     *
+     * Flag may be exactly one of {d,e,c} for deleted, expiring, or counter:
+     *  - No options for deleted cells
+     *  - If expiring, options will include the TTL and local deletion time.
+     *  - If counter, options will include timestamp of last delete
+     *
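+     * For example (illustrative values only), an expiring cell might serialize as
+     * ["col1", "val1", 1412345678000000, "e", 3600, 1412349278], and a plain live
+     * cell as ["col1", "val1", 1412345678000000].
+     *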
+     * @param cell       the cell to serialize
      * @param cfMetaData Column Family metadata (to get validator)
-     * @return column as serialized list
+     * @return cell as serialized list
      */
-    private static List<Object> serializeColumn(Column column, AbstractType<?> comparator, CFMetaData cfMetaData)
+    private static List<Object> serializeColumn(Cell cell, CFMetaData cfMetaData)
     {
+        CellNameType comparator = cfMetaData.comparator;
         ArrayList<Object> serializedColumn = new ArrayList<Object>();
 
-        ByteBuffer name = ByteBufferUtil.clone(column.name());
-        ByteBuffer value = ByteBufferUtil.clone(column.value());
+        serializedColumn.add(comparator.getString(cell.name()));
 
-        serializedColumn.add(comparator.getString(name));
-        if (column instanceof DeletedColumn)
+        if (cell instanceof DeletedCell)
         {
-            serializedColumn.add(ByteBufferUtil.bytesToHex(value));
+            serializedColumn.add(cell.getLocalDeletionTime());
         }
         else
         {
-            AbstractType<?> validator = cfMetaData.getValueValidatorFromColumnName(name);
-            serializedColumn.add(validator.getString(value));
+            AbstractType<?> validator = cfMetaData.getValueValidator(cell.name());
+            serializedColumn.add(validator.getString(cell.value()));
         }
-        serializedColumn.add(column.timestamp());
 
-        if (column instanceof DeletedColumn)
+        serializedColumn.add(cell.timestamp());
+
+        if (cell instanceof DeletedCell)
         {
             serializedColumn.add("d");
         }
-        else if (column instanceof ExpiringColumn)
+        else if (cell instanceof ExpiringCell)
         {
             serializedColumn.add("e");
-            serializedColumn.add(((ExpiringColumn) column).getTimeToLive());
-            serializedColumn.add(column.getLocalDeletionTime());
+            serializedColumn.add(((ExpiringCell) cell).getTimeToLive());
+            serializedColumn.add(cell.getLocalDeletionTime());
         }
-        else if (column instanceof CounterColumn)
+        else if (cell instanceof CounterCell)
         {
             serializedColumn.add("c");
-            serializedColumn.add(((CounterColumn) column).timestampOfLastDelete());
+            serializedColumn.add(((CounterCell) cell).timestampOfLastDelete());
         }
 
         return serializedColumn;
@@ -227,17 +185,32 @@
     {
         out.print("{");
         writeKey(out, "key");
-        writeJSON(out, bytesToHex(key.key));
-        out.print(",");
+        writeJSON(out, metadata.getKeyValidator().getString(key.getKey()));
+        out.print(",\n");
 
-        writeMeta(out, deletionInfo);
+        if (!deletionInfo.isLive())
+        {
+            out.print(" ");
+            writeKey(out, "metadata");
+            out.print("{");
+            writeKey(out, "deletionInfo");
+            writeJSON(out, deletionInfo.getTopLevelDeletion());
+            out.print("}");
+            out.print(",\n");
+        }
 
-        writeKey(out, "columns");
+        out.print(" ");
+        writeKey(out, "cells");
         out.print("[");
+        while (atoms.hasNext())
+        {
+            writeJSON(out, serializeAtom(atoms.next(), metadata));
 
-        serializeAtoms(atoms, out, metadata);
-
+            if (atoms.hasNext())
+                out.print(",\n           ");
+        }
         out.print("]");
+
         out.print("}");
     }
 
@@ -246,9 +219,10 @@
      *
      * @param desc the descriptor of the file to export the rows from
      * @param outs PrintStream to write the output to
+     * @param metadata the table metadata, used to render keys in a readable format
      * @throws IOException on failure to read/write input/output
      */
-    public static void enumeratekeys(Descriptor desc, PrintStream outs)
+    public static void enumeratekeys(Descriptor desc, PrintStream outs, CFMetaData metadata)
     throws IOException
     {
         KeyIterator iter = new KeyIterator(desc);
@@ -264,7 +238,7 @@
                     throw new IOException("Key out of order! " + lastKey + " > " + key);
                 lastKey = key;
 
-                outs.println(bytesToHex(key.key));
+                outs.println(metadata.getKeyValidator().getString(key.getKey()));
                 checkStream(outs); // flushes
             }
         }
@@ -281,9 +255,10 @@
      * @param outs     PrintStream to write the output to
      * @param toExport the keys corresponding to the rows to export
      * @param excludes keys to exclude from export
+     * @param metadata the table metadata, used to render keys in a readable format
      * @throws IOException on failure to read/write input/output
      */
-    public static void export(Descriptor desc, PrintStream outs, Collection<String> toExport, String[] excludes) throws IOException
+    public static void export(Descriptor desc, PrintStream outs, Collection<String> toExport, String[] excludes, CFMetaData metadata) throws IOException
     {
         SSTableReader sstable = SSTableReader.open(desc);
         RandomAccessReader dfile = sstable.openDataReader();
@@ -303,7 +278,7 @@
 
             for (String key : toExport)
             {
-                DecoratedKey decoratedKey = partitioner.decorateKey(hexToBytes(key));
+                DecoratedKey decoratedKey = partitioner.decorateKey(metadata.getKeyValidator().fromString(key));
 
                 if (lastKey != null && lastKey.compareTo(decoratedKey) > 0)
                     throw new IOException("Key out of order! " + lastKey + " > " + decoratedKey);
@@ -316,15 +291,9 @@
 
                 dfile.seek(entry.position);
                 ByteBufferUtil.readWithShortLength(dfile); // row key
-                if (sstable.descriptor.version.hasRowSizeAndColumnCount)
-                    dfile.readLong(); // row size
                 DeletionInfo deletionInfo = new DeletionInfo(DeletionTime.serializer.deserialize(dfile));
-                int columnCount = sstable.descriptor.version.hasRowSizeAndColumnCount ? dfile.readInt()
-                        : Integer.MAX_VALUE;
 
-                Iterator<OnDiskAtom> atomIterator = sstable.metadata.getOnDiskIterator(dfile, columnCount,
-                        sstable.descriptor.version);
-
+                Iterator<OnDiskAtom> atomIterator = sstable.metadata.getOnDiskIterator(dfile, sstable.descriptor.version);
                 checkStream(outs);
 
                 if (i != 0)
@@ -344,7 +313,7 @@
 
     // This is necessary to accommodate the test suite since you cannot open a Reader more
     // than once from within the same process.
-    static void export(SSTableReader reader, PrintStream outs, String[] excludes) throws IOException
+    static void export(SSTableReader reader, PrintStream outs, String[] excludes, CFMetaData metadata) throws IOException
     {
         Set<String> excludeSet = new HashSet<String>();
 
@@ -364,7 +333,7 @@
             {
                 row = (SSTableIdentityIterator) scanner.next();
 
-                String currentKey = bytesToHex(row.getKey().key);
+                String currentKey = row.getColumnFamily().metadata().getKeyValidator().getString(row.getKey().getKey());
 
                 if (excludeSet.contains(currentKey))
                     continue;
@@ -392,11 +361,12 @@
      * @param desc     the descriptor of the sstable to read from
      * @param outs     PrintStream to write the output to
      * @param excludes keys to exclude from export
+     * @param metadata the table metadata, used to render keys in a readable format
      * @throws IOException on failure to read/write input/output
      */
-    public static void export(Descriptor desc, PrintStream outs, String[] excludes) throws IOException
+    public static void export(Descriptor desc, PrintStream outs, String[] excludes, CFMetaData metadata) throws IOException
     {
-        export(SSTableReader.open(desc), outs, excludes);
+        export(SSTableReader.open(desc), outs, excludes, metadata);
     }
 
     /**
@@ -404,11 +374,12 @@
      *
      * @param desc     the descriptor of the sstable to read from
      * @param excludes keys to exclude from export
+     * @param metadata the table metadata, used to render keys in a readable format
      * @throws IOException on failure to read/write SSTable/standard out
      */
-    public static void export(Descriptor desc, String[] excludes) throws IOException
+    public static void export(Descriptor desc, String[] excludes, CFMetaData metadata) throws IOException
     {
-        export(desc, System.out, excludes);
+        export(desc, System.out, excludes, metadata);
     }
 
     /**
@@ -460,7 +431,7 @@
         }
         Keyspace keyspace = Keyspace.open(descriptor.ksname);
 
-        // Make it work for indexes too - find parent cf if necessary
+        // Make it work for indexes too - find parent cf if necessary
         String baseName = descriptor.cfname;
         if (descriptor.cfname.contains("."))
         {
@@ -469,9 +440,10 @@
         }
 
         // IllegalArgumentException will be thrown here if ks/cf pair does not exist
+        ColumnFamilyStore cfStore = null;
         try
         {
-            keyspace.getColumnFamilyStore(baseName);
+            cfStore = keyspace.getColumnFamilyStore(baseName);
         }
         catch (IllegalArgumentException e)
         {
@@ -484,14 +456,14 @@
         {
             if (cmd.hasOption(ENUMERATEKEYS_OPTION))
             {
-                enumeratekeys(descriptor, System.out);
+                enumeratekeys(descriptor, System.out, cfStore.metadata);
             }
             else
             {
                 if ((keys != null) && (keys.length > 0))
-                    export(descriptor, System.out, Arrays.asList(keys), excludes);
+                    export(descriptor, System.out, Arrays.asList(keys), excludes, cfStore.metadata);
                 else
-                    export(descriptor, excludes);
+                    export(descriptor, excludes, cfStore.metadata);
             }
         }
         catch (IOException e)
diff --git a/src/java/org/apache/cassandra/tools/SSTableImport.java b/src/java/org/apache/cassandra/tools/SSTableImport.java
index 11bfc81..75ad26f 100644
--- a/src/java/org/apache/cassandra/tools/SSTableImport.java
+++ b/src/java/org/apache/cassandra/tools/SSTableImport.java
@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.tools;
 
-import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
-
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -37,15 +35,16 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.sstable.SSTableWriter;
 import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.codehaus.jackson.JsonFactory;
 import org.codehaus.jackson.JsonParser;
@@ -62,14 +61,12 @@
     private static final String COLUMN_FAMILY_OPTION = "c";
     private static final String KEY_COUNT_OPTION = "n";
     private static final String IS_SORTED_OPTION = "s";
-    private static final String OLD_SC_FORMAT_OPTION = "S";
 
     private static final Options options = new Options();
     private static CommandLine cmd;
 
     private Integer keyCountToImport;
     private final boolean isSorted;
-    private final boolean oldSCFormat;
 
     private static final JsonFactory factory = new MappingJsonFactory().configure(
             JsonParser.Feature.INTERN_FIELD_NAMES, false);
@@ -86,7 +83,6 @@
 
         options.addOption(new Option(KEY_COUNT_OPTION, true, "Number of keys to import (Optional)."));
         options.addOption(new Option(IS_SORTED_OPTION, false, "Assume JSON file as already sorted (e.g. created by sstable2json tool) (Optional)."));
-        options.addOption(new Option(OLD_SC_FORMAT_OPTION, false, "Assume JSON file use legacy super column format (Optional)."));
     }
 
     private static class JsonColumn<T>
@@ -103,65 +99,52 @@
         // Counter columns
         private long timestampOfLastDelete;
 
-        public JsonColumn(T json, CFMetaData meta, boolean oldSCFormat, boolean isSubColumn)
+        public JsonColumn(T json, CFMetaData meta)
         {
             if (json instanceof List)
             {
-                AbstractType<?> comparator = oldSCFormat ? SuperColumns.getComparatorFor(meta, isSubColumn) : meta.comparator;
+                CellNameType comparator = meta.comparator;
                 List fields = (List<?>) json;
 
-                assert fields.size() >= 3 : "Column definition should have at least 3";
+                assert fields.size() >= 3 : "Cell definition should have at least 3 fields";
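+                // e.g. ["col1", "val1", 1412345678000000] for a live cell, or
+                // ["col1", "val1", 1412345678000000, "e", 3600, 1412349278] for an
+                // expiring one (illustrative values only)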
 
-                name  = stringAsType((String) fields.get(0), comparator);
+                name  = stringAsType((String) fields.get(0), comparator.asAbstractType());
                 timestamp = (Long) fields.get(2);
                 kind = "";
 
                 if (fields.size() > 3)
                 {
-                    if (fields.get(3) instanceof Boolean)
+                    kind = (String) fields.get(3);
+                    if (isExpiring())
                     {
-                        // old format, reading this for backward compatibility sake
-                        if (fields.size() == 6)
-                        {
-                            kind = "e";
-                            ttl = (Integer) fields.get(4);
-                            localExpirationTime = (Integer) fields.get(5);
-                        }
-                        else
-                        {
-                            kind = ((Boolean) fields.get(3)) ? "d" : "";
-                        }
+                        ttl = (Integer) fields.get(4);
+                        localExpirationTime = (Integer) fields.get(5);
                     }
-                    else
+                    else if (isCounter())
                     {
-                        kind = (String) fields.get(3);
-                        if (isExpiring())
-                        {
-                            ttl = (Integer) fields.get(4);
-                            localExpirationTime = (Integer) fields.get(5);
-                        }
-                        else if (isCounter())
-                        {
-                            timestampOfLastDelete = (long) ((Integer) fields.get(4));
-                        }
-                        else if (isRangeTombstone())
-                        {
-                            localExpirationTime = (Integer) fields.get(4);
-                        }
+                        timestampOfLastDelete = (long) ((Integer) fields.get(4));
+                    }
+                    else if (isRangeTombstone())
+                    {
+                        localExpirationTime = (Integer) fields.get(4);
                     }
                 }
 
                 if (isDeleted())
                 {
-                    value = ByteBufferUtil.hexToBytes((String) fields.get(1));
+                    value = ByteBufferUtil.bytes((Integer) fields.get(1));
                 }
                 else if (isRangeTombstone())
                 {
-                    value = comparator.fromString((String)fields.get(1));
+                    value = stringAsType((String) fields.get(1), comparator.asAbstractType());
                 }
                 else
                 {
-                    value = stringAsType((String) fields.get(1), meta.getValueValidatorFromColumnName(name));
+                    assert meta.isCQL3Table() || name.hasRemaining() : "Cell name should not be empty";
+                    value = stringAsType((String) fields.get(1), 
+                            meta.getValueValidator(name.hasRemaining() 
+                                    ? comparator.cellFromByteBuffer(name)
+                                    : meta.comparator.rowMarker(Composites.EMPTY)));
                 }
             }
         }
@@ -199,55 +182,53 @@
 
     public SSTableImport()
     {
-        this(null, false, false);
+        this(null, false);
     }
 
     public SSTableImport(boolean isSorted)
     {
-        this(isSorted, false);
+        this(null, isSorted);
     }
 
-    public SSTableImport(boolean isSorted, boolean oldSCFormat)
-    {
-        this(null, isSorted, oldSCFormat);
-    }
-
-    public SSTableImport(Integer keyCountToImport, boolean isSorted, boolean oldSCFormat)
+    public SSTableImport(Integer keyCountToImport, boolean isSorted)
     {
         this.keyCountToImport = keyCountToImport;
         this.isSorted = isSorted;
-        this.oldSCFormat = oldSCFormat;
-    }
-
-    private void addToStandardCF(List<?> row, ColumnFamily cfamily)
-    {
-        addColumnsToCF(row, null, cfamily);
     }
 
     /**
      * Add columns to a column family.
      *
      * @param row the columns associated with a row
-     * @param superName name of the super column if any
      * @param cfamily the column family to add columns to
      */
-    private void addColumnsToCF(List<?> row, ByteBuffer superName, ColumnFamily cfamily)
+    private void addColumnsToCF(List<?> row, ColumnFamily cfamily)
     {
         CFMetaData cfm = cfamily.metadata();
         assert cfm != null;
 
         for (Object c : row)
         {
-            JsonColumn col = new JsonColumn<List>((List) c, cfm, oldSCFormat, (superName != null));
-            ByteBuffer cname = superName == null ? col.getName() : CompositeType.build(superName, col.getName());
+            JsonColumn col = new JsonColumn<List>((List) c, cfm);
+            if (col.isRangeTombstone())
+            {
+                Composite start = cfm.comparator.fromByteBuffer(col.getName());
+                Composite end = cfm.comparator.fromByteBuffer(col.getValue());
+                cfamily.addAtom(new RangeTombstone(start, end, col.timestamp, col.localExpirationTime));
+                continue;
+            }
+            
+            assert cfm.isCQL3Table() || col.getName().hasRemaining() : "Cell name should not be empty";
+            CellName cname = col.getName().hasRemaining() ? cfm.comparator.cellFromByteBuffer(col.getName()) 
+                    : cfm.comparator.rowMarker(Composites.EMPTY);
 
             if (col.isExpiring())
             {
-                cfamily.addColumn(new ExpiringColumn(cname, col.getValue(), col.timestamp, col.ttl, col.localExpirationTime));
+                cfamily.addColumn(new BufferExpiringCell(cname, col.getValue(), col.timestamp, col.ttl, col.localExpirationTime));
             }
             else if (col.isCounter())
             {
-                cfamily.addColumn(new CounterColumn(cname, col.getValue(), col.timestamp, col.timestampOfLastDelete));
+                cfamily.addColumn(new BufferCounterCell(cname, col.getValue(), col.timestamp, col.timestampOfLastDelete));
             }
             else if (col.isDeleted())
             {
@@ -255,13 +236,13 @@
             }
             else if (col.isRangeTombstone())
             {
-                ByteBuffer end = superName == null ? col.getValue() : CompositeType.build(superName, col.getValue());
+                CellName end = cfm.comparator.cellFromByteBuffer(col.getValue());
                 cfamily.addAtom(new RangeTombstone(cname, end, col.timestamp, col.localExpirationTime));
             }
             // cql3 row marker, see CASSANDRA-5852
-            else if (!cname.hasRemaining())
+            else if (cname.isEmpty())
             {
-                cfamily.addColumn(ByteBuffer.wrap(new byte[3]), col.getValue(), col.timestamp);
+                cfamily.addColumn(cfm.comparator.rowMarker(Composites.EMPTY), col.getValue(), col.timestamp);
             }
             else
             {
@@ -288,35 +269,6 @@
     }
 
     /**
-     * Add super columns to a column family.
-     *
-     * @param row the super columns associated with a row
-     * @param cfamily the column family to add columns to
-     */
-    private void addToSuperCF(Map<?, ?> row, ColumnFamily cfamily)
-    {
-        CFMetaData metaData = cfamily.metadata();
-        assert metaData != null;
-
-        AbstractType<?> comparator = metaData.comparator;
-
-        // Super columns
-        for (Map.Entry<?, ?> entry : row.entrySet())
-        {
-            Map<?, ?> data = (Map<?, ?>) entry.getValue();
-
-            ByteBuffer superName = stringAsType((String) entry.getKey(), ((CompositeType)comparator).types.get(0));
-
-            addColumnsToCF((List<?>) data.get("subColumns"), superName, cfamily);
-
-            if (data.containsKey("metadata"))
-            {
-                parseMeta((Map<?, ?>) data.get("metadata"), cfamily, superName);
-            }
-        }
-    }
-
-    /**
      * Convert a JSON formatted file to an SSTable.
      *
      * @param jsonFile the file containing JSON formatted data
@@ -328,7 +280,7 @@
      */
     public int importJson(String jsonFile, String keyspace, String cf, String ssTablePath) throws IOException
     {
-        ColumnFamily columnFamily = TreeMapBackedSortedColumns.factory.create(keyspace, cf);
+        ColumnFamily columnFamily = ArrayBackedSortedColumns.factory.create(keyspace, cf);
         IPartitioner<?> partitioner = DatabaseDescriptor.getPartitioner();
 
         int importedKeys = (isSorted) ? importSorted(jsonFile, columnFamily, ssTablePath, partitioner)
@@ -350,7 +302,7 @@
         Object[] data = parser.readValueAs(new TypeReference<Object[]>(){});
 
         keyCountToImport = (keyCountToImport == null) ? data.length : keyCountToImport;
-        SSTableWriter writer = new SSTableWriter(ssTablePath, keyCountToImport);
+        SSTableWriter writer = new SSTableWriter(ssTablePath, keyCountToImport, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         System.out.printf("Importing %s keys...%n", keyCountToImport);
 
@@ -360,7 +312,7 @@
         for (Object row : data)
         {
             Map<?,?> rowAsMap = (Map<?, ?>)row;
-            decoratedKeys.put(partitioner.decorateKey(hexToBytes((String)rowAsMap.get("key"))), rowAsMap);
+            decoratedKeys.put(partitioner.decorateKey(getKeyValidator(columnFamily).fromString((String) rowAsMap.get("key"))), rowAsMap);
         }
 
         for (Map.Entry<DecoratedKey, Map<?, ?>> row : decoratedKeys.entrySet())
@@ -370,11 +322,8 @@
                 parseMeta((Map<?, ?>) row.getValue().get("metadata"), columnFamily, null);
             }
 
-            Object columns = row.getValue().get("columns");
-            if (columnFamily.getType() == ColumnFamilyType.Super && oldSCFormat)
-                addToSuperCF((Map<?, ?>) columns, columnFamily);
-            else
-                addToStandardCF((List<?>) columns, columnFamily);
+            Object columns = row.getValue().get("cells");
+            addColumnsToCF((List<?>) columns, columnFamily);
 
 
             writer.append(row.getKey(), columnFamily);
@@ -426,7 +375,7 @@
         System.out.printf("Importing %s keys...%n", keyCountToImport);
 
         parser = getParser(jsonFile); // renewing parser
-        SSTableWriter writer = new SSTableWriter(ssTablePath, keyCountToImport);
+        SSTableWriter writer = new SSTableWriter(ssTablePath, keyCountToImport, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         int lineNumber = 1;
         DecoratedKey prevStoredKey = null;
@@ -436,15 +385,12 @@
         {
             String key = parser.getCurrentName();
             Map<?, ?> row = parser.readValueAs(new TypeReference<Map<?, ?>>(){});
-            DecoratedKey currentKey = partitioner.decorateKey(hexToBytes((String) row.get("key")));
+            DecoratedKey currentKey = partitioner.decorateKey(getKeyValidator(columnFamily).fromString((String) row.get("key")));
 
             if (row.containsKey("metadata"))
                 parseMeta((Map<?, ?>) row.get("metadata"), columnFamily, null);
 
-            if (columnFamily.getType() == ColumnFamilyType.Super && oldSCFormat)
-                addToSuperCF((Map<?, ?>)row.get("columns"), columnFamily);
-            else
-                addToStandardCF((List<?>)row.get("columns"), columnFamily);
+            addColumnsToCF((List<?>) row.get("cells"), columnFamily);
 
             if (prevStoredKey != null && prevStoredKey.compareTo(currentKey) != -1)
             {
@@ -481,6 +427,21 @@
     }
 
     /**
+     * Get key validator for column family
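+     * <p>
+     * As an illustrative example, running the tool with {@code -Dskip.key.validator=true}
+     * on the JVM command line falls back to BytesType, so row keys in the JSON are
+     * interpreted as raw hex rather than through the table's key validator.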
+     * @param columnFamily column family instance
+     * @return key validator for given column family
+     */
+    private AbstractType<?> getKeyValidator(ColumnFamily columnFamily)
+    {
+        // For backward compatibility, allow the key validator to be skipped;
+        // see CASSANDRA-7498 for details.
+        if ("true".equals(System.getProperty("skip.key.validator", "false")))
+            return BytesType.instance;
+
+        return columnFamily.metadata().getKeyValidator();
+    }
+
+    /**
      * Get JsonParser object for file
      * @param fileName name of the file
      * @return json parser instance for given file
@@ -528,7 +489,6 @@
 
         Integer keyCountToImport = null;
         boolean isSorted = false;
-        boolean oldSCFormat = false;
 
         if (cmd.hasOption(KEY_COUNT_OPTION))
         {
@@ -540,11 +500,6 @@
             isSorted = true;
         }
 
-        if (cmd.hasOption(OLD_SC_FORMAT_OPTION))
-        {
-            oldSCFormat = true;
-        }
-
         DatabaseDescriptor.loadSchemas();
         if (Schema.instance.getNonSystemKeyspaces().size() < 1)
         {
@@ -555,7 +510,7 @@
 
         try
         {
-           new SSTableImport(keyCountToImport, isSorted, oldSCFormat).importJson(json, keyspace, cfamily, ssTable);
+           new SSTableImport(keyCountToImport, isSorted).importJson(json, keyspace, cfamily, ssTable);
         }
         catch (Exception e)
         {
@@ -590,7 +545,7 @@
     {
         try
         {
-            return (type == BytesType.instance) ? hexToBytes(content) : type.fromString(content);
+            return type.fromString(content);
         }
         catch (MarshalException e)
         {
diff --git a/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java b/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java
index 999f440..2b273c5 100644
--- a/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java
+++ b/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java
@@ -22,12 +22,15 @@
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.compaction.LeveledManifest;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * Reset level to 0 on a given set of sstables
@@ -55,25 +58,55 @@
             System.exit(1);
         }
 
-        String keyspace = args[1];
-        String columnfamily = args[2];
-        Directories directories = Directories.create(keyspace, columnfamily);
-        boolean foundSSTable = false;
-        for (Map.Entry<Descriptor, Set<Component>> sstable : directories.sstableLister().list().entrySet())
+        // TODO several daemon threads will run from here.
+        // So we have to explicitly call System.exit.
+        try
         {
-            if (sstable.getValue().contains(Component.STATS))
+            // load keyspace descriptions.
+            DatabaseDescriptor.loadSchemas();
+
+            String keyspaceName = args[1];
+            String columnfamily = args[2];
+            // validate columnfamily
+            if (Schema.instance.getCFMetaData(keyspaceName, columnfamily) == null)
             {
-                foundSSTable = true;
-                Descriptor descriptor = sstable.getKey();
-                Pair<SSTableMetadata, Set<Integer>> metadata = SSTableMetadata.serializer.deserialize(descriptor);
-                out.println("Changing level from " + metadata.left.sstableLevel + " to 0 on " + descriptor.filenameFor(Component.DATA));
-                LeveledManifest.mutateLevel(metadata, descriptor, descriptor.filenameFor(Component.STATS), 0);
+                System.err.println("ColumnFamily not found: " + keyspaceName + "/" + columnfamily);
+                System.exit(1);
+            }
+
+            Keyspace keyspace = Keyspace.openWithoutSSTables(keyspaceName);
+            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(columnfamily);
+            boolean foundSSTable = false;
+            for (Map.Entry<Descriptor, Set<Component>> sstable : cfs.directories.sstableLister().list().entrySet())
+            {
+                if (sstable.getValue().contains(Component.STATS))
+                {
+                    foundSSTable = true;
+                    Descriptor descriptor = sstable.getKey();
+                    StatsMetadata metadata = (StatsMetadata) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.STATS);
+                    if (metadata.sstableLevel > 0)
+                    {
+                        out.println("Changing level from " + metadata.sstableLevel + " to 0 on " + descriptor.filenameFor(Component.DATA));
+                        descriptor.getMetadataSerializer().mutateLevel(descriptor, 0);
+                    }
+                    else
+                    {
+                        out.println("Skipped " + descriptor.filenameFor(Component.DATA) + " since it is already on level 0");
+                    }
+                }
+            }
+
+            if (!foundSSTable)
+            {
+                out.println("Found no sstables, did you give the correct keyspace/columnfamily?");
             }
         }
-
-        if (!foundSSTable)
+        catch (Throwable t)
         {
-            out.println("Found no sstables, did you give the correct keyspace/columnfamily?");
+            JVMStabilityInspector.inspectThrowable(t);
+            t.printStackTrace();
+            System.exit(1);
         }
+        System.exit(0);
     }
 }
diff --git a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
index 64720b5..8b33d50 100644
--- a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
+++ b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
@@ -20,10 +20,11 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.util.EnumSet;
 import java.util.Map;
 
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
+import org.apache.cassandra.io.sstable.metadata.*;
 
 /**
  * Shows the contents of sstable metadata
@@ -47,21 +48,39 @@
             if (new File(fname).exists())
             {
                 Descriptor descriptor = Descriptor.fromFilename(fname);
-                SSTableMetadata metadata = SSTableMetadata.serializer.deserialize(descriptor).left;
+                Map<MetadataType, MetadataComponent> metadata = descriptor.getMetadataSerializer().deserialize(descriptor, EnumSet.allOf(MetadataType.class));
+                ValidationMetadata validation = (ValidationMetadata) metadata.get(MetadataType.VALIDATION);
+                StatsMetadata stats = (StatsMetadata) metadata.get(MetadataType.STATS);
+                CompactionMetadata compaction = (CompactionMetadata) metadata.get(MetadataType.COMPACTION);
 
                 out.printf("SSTable: %s%n", descriptor);
-                out.printf("Partitioner: %s%n", metadata.partitioner);
-                out.printf("Maximum timestamp: %s%n", metadata.maxTimestamp);
-                out.printf("SSTable max local deletion time: %s%n", metadata.maxLocalDeletionTime);
-                out.printf("Compression ratio: %s%n", metadata.compressionRatio);
-                out.printf("Estimated droppable tombstones: %s%n", metadata.getEstimatedDroppableTombstoneRatio((int) (System.currentTimeMillis() / 1000)));
-                out.printf("SSTable Level: %d%n", metadata.sstableLevel);
-                out.println(metadata.replayPosition);
-                printHistograms(metadata, out);
-                out.println("Estimated tombstone drop times:");
-                for (Map.Entry<Double, Long> entry : metadata.estimatedTombstoneDropTime.getAsMap().entrySet())
+                if (validation != null)
                 {
-                    out.printf("%-10s:%10s%n",entry.getKey().intValue(), entry.getValue());
+                    out.printf("Partitioner: %s%n", validation.partitioner);
+                    out.printf("Bloom Filter FP chance: %f%n", validation.bloomFilterFPChance);
+                }
+                if (stats != null)
+                {
+                    out.printf("Minimum timestamp: %s%n", stats.minTimestamp);
+                    out.printf("Maximum timestamp: %s%n", stats.maxTimestamp);
+                    out.printf("SSTable max local deletion time: %s%n", stats.maxLocalDeletionTime);
+                    out.printf("Compression ratio: %s%n", stats.compressionRatio);
+                    out.printf("Estimated droppable tombstones: %s%n", stats.getEstimatedDroppableTombstoneRatio((int) (System.currentTimeMillis() / 1000)));
+                    out.printf("SSTable Level: %d%n", stats.sstableLevel);
+                    out.printf("Repaired at: %d%n", stats.repairedAt);
+                    out.println(stats.replayPosition);
+                    out.println("Estimated tombstone drop times:%n");
+                    for (Map.Entry<Double, Long> entry : stats.estimatedTombstoneDropTime.getAsMap().entrySet())
+                    {
+                        out.printf("%-10s:%10s%n",entry.getKey().intValue(), entry.getValue());
+                    }
+                    printHistograms(stats, out);
+                }
+                if (compaction != null)
+                {
+                    out.printf("Ancestors: %s%n", compaction.ancestors.toString());
+                    out.printf("Estimated cardinality: %s%n", compaction.cardinalityEstimator.cardinality());
+
                 }
             }
             else
@@ -71,14 +90,14 @@
         }
     }
 
-    private static void printHistograms(SSTableMetadata metadata, PrintStream out)
+    private static void printHistograms(StatsMetadata metadata, PrintStream out)
     {
         long[] offsets = metadata.estimatedRowSize.getBucketOffsets();
         long[] ersh = metadata.estimatedRowSize.getBuckets(false);
         long[] ecch = metadata.estimatedColumnCount.getBuckets(false);
 
         out.println(String.format("%-10s%18s%18s",
-                                  "Count", "Row Size", "Column Count"));
+                                  "Count", "Row Size", "Cell Count"));
 
         for (int i = 0; i < offsets.length; i++)
         {
diff --git a/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java b/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java
new file mode 100644
index 0000000..b37d3b4
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.tools;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.attribute.FileTime;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.service.ActiveRepairService;
+
+/**
+ * Set repairedAt status on a given set of sstables.
+ *
+ * If you pass --is-repaired, it will set the repairedAt time to the last modified time.
+ *
+ * If you know you ran repair 2 weeks ago, you can do something like
+ *
+ * sstablerepairedset --is-repaired -f <(find /var/lib/cassandra/data/.../ -iname "*Data.db*" -mtime +14)
+ *
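+ * To flip the same files back to unrepaired (illustrative):
+ *
+ * sstablerepairedset --is-unrepaired -f <same-file-list>
+ *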
+ */
+public class SSTableRepairedAtSetter
+{
+    /**
+     * @param args a list of sstables whose metadata we are changing
+     */
+    public static void main(final String[] args) throws IOException
+    {
+        PrintStream out = System.out;
+        if (args.length == 0)
+        {
+            out.println("This command should be run with Cassandra stopped!");
+            out.println("Usage: sstablerepairedset [--is-repaired | --is-unrepaired] [-f <sstable-list> | <sstables>]");
+            System.exit(1);
+        }
+
+        if (args.length < 3 || !args[0].equals("--really-set") || (!args[1].equals("--is-repaired") && !args[1].equals("--is-unrepaired")))
+        {
+            out.println("This command should be run with Cassandra stopped, otherwise you will get very strange behavior");
+            out.println("Verify that Cassandra is not running and then execute the command like this:");
+            out.println("Usage: sstablelevelreset --really-set [--is-repaired | --is-unrepaired] [-f <sstable-list> | <sstables>]");
+            System.exit(1);
+        }
+
+        boolean setIsRepaired = args[1].equals("--is-repaired");
+
+        List<String> fileNames;
+        if (args[2].equals("-f"))
+        {
+            fileNames = Files.readAllLines(Paths.get(args[3]), Charset.defaultCharset());
+        }
+        else
+        {
+            fileNames = Arrays.asList(args).subList(2, args.length);
+        }
+
+        for (String fname: fileNames)
+        {
+            Descriptor descriptor = Descriptor.fromFilename(fname);
+            if (descriptor.version.hasRepairedAt)
+            {
+                if (setIsRepaired)
+                {
+                    FileTime f = Files.getLastModifiedTime(new File(descriptor.filenameFor(Component.DATA)).toPath());
+                    descriptor.getMetadataSerializer().mutateRepairedAt(descriptor, f.toMillis());
+                }
+                else
+                {
+                    descriptor.getMetadataSerializer().mutateRepairedAt(descriptor, ActiveRepairService.UNREPAIRED_SSTABLE);
+                }
+            }
+            else
+            {
+                System.err.println("SSTable " + fname + " does not have repaired property, run upgradesstables");
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
index 315e4e1..78d4d9e 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
@@ -32,18 +32,12 @@
 import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.db.compaction.Scrubber;
 import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.service.CassandraDaemon;
 import org.apache.cassandra.utils.OutputHandler;
 
 import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions;
 
 public class StandaloneScrubber
 {
-    static
-    {
-        CassandraDaemon.initLog4j();
-    }
-
     private static final String TOOL_NAME = "sstablescrub";
     private static final String VERBOSE_OPTION  = "verbose";
     private static final String DEBUG_OPTION  = "debug";
@@ -99,13 +93,6 @@
             }
             System.out.println(String.format("Pre-scrub sstables snapshotted into snapshot %s", snapshotName));
 
-            // if old-style json manifest, snapshot it
-            if (cfs.directories.tryGetLeveledManifest() != null)
-            {
-                cfs.directories.snapshotLeveledManifest(snapshotName);
-                System.out.println(String.format("Leveled manifest snapshotted into snapshot %s", snapshotName));
-            }
-
             LeveledManifest manifest = null;
             // If leveled, load the manifest
             if (cfs.getCompactionStrategy() instanceof LeveledCompactionStrategy)
@@ -130,17 +117,6 @@
                             scrubber.close();
                         }
 
-                        if (manifest != null)
-                        {
-                            if (scrubber.getNewInOrderSSTable() != null)
-                                manifest.add(scrubber.getNewInOrderSSTable());
-
-                            List<SSTableReader> added = scrubber.getNewSSTable() == null
-                                ? Collections.<SSTableReader>emptyList()
-                                : Collections.singletonList(scrubber.getNewSSTable());
-                            manifest.replace(Collections.singletonList(sstable), added);
-                        }
-
                         // Remove the sstable (it's been copied by scrub and snapshotted)
                         sstable.markObsolete();
                         sstable.releaseReference();
diff --git a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
index 546f22f..75ad959 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
@@ -25,14 +25,11 @@
 import org.apache.commons.cli.*;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.db.compaction.SSTableSplitter;
 import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.service.CassandraDaemon;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions;
@@ -41,11 +38,6 @@
 {
     public static final int DEFAULT_SSTABLE_SIZE = 50;
 
-    static
-    {
-        CassandraDaemon.initLog4j();
-    }
-
     private static final String TOOL_NAME = "sstablessplit";
     private static final String VERBOSE_OPTION = "verbose";
     private static final String DEBUG_OPTION = "debug";
@@ -124,6 +116,11 @@
                 try
                 {
                     SSTableReader sstable = SSTableReader.openNoValidation(fn.getKey(), fn.getValue(), cfs.metadata);
+                    if (!isSSTableLargerEnough(sstable, options.sizeInMB)) {
+                        System.out.println(String.format("Skipping %s: it's size (%.3f MB) is less than the split size (%d MB)",
+                                sstable.getFilename(), ((sstable.onDiskLength() * 1.0d) / 1024L) / 1024L, options.sizeInMB));
+                        continue;
+                    }
                     sstables.add(sstable);
 
                     if (options.snapshot) {
@@ -139,6 +136,10 @@
                         e.printStackTrace(System.err);
                 }
             }
+            if (sstables.isEmpty()) {
+                System.out.println("No sstables needed splitting.");
+                System.exit(0);
+            }
             if (options.snapshot)
                 System.out.println(String.format("Pre-split sstables snapshotted into snapshot %s", snapshotName));
 
@@ -148,10 +149,6 @@
                 try
                 {
                     new SSTableSplitter(cfs, sstable, options.sizeInMB).split();
-
-                    // Remove the sstable
-                    sstable.markObsolete();
-                    sstable.releaseReference();
                 }
                 catch (Exception e)
                 {
@@ -172,6 +169,13 @@
         }
     }
 
+    /**
+     * Returns true if the sstable's on-disk size exceeds the requested split size.
+     */
+    private static boolean isSSTableLargerEnough(SSTableReader sstable, int sizeInMB)
+    {
+        return sstable.onDiskLength() > sizeInMB * 1024L * 1024L;
+    }
+
     private static class Options
     {
         public final List<String> filenames;
diff --git a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
index 7e73e9d..55f206e 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
@@ -29,18 +29,12 @@
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.Upgrader;
 import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.service.CassandraDaemon;
 import org.apache.cassandra.utils.OutputHandler;
 
 import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions;
 
 public class StandaloneUpgrader
 {
-    static
-    {
-        CassandraDaemon.initLog4j();
-    }
-
     private static final String TOOL_NAME = "sstableupgrade";
     private static final String DEBUG_OPTION  = "debug";
     private static final String HELP_OPTION  = "help";
@@ -103,9 +97,6 @@
                 {
                     Upgrader upgrader = new Upgrader(cfs, sstable, handler);
                     upgrader.upgrade();
-
-                    sstable.markObsolete();
-                    sstable.releaseReference();
                 }
                 catch (Exception e)
                 {
diff --git a/src/java/org/apache/cassandra/tracing/TraceState.java b/src/java/org/apache/cassandra/tracing/TraceState.java
index 5fec633..cfff295 100644
--- a/src/java/org/apache/cassandra/tracing/TraceState.java
+++ b/src/java/org/apache/cassandra/tracing/TraceState.java
@@ -21,6 +21,7 @@
 import java.nio.ByteBuffer;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import com.google.common.base.Stopwatch;
 import org.slf4j.helpers.MessageFormatter;
@@ -28,9 +29,9 @@
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
@@ -47,6 +48,10 @@
     public final Stopwatch watch;
     public final ByteBuffer sessionIdBytes;
 
+    // Multiple requests can use the same TraceState at a time, so we need to reference count.
+    // See CASSANDRA-7626 for more details.
+    private final AtomicInteger references = new AtomicInteger(1);
+
     public TraceState(InetAddress coordinator, UUID sessionId)
     {
         assert coordinator != null;
@@ -55,13 +60,12 @@
         this.coordinator = coordinator;
         this.sessionId = sessionId;
         sessionIdBytes = ByteBufferUtil.bytes(sessionId);
-        watch = new Stopwatch();
-        watch.start();
+        watch = Stopwatch.createStarted();
     }
 
     public int elapsed()
     {
-        long elapsed = watch.elapsedTime(TimeUnit.MICROSECONDS);
+        long elapsed = watch.elapsed(TimeUnit.MICROSECONDS);
         return elapsed < Integer.MAX_VALUE ? (int) elapsed : Integer.MAX_VALUE;
     }
 
@@ -95,14 +99,31 @@
             public void runMayThrow()
             {
                 CFMetaData cfMeta = CFMetaData.TraceEventsCf;
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(cfMeta);
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfMeta);
                 Tracing.addColumn(cf, Tracing.buildName(cfMeta, eventId, ByteBufferUtil.bytes("activity")), message);
                 Tracing.addColumn(cf, Tracing.buildName(cfMeta, eventId, ByteBufferUtil.bytes("source")), FBUtilities.getBroadcastAddress());
                 if (elapsed >= 0)
                     Tracing.addColumn(cf, Tracing.buildName(cfMeta, eventId, ByteBufferUtil.bytes("source_elapsed")), elapsed);
                 Tracing.addColumn(cf, Tracing.buildName(cfMeta, eventId, ByteBufferUtil.bytes("thread")), threadName);
-                Tracing.mutateWithCatch(new RowMutation(Tracing.TRACE_KS, sessionIdBytes, cf));
+                Tracing.mutateWithCatch(new Mutation(Tracing.TRACE_KS, sessionIdBytes, cf));
             }
         });
     }
+
+    public boolean acquireReference()
+    {
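+        // Retry the CAS on contention, but never acquire once the count has reached
+        // zero: at that point the session has been released and may have been removed.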
+        while (true)
+        {
+            int n = references.get();
+            if (n <= 0)
+                return false;
+            if (references.compareAndSet(n, n + 1))
+                return true;
+        }
+    }
+
+    public int releaseReference()
+    {
+        return references.decrementAndGet();
+    }
 }
diff --git a/src/java/org/apache/cassandra/tracing/Tracing.java b/src/java/org/apache/cassandra/tracing/Tracing.java
index 88239be..e377c6e 100644
--- a/src/java/org/apache/cassandra/tracing/Tracing.java
+++ b/src/java/org/apache/cassandra/tracing/Tracing.java
@@ -33,8 +33,8 @@
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnNameBuilder;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.marshal.TimeUUIDType;
 import org.apache.cassandra.exceptions.OverloadedException;
 import org.apache.cassandra.exceptions.UnavailableException;
@@ -71,46 +71,43 @@
 
     public static final Tracing instance = new Tracing();
 
-    public static void addColumn(ColumnFamily cf, ByteBuffer name, InetAddress address)
+    public static void addColumn(ColumnFamily cf, CellName name, InetAddress address)
     {
         addColumn(cf, name, ByteBufferUtil.bytes(address));
     }
 
-    public static void addColumn(ColumnFamily cf, ByteBuffer name, int value)
+    public static void addColumn(ColumnFamily cf, CellName name, int value)
     {
         addColumn(cf, name, ByteBufferUtil.bytes(value));
     }
 
-    public static void addColumn(ColumnFamily cf, ByteBuffer name, long value)
+    public static void addColumn(ColumnFamily cf, CellName name, long value)
     {
         addColumn(cf, name, ByteBufferUtil.bytes(value));
     }
 
-    public static void addColumn(ColumnFamily cf, ByteBuffer name, String value)
+    public static void addColumn(ColumnFamily cf, CellName name, String value)
     {
         addColumn(cf, name, ByteBufferUtil.bytes(value));
     }
 
-    private static void addColumn(ColumnFamily cf, ByteBuffer name, ByteBuffer value)
+    private static void addColumn(ColumnFamily cf, CellName name, ByteBuffer value)
     {
-        cf.addColumn(new ExpiringColumn(name, value, System.currentTimeMillis(), TTL));
+        cf.addColumn(new BufferExpiringCell(name, value, System.currentTimeMillis(), TTL));
     }
 
     public void addParameterColumns(ColumnFamily cf, Map<String, String> rawPayload)
     {
         for (Map.Entry<String, String> entry : rawPayload.entrySet())
         {
-            cf.addColumn(new ExpiringColumn(buildName(cf.metadata(), bytes("parameters"), bytes(entry.getKey())),
-                                            bytes(entry.getValue()), System.currentTimeMillis(), TTL));
+            cf.addColumn(new BufferExpiringCell(buildName(CFMetaData.TraceSessionsCf, "parameters", entry.getKey()),
+                                                bytes(entry.getValue()), System.currentTimeMillis(), TTL));
         }
     }
 
-    public static ByteBuffer buildName(CFMetaData meta, ByteBuffer... args)
+    public static CellName buildName(CFMetaData meta, Object... args)
     {
-        ColumnNameBuilder builder = meta.getCfDef().getColumnNameBuilder();
-        for (ByteBuffer arg : args)
-            builder.add(arg);
-        return builder.build();
+        return meta.comparator.makeCellName(args);
     }
 
     public UUID getSessionId()
@@ -143,9 +140,10 @@
         return sessionId;
     }
 
-    public void stopNonLocal(TraceState state)
+    public void doneWithNonLocalSession(TraceState state)
     {
-        sessions.remove(state.sessionId);
+        if (state.releaseReference() == 0)
+            sessions.remove(state.sessionId);
     }
 
     /**
@@ -169,8 +167,8 @@
                 {
                     CFMetaData cfMeta = CFMetaData.TraceSessionsCf;
                     ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfMeta);
-                    addColumn(cf, buildName(cfMeta, bytes("duration")), elapsed);
-                    mutateWithCatch(new RowMutation(TRACE_KS, sessionIdBytes, cf));
+                    addColumn(cf, buildName(cfMeta, "duration"), elapsed);
+                    mutateWithCatch(new Mutation(TRACE_KS, sessionIdBytes, cf));
                 }
             });
 
@@ -206,13 +204,13 @@
             public void run()
             {
                 CFMetaData cfMeta = CFMetaData.TraceSessionsCf;
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(cfMeta);
-                addColumn(cf, buildName(cfMeta, bytes("coordinator")), FBUtilities.getBroadcastAddress());
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfMeta);
+                addColumn(cf, buildName(cfMeta, "coordinator"), FBUtilities.getBroadcastAddress());
                 addParameterColumns(cf, parameters);
                 addColumn(cf, buildName(cfMeta, bytes("request")), request);
                 addColumn(cf, buildName(cfMeta, bytes("started_at")), started_at);
                 addParameterColumns(cf, parameters);
-                mutateWithCatch(new RowMutation(TRACE_KS, sessionIdBytes, cf));
+                mutateWithCatch(new Mutation(TRACE_KS, sessionIdBytes, cf));
             }
         });
     }
@@ -232,7 +230,7 @@
         assert sessionBytes.length == 16;
         UUID sessionId = UUIDGen.getUUID(ByteBuffer.wrap(sessionBytes));
         TraceState ts = sessions.get(sessionId);
-        if (ts != null)
+        if (ts != null && ts.acquireReference())
             return ts;
 
         if (message.verb == MessagingService.Verb.REQUEST_RESPONSE)
@@ -284,7 +282,7 @@
         state.trace(format, args);
     }
 
-    static void mutateWithCatch(RowMutation mutation)
+    static void mutateWithCatch(Mutation mutation)
     {
         try
         {
diff --git a/src/java/org/apache/cassandra/transport/CBCodec.java b/src/java/org/apache/cassandra/transport/CBCodec.java
index 67b0cce..0ef619e 100644
--- a/src/java/org/apache/cassandra/transport/CBCodec.java
+++ b/src/java/org/apache/cassandra/transport/CBCodec.java
@@ -17,11 +17,11 @@
  */
 package org.apache.cassandra.transport;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 public interface CBCodec<T>
 {
-    public T decode(ChannelBuffer body, int version);
-    public void encode(T t, ChannelBuffer dest, int version);
+    public T decode(ByteBuf body, int version);
+    public void encode(T t, ByteBuf dest, int version);
     public int encodedSize(T t, int version);
 }
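
CBCodec now works on Netty 4 ByteBuf and every method takes the protocol version, so a codec both decodes and pre-computes the exact encoded size used to allocate the body buffer. A toy implementation for a length-prefixed UTF-8 string, shown only to illustrate the contract (the real message codecs go through CBUtil helpers):

```java
import java.nio.charset.StandardCharsets;
import io.netty.buffer.ByteBuf;

// Illustrative CBCodec implementation: [unsigned short length][UTF-8 bytes].
final class StringCodecExample implements CBCodec<String>
{
    public String decode(ByteBuf body, int version)
    {
        int length = body.readUnsignedShort();
        byte[] bytes = new byte[length];
        body.readBytes(bytes);
        return new String(bytes, StandardCharsets.UTF_8);
    }

    public void encode(String value, ByteBuf dest, int version)
    {
        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
        dest.writeShort(bytes.length);
        dest.writeBytes(bytes);
    }

    public int encodedSize(String value, int version)
    {
        // Used by the protocol encoder to allocate an exactly-sized body buffer.
        return 2 + value.getBytes(StandardCharsets.UTF_8).length;
    }
}
```
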
diff --git a/src/java/org/apache/cassandra/transport/CBUtil.java b/src/java/org/apache/cassandra/transport/CBUtil.java
index eb32faa..f9425c3 100644
--- a/src/java/org/apache/cassandra/transport/CBUtil.java
+++ b/src/java/org/apache/cassandra/transport/CBUtil.java
@@ -33,24 +33,26 @@
 import java.util.Map;
 import java.util.UUID;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
-import org.jboss.netty.util.CharsetUtil;
+import io.netty.buffer.*;
+import io.netty.util.CharsetUtil;
 
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /**
- * ChannelBuffer utility methods.
+ * ByteBuf utility methods.
  * Note that contrarily to ByteBufferUtil, these method do "read" the
- * ChannelBuffer advancing it's (read) position. They also write by
+ * ByteBuf advancing its (read) position. They also write by
  * advancing the write position. Functions are also provided to create
- * ChannelBuffer while avoiding copies.
+ * ByteBuf while avoiding copies.
  */
 public abstract class CBUtil
 {
+    public static final ByteBufAllocator allocator = new PooledByteBufAllocator(true);
+
     private CBUtil() {}
 
     private final static ThreadLocal<CharsetDecoder> decoder = new ThreadLocal<CharsetDecoder>()
@@ -62,12 +64,12 @@
         }
     };
 
-    private static String readString(ChannelBuffer cb, int length)
+    private static String readString(ByteBuf cb, int length)
     {
         if (length == 0)
             return "";
 
-        ByteBuffer buffer = cb.toByteBuffer(cb.readerIndex(), length);
+        ByteBuffer buffer = cb.nioBuffer(cb.readerIndex(), length);
         try
         {
             String str = decodeString(buffer);
@@ -80,7 +82,7 @@
         }
     }
 
-    public static String readString(ChannelBuffer cb)
+    public static String readString(ByteBuf cb)
     {
         try
         {
@@ -116,7 +118,7 @@
         return dst.flip().toString();
     }
 
-    public static void writeString(String str, ChannelBuffer cb)
+    public static void writeString(String str, ByteBuf cb)
     {
         byte[] bytes = str.getBytes(CharsetUtil.UTF_8);
         cb.writeShort(bytes.length);
@@ -128,7 +130,7 @@
         return 2 + TypeSizes.encodedUTF8Length(str);
     }
 
-    public static String readLongString(ChannelBuffer cb)
+    public static String readLongString(ByteBuf cb)
     {
         try
         {
@@ -141,7 +143,7 @@
         }
     }
 
-    public static void writeLongString(String str, ChannelBuffer cb)
+    public static void writeLongString(String str, ByteBuf cb)
     {
         byte[] bytes = str.getBytes(CharsetUtil.UTF_8);
         cb.writeInt(bytes.length);
@@ -153,7 +155,7 @@
         return 4 + str.getBytes(CharsetUtil.UTF_8).length;
     }
 
-    public static byte[] readBytes(ChannelBuffer cb)
+    public static byte[] readBytes(ByteBuf cb)
     {
         try
         {
@@ -168,7 +170,7 @@
         }
     }
 
-    public static void writeBytes(byte[] bytes, ChannelBuffer cb)
+    public static void writeBytes(byte[] bytes, ByteBuf cb)
     {
         cb.writeShort(bytes.length);
         cb.writeBytes(bytes);
@@ -179,12 +181,12 @@
         return 2 + bytes.length;
     }
 
-    public static ConsistencyLevel readConsistencyLevel(ChannelBuffer cb)
+    public static ConsistencyLevel readConsistencyLevel(ByteBuf cb)
     {
         return ConsistencyLevel.fromCode(cb.readUnsignedShort());
     }
 
-    public static void writeConsistencyLevel(ConsistencyLevel consistency, ChannelBuffer cb)
+    public static void writeConsistencyLevel(ConsistencyLevel consistency, ByteBuf cb)
     {
         cb.writeShort(consistency.code);
     }
@@ -194,7 +196,7 @@
         return 2;
     }
 
-    public static <T extends Enum<T>> T readEnumValue(Class<T> enumType, ChannelBuffer cb)
+    public static <T extends Enum<T>> T readEnumValue(Class<T> enumType, ByteBuf cb)
     {
         String value = CBUtil.readString(cb);
         try
@@ -207,7 +209,7 @@
         }
     }
 
-    public static <T extends Enum<T>> void writeEnumValue(T enumValue, ChannelBuffer cb)
+    public static <T extends Enum<T>> void writeEnumValue(T enumValue, ByteBuf cb)
     {
         writeString(enumValue.toString(), cb);
     }
@@ -217,14 +219,14 @@
         return sizeOfString(enumValue.toString());
     }
 
-    public static UUID readUUID(ChannelBuffer cb)
+    public static UUID readUUID(ByteBuf cb)
     {
         byte[] bytes = new byte[16];
         cb.readBytes(bytes);
         return UUIDGen.getUUID(ByteBuffer.wrap(bytes));
     }
 
-    public static void writeUUID(UUID uuid, ChannelBuffer cb)
+    public static void writeUUID(UUID uuid, ByteBuf cb)
     {
         cb.writeBytes(UUIDGen.decompose(uuid));
     }
@@ -234,7 +236,7 @@
         return 16;
     }
 
-    public static List<String> readStringList(ChannelBuffer cb)
+    public static List<String> readStringList(ByteBuf cb)
     {
         int length = cb.readUnsignedShort();
         List<String> l = new ArrayList<String>(length);
@@ -243,7 +245,7 @@
         return l;
     }
 
-    public static void writeStringList(List<String> l, ChannelBuffer cb)
+    public static void writeStringList(List<String> l, ByteBuf cb)
     {
         cb.writeShort(l.size());
         for (String str : l)
@@ -258,7 +260,7 @@
         return size;
     }
 
-    public static Map<String, String> readStringMap(ChannelBuffer cb)
+    public static Map<String, String> readStringMap(ByteBuf cb)
     {
         int length = cb.readUnsignedShort();
         Map<String, String> m = new HashMap<String, String>(length);
@@ -271,7 +273,7 @@
         return m;
     }
 
-    public static void writeStringMap(Map<String, String> m, ChannelBuffer cb)
+    public static void writeStringMap(Map<String, String> m, ByteBuf cb)
     {
         cb.writeShort(m.size());
         for (Map.Entry<String, String> entry : m.entrySet())
@@ -292,7 +294,7 @@
         return size;
     }
 
-    public static Map<String, List<String>> readStringToStringListMap(ChannelBuffer cb)
+    public static Map<String, List<String>> readStringToStringListMap(ByteBuf cb)
     {
         int length = cb.readUnsignedShort();
         Map<String, List<String>> m = new HashMap<String, List<String>>(length);
@@ -305,7 +307,7 @@
         return m;
     }
 
-    public static void writeStringToStringListMap(Map<String, List<String>> m, ChannelBuffer cb)
+    public static void writeStringToStringListMap(Map<String, List<String>> m, ByteBuf cb)
     {
         cb.writeShort(m.size());
         for (Map.Entry<String, List<String>> entry : m.entrySet())
@@ -326,13 +328,17 @@
         return size;
     }
 
-    public static ByteBuffer readValue(ChannelBuffer cb)
+    public static ByteBuffer readValue(ByteBuf cb)
     {
         int length = cb.readInt();
-        return length < 0 ? null : cb.readSlice(length).toByteBuffer();
+        if (length < 0)
+            return null;
+        ByteBuf slice = cb.readSlice(length);
+
+        return ByteBuffer.wrap(readRawBytes(slice));
     }
 
-    public static void writeValue(byte[] bytes, ChannelBuffer cb)
+    public static void writeValue(byte[] bytes, ByteBuf cb)
     {
         if (bytes == null)
         {
@@ -344,7 +350,7 @@
         cb.writeBytes(bytes);
     }
 
-    public static void writeValue(ByteBuffer bytes, ChannelBuffer cb)
+    public static void writeValue(ByteBuffer bytes, ByteBuf cb)
     {
         if (bytes == null)
         {
@@ -352,8 +358,11 @@
             return;
         }
 
-        cb.writeInt(bytes.remaining());
-        cb.writeBytes(bytes.duplicate());
+        int remaining = bytes.remaining();
+        cb.writeInt(remaining);
+
+        if (remaining > 0)
+            cb.writeBytes(bytes.duplicate());
     }
 
     public static int sizeOfValue(byte[] bytes)
@@ -366,7 +375,7 @@
         return 4 + (bytes == null ? 0 : bytes.remaining());
     }
 
-    public static List<ByteBuffer> readValueList(ChannelBuffer cb)
+    public static List<ByteBuffer> readValueList(ByteBuf cb)
     {
         int size = cb.readUnsignedShort();
         if (size == 0)
@@ -378,7 +387,7 @@
         return l;
     }
 
-    public static void writeValueList(List<ByteBuffer> values, ChannelBuffer cb)
+    public static void writeValueList(List<ByteBuffer> values, ByteBuf cb)
     {
         cb.writeShort(values.size());
         for (ByteBuffer value : values)
@@ -393,7 +402,23 @@
         return size;
     }
 
-    public static InetSocketAddress readInet(ChannelBuffer cb)
+    public static Pair<List<String>, List<ByteBuffer>> readNameAndValueList(ByteBuf cb)
+    {
+        int size = cb.readUnsignedShort();
+        if (size == 0)
+            return Pair.create(Collections.<String>emptyList(), Collections.<ByteBuffer>emptyList());
+
+        List<String> s = new ArrayList<>(size);
+        List<ByteBuffer> l = new ArrayList<>(size);
+        for (int i = 0; i < size; i++)
+        {
+            s.add(readString(cb));
+            l.add(readValue(cb));
+        }
+        return Pair.create(s, l);
+    }
+
+    public static InetSocketAddress readInet(ByteBuf cb)
     {
         int addrSize = cb.readByte();
         byte[] address = new byte[addrSize];
@@ -409,7 +434,7 @@
         }
     }
 
-    public static void writeInet(InetSocketAddress inet, ChannelBuffer cb)
+    public static void writeInet(InetSocketAddress inet, ByteBuf cb)
     {
         byte[] address = inet.getAddress().getAddress();
 
@@ -426,20 +451,12 @@
 
     /*
      * Reads *all* readable bytes from {@code cb} and return them.
-     * If {@code cb} is backed by an array, this will return the underlying array directly, without copy.
      */
-    public static byte[] readRawBytes(ChannelBuffer cb)
+    public static byte[] readRawBytes(ByteBuf cb)
     {
-        if (cb.hasArray() && cb.readableBytes() == cb.array().length)
-        {
-            // Move the readerIndex just so we consistenly consume the input
-            cb.readerIndex(cb.writerIndex());
-            return cb.array();
-        }
-
-        // Otherwise, just read the bytes in a new array
         byte[] bytes = new byte[cb.readableBytes()];
         cb.readBytes(bytes);
         return bytes;
     }
+
 }
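
readValue() now copies the sliced bytes into a fresh ByteBuffer instead of wrapping the frame's backing memory, which is what lets the pooled frame buffer be released without invalidating values that outlive the request. A standalone sketch of the [int length][bytes] framing (a negative length encodes null), with names chosen for illustration:

```java
import java.nio.ByteBuffer;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;

// Sketch of the value framing used by readValue/writeValue above.
public final class ValueFramingExample
{
    static void writeValue(ByteBuffer value, ByteBuf out)
    {
        if (value == null)
        {
            out.writeInt(-1);
            return;
        }
        int remaining = value.remaining();
        out.writeInt(remaining);
        if (remaining > 0)
            out.writeBytes(value.duplicate());
    }

    static ByteBuffer readValue(ByteBuf in)
    {
        int length = in.readInt();
        if (length < 0)
            return null;
        byte[] copy = new byte[length];
        in.readBytes(copy);            // copy out of the frame buffer
        return ByteBuffer.wrap(copy);  // so the frame can be released safely
    }

    public static void main(String[] args)
    {
        ByteBuf buf = Unpooled.buffer();
        writeValue(ByteBuffer.wrap(new byte[]{1, 2, 3}), buf);
        writeValue(null, buf);
        System.out.println(readValue(buf).remaining()); // 3
        System.out.println(readValue(buf));             // null
        buf.release();
    }
}
```

Copying here trades a small allocation for the ability to return the pooled frame to the allocator immediately.
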
diff --git a/src/java/org/apache/cassandra/transport/Client.java b/src/java/org/apache/cassandra/transport/Client.java
index 4a50bde..989b954 100644
--- a/src/java/org/apache/cassandra/transport/Client.java
+++ b/src/java/org/apache/cassandra/transport/Client.java
@@ -128,7 +128,7 @@
                     return null;
                 }
             }
-            return new QueryMessage(query, new QueryOptions(ConsistencyLevel.ONE, Collections.<ByteBuffer>emptyList(), false, pageSize, null, null));
+            return new QueryMessage(query, QueryOptions.create(ConsistencyLevel.ONE, Collections.<ByteBuffer>emptyList(), false, pageSize, null, null));
         }
         else if (msgType.equals("PREPARE"))
         {
@@ -156,7 +156,7 @@
                     }
                     values.add(bb);
                 }
-                return new ExecuteMessage(MD5Digest.wrap(id), new QueryOptions(ConsistencyLevel.ONE, values));
+                return new ExecuteMessage(MD5Digest.wrap(id), QueryOptions.forInternalCalls(ConsistencyLevel.ONE, values));
             }
             catch (Exception e)
             {
diff --git a/src/java/org/apache/cassandra/transport/Connection.java b/src/java/org/apache/cassandra/transport/Connection.java
index a72402f..aa571a7 100644
--- a/src/java/org/apache/cassandra/transport/Connection.java
+++ b/src/java/org/apache/cassandra/transport/Connection.java
@@ -17,10 +17,13 @@
  */
 package org.apache.cassandra.transport;
 
-import org.jboss.netty.channel.Channel;
+import io.netty.channel.Channel;
+import io.netty.util.AttributeKey;
 
 public class Connection
 {
+    static final AttributeKey<Connection> attributeKey = AttributeKey.valueOf("CONN");
+
     private final Channel channel;
     private final int version;
     private final Tracker tracker;
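
Netty 4 drops per-channel attachments, so the Connection is now stored under a typed AttributeKey on the channel, as the decoder and encoder below rely on. A small sketch of that per-channel state pattern; ConnState and getOrCreate() are hypothetical stand-ins:

```java
import io.netty.channel.Channel;
import io.netty.util.AttributeKey;

// Sketch of the Netty 4 per-channel attribute pattern used in place of
// Netty 3's channel attachments. "ConnState" is a hypothetical payload type.
final class ChannelStateExample
{
    static final AttributeKey<ConnState> STATE = AttributeKey.valueOf("CONN_STATE");

    static final class ConnState { int protocolVersion; }

    static ConnState getOrCreate(Channel channel)
    {
        ConnState state = channel.attr(STATE).get();
        if (state == null)
        {
            state = new ConnState();
            channel.attr(STATE).set(state); // first message seen on this channel
        }
        return state;
    }
}
```
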
diff --git a/src/java/org/apache/cassandra/transport/DataType.java b/src/java/org/apache/cassandra/transport/DataType.java
index 8cd7194..a45d7ce 100644
--- a/src/java/org/apache/cassandra/transport/DataType.java
+++ b/src/java/org/apache/cassandra/transport/DataType.java
@@ -17,14 +17,14 @@
  */
 package org.apache.cassandra.transport;
 
-import java.nio.charset.StandardCharsets;
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.List;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.db.marshal.*;
@@ -51,7 +51,10 @@
     INET     (16, InetAddressType.instance),
     LIST     (32, null),
     MAP      (33, null),
-    SET      (34, null);
+    SET      (34, null),
+    UDT      (48, null),
+    TUPLE    (49, null);
+
 
     public static final OptionCodec<DataType> codec = new OptionCodec<DataType>(DataType.class);
 
@@ -78,27 +81,45 @@
         return id;
     }
 
-    public Object readValue(ChannelBuffer cb)
+    public Object readValue(ByteBuf cb, int version)
     {
         switch (this)
         {
             case CUSTOM:
                 return CBUtil.readString(cb);
             case LIST:
-                return DataType.toType(codec.decodeOne(cb));
+                return DataType.toType(codec.decodeOne(cb, version));
             case SET:
-                return DataType.toType(codec.decodeOne(cb));
+                return DataType.toType(codec.decodeOne(cb, version));
             case MAP:
                 List<AbstractType> l = new ArrayList<AbstractType>(2);
-                l.add(DataType.toType(codec.decodeOne(cb)));
-                l.add(DataType.toType(codec.decodeOne(cb)));
+                l.add(DataType.toType(codec.decodeOne(cb, version)));
+                l.add(DataType.toType(codec.decodeOne(cb, version)));
                 return l;
+            case UDT:
+                String ks = CBUtil.readString(cb);
+                ByteBuffer name = UTF8Type.instance.decompose(CBUtil.readString(cb));
+                int n = cb.readUnsignedShort();
+                List<ByteBuffer> fieldNames = new ArrayList<>(n);
+                List<AbstractType<?>> fieldTypes = new ArrayList<>(n);
+                for (int i = 0; i < n; i++)
+                {
+                    fieldNames.add(UTF8Type.instance.decompose(CBUtil.readString(cb)));
+                    fieldTypes.add(DataType.toType(codec.decodeOne(cb, version)));
+                }
+                return new UserType(ks, name, fieldNames, fieldTypes);
+            case TUPLE:
+                n = cb.readUnsignedShort();
+                List<AbstractType<?>> types = new ArrayList<>(n);
+                for (int i = 0; i < n; i++)
+                    types.add(DataType.toType(codec.decodeOne(cb, version)));
+                return new TupleType(types);
             default:
                 return null;
         }
     }
 
-    public void writeValue(Object value, ChannelBuffer cb)
+    public void writeValue(Object value, ByteBuf cb, int version)
     {
         switch (this)
         {
@@ -107,40 +128,75 @@
                 CBUtil.writeString((String)value, cb);
                 break;
             case LIST:
-                codec.writeOne(DataType.fromType((AbstractType)value), cb);
+                codec.writeOne(DataType.fromType((AbstractType)value, version), cb, version);
                 break;
             case SET:
-                codec.writeOne(DataType.fromType((AbstractType)value), cb);
+                codec.writeOne(DataType.fromType((AbstractType)value, version), cb, version);
                 break;
             case MAP:
                 List<AbstractType> l = (List<AbstractType>)value;
-                codec.writeOne(DataType.fromType(l.get(0)), cb);
-                codec.writeOne(DataType.fromType(l.get(1)), cb);
+                codec.writeOne(DataType.fromType(l.get(0), version), cb, version);
+                codec.writeOne(DataType.fromType(l.get(1), version), cb, version);
+                break;
+            case UDT:
+                UserType udt = (UserType)value;
+                CBUtil.writeString(udt.keyspace, cb);
+                CBUtil.writeString(UTF8Type.instance.compose(udt.name), cb);
+                cb.writeShort(udt.size());
+                for (int i = 0; i < udt.size(); i++)
+                {
+                    CBUtil.writeString(UTF8Type.instance.compose(udt.fieldName(i)), cb);
+                    codec.writeOne(DataType.fromType(udt.fieldType(i), version), cb, version);
+                }
+                break;
+            case TUPLE:
+                TupleType tt = (TupleType)value;
+                cb.writeShort(tt.size());
+                for (int i = 0; i < tt.size(); i++)
+                    codec.writeOne(DataType.fromType(tt.type(i), version), cb, version);
                 break;
         }
     }
 
-    public int serializedValueSize(Object value)
+    public int serializedValueSize(Object value, int version)
     {
         switch (this)
         {
             case CUSTOM:
-                return 2 + ((String)value).getBytes(StandardCharsets.UTF_8).length;
+                return CBUtil.sizeOfString((String)value);
             case LIST:
             case SET:
-                return codec.oneSerializedSize(DataType.fromType((AbstractType)value));
+                return codec.oneSerializedSize(DataType.fromType((AbstractType)value, version), version);
             case MAP:
                 List<AbstractType> l = (List<AbstractType>)value;
                 int s = 0;
-                s += codec.oneSerializedSize(DataType.fromType(l.get(0)));
-                s += codec.oneSerializedSize(DataType.fromType(l.get(1)));
+                s += codec.oneSerializedSize(DataType.fromType(l.get(0), version), version);
+                s += codec.oneSerializedSize(DataType.fromType(l.get(1), version), version);
                 return s;
+            case UDT:
+                UserType udt = (UserType)value;
+                int size = 0;
+                size += CBUtil.sizeOfString(udt.keyspace);
+                size += CBUtil.sizeOfString(UTF8Type.instance.compose(udt.name));
+                size += 2;
+                for (int i = 0; i < udt.size(); i++)
+                {
+                    size += CBUtil.sizeOfString(UTF8Type.instance.compose(udt.fieldName(i)));
+                    size += codec.oneSerializedSize(DataType.fromType(udt.fieldType(i), version), version);
+                }
+                return size;
+            case TUPLE:
+                TupleType tt = (TupleType)value;
+                size = 2;
+                for (int i = 0; i < tt.size(); i++)
+                    size += codec.oneSerializedSize(DataType.fromType(tt.type(i), version), version);
+                return size;
             default:
                 return 0;
         }
     }
 
-    public static Pair<DataType, Object> fromType(AbstractType type)
+    public static Pair<DataType, Object> fromType(AbstractType type, int version)
     {
         // For CQL3 clients, ReversedType is an implementation detail and they
         // shouldn't have to care about it.
@@ -171,6 +227,13 @@
                     return Pair.<DataType, Object>create(SET, ((SetType)type).elements);
                 }
             }
+
+            if (type instanceof UserType && version >= 3)
+                return Pair.<DataType, Object>create(UDT, type);
+
+            if (type instanceof TupleType && version >= 3)
+                return Pair.<DataType, Object>create(TUPLE, type);
+
             return Pair.<DataType, Object>create(CUSTOM, type.toString());
         }
         else
@@ -194,6 +257,10 @@
                 case MAP:
                     List<AbstractType> l = (List<AbstractType>)entry.right;
                     return MapType.getInstance(l.get(0), l.get(1));
+                case UDT:
+                    return (AbstractType)entry.right;
+                case TUPLE:
+                    return (AbstractType)entry.right;
                 default:
                     return entry.left.type;
             }
diff --git a/src/java/org/apache/cassandra/transport/Event.java b/src/java/org/apache/cassandra/transport/Event.java
index ba9a23d..b7c5e68 100644
--- a/src/java/org/apache/cassandra/transport/Event.java
+++ b/src/java/org/apache/cassandra/transport/Event.java
@@ -20,8 +20,8 @@
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import com.google.common.base.Objects;
+import io.netty.buffer.ByteBuf;
 
 public abstract class Event
 {
@@ -34,33 +34,33 @@
         this.type = type;
     }
 
-    public static Event deserialize(ChannelBuffer cb)
+    public static Event deserialize(ByteBuf cb, int version)
     {
         switch (CBUtil.readEnumValue(Type.class, cb))
         {
             case TOPOLOGY_CHANGE:
-                return TopologyChange.deserializeEvent(cb);
+                return TopologyChange.deserializeEvent(cb, version);
             case STATUS_CHANGE:
-                return StatusChange.deserializeEvent(cb);
+                return StatusChange.deserializeEvent(cb, version);
             case SCHEMA_CHANGE:
-                return SchemaChange.deserializeEvent(cb);
+                return SchemaChange.deserializeEvent(cb, version);
         }
         throw new AssertionError();
     }
 
-    public void serialize(ChannelBuffer dest)
+    public void serialize(ByteBuf dest, int version)
     {
         CBUtil.writeEnumValue(type, dest);
-        serializeEvent(dest);
+        serializeEvent(dest, version);
     }
 
-    public int serializedSize()
+    public int serializedSize(int version)
     {
-        return CBUtil.sizeOfEnumValue(type) + eventSerializedSize();
+        return CBUtil.sizeOfEnumValue(type) + eventSerializedSize(version);
     }
 
-    protected abstract void serializeEvent(ChannelBuffer dest);
-    protected abstract int eventSerializedSize();
+    protected abstract void serializeEvent(ByteBuf dest, int version);
+    protected abstract int eventSerializedSize(int version);
 
     public static class TopologyChange extends Event
     {
@@ -92,20 +92,20 @@
         }
 
         // Assumes the type has already been deserialized
-        private static TopologyChange deserializeEvent(ChannelBuffer cb)
+        private static TopologyChange deserializeEvent(ByteBuf cb, int version)
         {
             Change change = CBUtil.readEnumValue(Change.class, cb);
             InetSocketAddress node = CBUtil.readInet(cb);
             return new TopologyChange(change, node);
         }
 
-        protected void serializeEvent(ChannelBuffer dest)
+        protected void serializeEvent(ByteBuf dest, int version)
         {
             CBUtil.writeEnumValue(change, dest);
             CBUtil.writeInet(node, dest);
         }
 
-        protected int eventSerializedSize()
+        protected int eventSerializedSize(int version)
         {
             return CBUtil.sizeOfEnumValue(change) + CBUtil.sizeOfInet(node);
         }
@@ -115,6 +115,23 @@
         {
             return change + " " + node;
         }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(change, node);
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            if (!(other instanceof TopologyChange))
+                return false;
+
+            TopologyChange tpc = (TopologyChange)other;
+            return Objects.equal(change, tpc.change)
+                && Objects.equal(node, tpc.node);
+        }
     }
 
     public static class StatusChange extends Event
@@ -142,20 +159,20 @@
         }
 
         // Assumes the type has already been deserialized
-        private static StatusChange deserializeEvent(ChannelBuffer cb)
+        private static StatusChange deserializeEvent(ByteBuf cb, int version)
         {
             Status status = CBUtil.readEnumValue(Status.class, cb);
             InetSocketAddress node = CBUtil.readInet(cb);
             return new StatusChange(status, node);
         }
 
-        protected void serializeEvent(ChannelBuffer dest)
+        protected void serializeEvent(ByteBuf dest, int version)
         {
             CBUtil.writeEnumValue(status, dest);
             CBUtil.writeInet(node, dest);
         }
 
-        protected int eventSerializedSize()
+        protected int eventSerializedSize(int version)
         {
             return CBUtil.sizeOfEnumValue(status) + CBUtil.sizeOfInet(node);
         }
@@ -165,56 +182,149 @@
         {
             return status + " " + node;
         }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(status, node);
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            if (!(other instanceof StatusChange))
+                return false;
+
+            StatusChange stc = (StatusChange)other;
+            return Objects.equal(status, stc.status)
+                && Objects.equal(node, stc.node);
+        }
     }
 
     public static class SchemaChange extends Event
     {
         public enum Change { CREATED, UPDATED, DROPPED }
+        public enum Target { KEYSPACE, TABLE, TYPE }
 
         public final Change change;
+        public final Target target;
         public final String keyspace;
-        public final String table;
+        public final String tableOrType;
 
-        public SchemaChange(Change change, String keyspace, String table)
+        public SchemaChange(Change change, Target target, String keyspace, String tableOrType)
         {
             super(Type.SCHEMA_CHANGE);
             this.change = change;
+            this.target = target;
             this.keyspace = keyspace;
-            this.table = table;
+            this.tableOrType = tableOrType;
+            if (target != Target.KEYSPACE)
+                assert this.tableOrType != null : "Table or type should be set for non-keyspace schema change events";
         }
 
         public SchemaChange(Change change, String keyspace)
         {
-            this(change, keyspace, "");
+            this(change, Target.KEYSPACE, keyspace, null);
         }
 
         // Assumes the type has already been deserialized
-        private static SchemaChange deserializeEvent(ChannelBuffer cb)
+        public static SchemaChange deserializeEvent(ByteBuf cb, int version)
         {
             Change change = CBUtil.readEnumValue(Change.class, cb);
-            String keyspace = CBUtil.readString(cb);
-            String table = CBUtil.readString(cb);
-            return new SchemaChange(change, keyspace, table);
+            if (version >= 3)
+            {
+                Target target = CBUtil.readEnumValue(Target.class, cb);
+                String keyspace = CBUtil.readString(cb);
+                String tableOrType = target == Target.KEYSPACE ? null : CBUtil.readString(cb);
+                return new SchemaChange(change, target, keyspace, tableOrType);
+            }
+            else
+            {
+                String keyspace = CBUtil.readString(cb);
+                String table = CBUtil.readString(cb);
+                return new SchemaChange(change, table.isEmpty() ? Target.KEYSPACE : Target.TABLE, keyspace, table.isEmpty() ? null : table);
+            }
         }
 
-        protected void serializeEvent(ChannelBuffer dest)
+        public void serializeEvent(ByteBuf dest, int version)
         {
-            CBUtil.writeEnumValue(change, dest);
-            CBUtil.writeString(keyspace, dest);
-            CBUtil.writeString(table, dest);
+            if (version >= 3)
+            {
+                CBUtil.writeEnumValue(change, dest);
+                CBUtil.writeEnumValue(target, dest);
+                CBUtil.writeString(keyspace, dest);
+                if (target != Target.KEYSPACE)
+                    CBUtil.writeString(tableOrType, dest);
+            }
+            else
+            {
+                if (target == Target.TYPE)
+                {
+                    // For the v1/v2 protocol, we have no way to represent type changes, so we simply say the keyspace
+                    // was updated.  See CASSANDRA-7617.
+                    CBUtil.writeEnumValue(Change.UPDATED, dest);
+                    CBUtil.writeString(keyspace, dest);
+                    CBUtil.writeString("", dest);
+                }
+                else
+                {
+                    CBUtil.writeEnumValue(change, dest);
+                    CBUtil.writeString(keyspace, dest);
+                    CBUtil.writeString(target == Target.KEYSPACE ? "" : tableOrType, dest);
+                }
+            }
         }
 
-        protected int eventSerializedSize()
+        public int eventSerializedSize(int version)
         {
-            return CBUtil.sizeOfEnumValue(change)
-                 + CBUtil.sizeOfString(keyspace)
-                 + CBUtil.sizeOfString(table);
+            if (version >= 3)
+            {
+                int size = CBUtil.sizeOfEnumValue(change)
+                         + CBUtil.sizeOfEnumValue(target)
+                         + CBUtil.sizeOfString(keyspace);
+
+                if (target != Target.KEYSPACE)
+                    size += CBUtil.sizeOfString(tableOrType);
+
+                return size;
+            }
+            else
+            {
+                if (target == Target.TYPE)
+                {
+                    return CBUtil.sizeOfEnumValue(Change.UPDATED)
+                         + CBUtil.sizeOfString(keyspace)
+                         + CBUtil.sizeOfString("");
+                }
+                return CBUtil.sizeOfEnumValue(change)
+                     + CBUtil.sizeOfString(keyspace)
+                     + CBUtil.sizeOfString(target == Target.KEYSPACE ? "" : tableOrType);
+            }
         }
 
         @Override
         public String toString()
         {
-            return change + " " + keyspace + (table.isEmpty() ? "" : "." + table);
+            return change + " " + target + " " + keyspace + (tableOrType == null ? "" : "." + tableOrType);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(change, target, keyspace, tableOrType);
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            if (!(other instanceof SchemaChange))
+                return false;
+
+            SchemaChange scc = (SchemaChange)other;
+            return Objects.equal(change, scc.change)
+                && Objects.equal(target, scc.target)
+                && Objects.equal(keyspace, scc.keyspace)
+                && Objects.equal(tableOrType, scc.tableOrType);
         }
     }
 }
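
For pre-v3 clients the serializer above cannot express a TYPE target, so a user-defined-type change is downgraded to a keyspace-level UPDATED event (per the CASSANDRA-7617 note in the code). A compressed sketch of that downgrade rule; the class and method names here are illustrative, not the full serializer:

```java
// Minimal sketch of the pre-v3 downgrade rule; names mirror the diff above.
enum Target { KEYSPACE, TABLE, TYPE }
enum Change { CREATED, UPDATED, DROPPED }

final class LegacySchemaChangeView
{
    final Change change;
    final String keyspace;
    final String table; // empty string means "keyspace-level" in v1/v2

    private LegacySchemaChangeView(Change change, String keyspace, String table)
    {
        this.change = change;
        this.keyspace = keyspace;
        this.table = table;
    }

    static LegacySchemaChangeView of(Change change, Target target, String keyspace, String tableOrType)
    {
        if (target == Target.TYPE)
            // v1/v2 cannot represent type changes, so report the keyspace as updated.
            return new LegacySchemaChangeView(Change.UPDATED, keyspace, "");
        return new LegacySchemaChangeView(change, keyspace,
                                          target == Target.KEYSPACE ? "" : tableOrType);
    }
}
```
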
diff --git a/src/java/org/apache/cassandra/transport/Frame.java b/src/java/org/apache/cassandra/transport/Frame.java
index 89755df..01bee10 100644
--- a/src/java/org/apache/cassandra/transport/Frame.java
+++ b/src/java/org/apache/cassandra/transport/Frame.java
@@ -20,27 +20,37 @@
 
 import java.io.IOException;
 import java.util.EnumSet;
+import java.util.List;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
-import org.jboss.netty.channel.*;
-import org.jboss.netty.handler.codec.oneone.OneToOneDecoder;
-import org.jboss.netty.handler.codec.oneone.OneToOneEncoder;
-import org.jboss.netty.handler.codec.frame.*;
+import io.netty.buffer.ByteBuf;
+import io.netty.channel.*;
+import io.netty.handler.codec.ByteToMessageDecoder;
+import io.netty.handler.codec.MessageToMessageDecoder;
+import io.netty.handler.codec.MessageToMessageEncoder;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.transport.messages.ErrorMessage;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class Frame
 {
     public final Header header;
-    public final ChannelBuffer body;
+    public final ByteBuf body;
 
     /**
-     * On-wire frame.
-     * Frames are defined as:
+     * An on-wire frame consists of a header and a body.
+     *
+     * The header is defined the following way in native protocol version 3 and later:
+     *
+     *   0         8        16        24        32         40
+     *   +---------+---------+---------+---------+---------+
+     *   | version |  flags  |      stream       | opcode  |
+     *   +---------+---------+---------+---------+---------+
+     *   |                length                 |
+     *   +---------+---------+---------+---------+
+     *
+     *
+     * In versions 1 and 2 the header has a smaller (1 byte) stream id, and is thus defined the following way:
      *
      *   0         8        16        24        32
      *   +---------+---------+---------+---------+
@@ -49,13 +59,23 @@
      *   |                length                 |
      *   +---------+---------+---------+---------+
      */
-    private Frame(Header header, ChannelBuffer body)
+    private Frame(Header header, ByteBuf body)
     {
         this.header = header;
         this.body = body;
     }
 
-    public static Frame create(Message.Type type, int streamId, int version, EnumSet<Header.Flag> flags, ChannelBuffer body)
+    public void retain()
+    {
+        body.retain();
+    }
+
+    public boolean release()
+    {
+        return body.release();
+    }
+
+    public static Frame create(Message.Type type, int streamId, int version, EnumSet<Header.Flag> flags, ByteBuf body)
     {
         Header header = new Header(version, flags, streamId, type);
         return new Frame(header, body);
@@ -63,9 +83,10 @@
 
     public static class Header
     {
-        public static final int LENGTH = 8;
+        // 8 bytes in protocol versions 1 and 2, 9 bytes in protocol version 3 and later
+        public static final int MODERN_LENGTH = 9;
+        public static final int LEGACY_LENGTH = 8;
 
-        public static final int BODY_LENGTH_OFFSET = 4;
         public static final int BODY_LENGTH_SIZE = 4;
 
         public final int version;
@@ -115,12 +136,12 @@
         }
     }
 
-    public Frame with(ChannelBuffer newBody)
+    public Frame with(ByteBuf newBody)
     {
         return new Frame(header, newBody);
     }
 
-    public static class Decoder extends FrameDecoder
+    public static class Decoder extends ByteToMessageDecoder
     {
         private static final int MAX_FRAME_LENGTH = DatabaseDescriptor.getNativeTransportMaxFrameSize();
 
@@ -137,7 +158,7 @@
         }
 
         @Override
-        protected Object decode(ChannelHandlerContext ctx, Channel channel, ChannelBuffer buffer)
+        protected void decode(ChannelHandlerContext ctx, ByteBuf buffer, List<Object> results)
         throws Exception
         {
             if (discardingTooLongFrame)
@@ -146,37 +167,55 @@
                 // If we have discarded everything, throw the exception
                 if (bytesToDiscard <= 0)
                     fail();
-                return null;
+                return;
             }
 
-            // Wait until we have read at least the header
-            if (buffer.readableBytes() < Header.LENGTH)
-                return null;
+            // Wait until we have read at least the short header
+            if (buffer.readableBytes() < Header.LEGACY_LENGTH)
+                return;
 
             int idx = buffer.readerIndex();
 
-            int firstByte = buffer.getByte(idx);
+            int firstByte = buffer.getByte(idx++);
             Message.Direction direction = Message.Direction.extractFromVersion(firstByte);
             int version = firstByte & 0x7F;
 
             if (version > Server.CURRENT_VERSION)
                 throw new ProtocolException("Invalid or unsupported protocol version: " + version);
 
-            int flags = buffer.getByte(idx + 1);
-            int streamId = buffer.getByte(idx + 2);
+            // Wait until we have the complete V3+ header
+            if (version >= Server.VERSION_3 && buffer.readableBytes() < Header.MODERN_LENGTH)
+                return;
+
+            int flags = buffer.getByte(idx++);
+
+            int streamId, headerLength;
+            if (version >= Server.VERSION_3)
+            {
+                streamId = buffer.getShort(idx);
+                idx += 2;
+                headerLength = Header.MODERN_LENGTH;
+            }
+            else
+            {
+                streamId = buffer.getByte(idx);
+                idx++;
+                headerLength = Header.LEGACY_LENGTH;
+            }
 
             // This throws a protocol exceptions if the opcode is unknown
-            Message.Type type = Message.Type.fromOpcode(buffer.getByte(idx + 3), direction);
+            Message.Type type = Message.Type.fromOpcode(buffer.getByte(idx++), direction);
 
-            long bodyLength = buffer.getUnsignedInt(idx + Header.BODY_LENGTH_OFFSET);
+            long bodyLength = buffer.getUnsignedInt(idx);
+            idx += Header.BODY_LENGTH_SIZE;
 
             if (bodyLength < 0)
             {
-                buffer.skipBytes(Header.LENGTH);
+                buffer.skipBytes(headerLength);
                 throw new ProtocolException("Invalid frame body length: " + bodyLength);
             }
 
-            long frameLength = bodyLength + Header.LENGTH;
+            long frameLength = bodyLength + headerLength;
             if (frameLength > MAX_FRAME_LENGTH)
             {
                 // Enter the discard mode and discard everything received so far.
@@ -186,31 +225,32 @@
                 bytesToDiscard = discard(buffer, frameLength);
                 if (bytesToDiscard <= 0)
                     fail();
-                return null;
+                return;
             }
 
-            // never overflows because it's less than the max frame length
-            int frameLengthInt = (int) frameLength;
-            if (buffer.readableBytes() < frameLengthInt)
-                return null;
+            if (buffer.readableBytes() < frameLength)
+                return;
 
             // extract body
-            ChannelBuffer body = extractFrame(buffer, idx + Header.LENGTH, (int)bodyLength);
-            buffer.readerIndex(idx + frameLengthInt);
+            ByteBuf body = buffer.slice(idx, (int) bodyLength);
+            body.retain();
+
+            idx += bodyLength;
+            buffer.readerIndex(idx);
 
-            Connection connection = (Connection)channel.getAttachment();
+            Connection connection = ctx.channel().attr(Connection.attributeKey).get();
             if (connection == null)
             {
                 // First message seen on this channel, attach the connection object
-                connection = factory.newConnection(channel, version);
-                channel.setAttachment(connection);
+                connection = factory.newConnection(ctx.channel(), version);
+                ctx.channel().attr(Connection.attributeKey).set(connection);
             }
             else if (connection.getVersion() != version)
             {
                 throw new ProtocolException(String.format("Invalid message version. Got %d but previous messages on this connection had version %d", version, connection.getVersion()));
             }
 
-            return new Frame(new Header(version, flags, streamId, type), body);
+            results.add(new Frame(new Header(version, flags, streamId, type), body));
         }
 
         private void fail()
@@ -225,76 +265,90 @@
     }
 
     // How much remains to be discarded
-    private static long discard(ChannelBuffer buffer, long remainingToDiscard)
+    private static long discard(ByteBuf buffer, long remainingToDiscard)
     {
         int availableToDiscard = (int) Math.min(remainingToDiscard, buffer.readableBytes());
         buffer.skipBytes(availableToDiscard);
         return remainingToDiscard - availableToDiscard;
     }
 
-    public static class Encoder extends OneToOneEncoder
+    @ChannelHandler.Sharable
+    public static class Encoder extends MessageToMessageEncoder<Frame>
     {
-        public Object encode(ChannelHandlerContext ctx, Channel channel, Object msg)
+        public void encode(ChannelHandlerContext ctx, Frame frame, List<Object> results)
         throws IOException
         {
-            assert msg instanceof Frame : "Expecting frame, got " + msg;
+            int headerLength = frame.header.version >= Server.VERSION_3
+                             ? Header.MODERN_LENGTH
+                             : Header.LEGACY_LENGTH;
+            ByteBuf header = CBUtil.allocator.buffer(headerLength);
 
-            Frame frame = (Frame)msg;
-
-            ChannelBuffer header = ChannelBuffers.buffer(Frame.Header.LENGTH);
             Message.Type type = frame.header.type;
             header.writeByte(type.direction.addToVersion(frame.header.version));
             header.writeByte(Header.Flag.serialize(frame.header.flags));
-            header.writeByte(frame.header.streamId);
+
+            if (frame.header.version >= Server.VERSION_3)
+                header.writeShort(frame.header.streamId);
+            else
+                header.writeByte(frame.header.streamId);
+
             header.writeByte(type.opcode);
             header.writeInt(frame.body.readableBytes());
 
-            return ChannelBuffers.wrappedBuffer(header, frame.body);
+            results.add(header);
+            results.add(frame.body);
         }
     }
 
-    public static class Decompressor extends OneToOneDecoder
+    @ChannelHandler.Sharable
+    public static class Decompressor extends MessageToMessageDecoder<Frame>
     {
-        public Object decode(ChannelHandlerContext ctx, Channel channel, Object msg)
+        public void decode(ChannelHandlerContext ctx, Frame frame, List<Object> results)
         throws IOException
         {
-            assert msg instanceof Frame : "Expecting frame, got " + msg;
-
-            Frame frame = (Frame)msg;
-            Connection connection = (Connection)channel.getAttachment();
+            Connection connection = ctx.channel().attr(Connection.attributeKey).get();
 
             if (!frame.header.flags.contains(Header.Flag.COMPRESSED) || connection == null)
-                return frame;
+            {
+                results.add(frame);
+                return;
+            }
 
             FrameCompressor compressor = connection.getCompressor();
             if (compressor == null)
-                return frame;
+            {
+                results.add(frame);
+                return;
+            }
 
-            return compressor.decompress(frame);
+            results.add(compressor.decompress(frame));
         }
     }
 
-    public static class Compressor extends OneToOneEncoder
+    @ChannelHandler.Sharable
+    public static class Compressor extends MessageToMessageEncoder<Frame>
     {
-        public Object encode(ChannelHandlerContext ctx, Channel channel, Object msg)
+        public void encode(ChannelHandlerContext ctx, Frame frame, List<Object> results)
         throws IOException
         {
-            assert msg instanceof Frame : "Expecting frame, got " + msg;
-
-            Frame frame = (Frame)msg;
-            Connection connection = (Connection)channel.getAttachment();
+            Connection connection = ctx.channel().attr(Connection.attributeKey).get();
 
             // Never compress STARTUP messages
             if (frame.header.type == Message.Type.STARTUP || connection == null)
-                return frame;
+            {
+                results.add(frame);
+                return;
+            }
 
             FrameCompressor compressor = connection.getCompressor();
             if (compressor == null)
-                return frame;
+            {
+                results.add(frame);
+                return;
+            }
 
             frame.header.flags.add(Header.Flag.COMPRESSED);
-            return compressor.compress(frame);
-
+            results.add(compressor.compress(frame));
         }
     }
 }
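
The encoder writes one of two header layouts: protocol v3 and later uses a 9-byte header with a 2-byte stream id, while v1/v2 keep the 8-byte header with a 1-byte stream id. A hedged, self-contained sketch of that layout (constants and field handling simplified; the real encoder also folds the direction bit into the version byte and uses the pooled allocator):

```java
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;

// Sketch of the two frame-header layouts handled by Frame.Encoder above.
final class HeaderLayoutSketch
{
    static ByteBuf encodeHeader(int version, int flags, int streamId, int opcode, int bodyLength)
    {
        boolean modern = version >= 3;
        ByteBuf header = Unpooled.buffer(modern ? 9 : 8);
        header.writeByte(version);       // a response would also set the direction bit
        header.writeByte(flags);
        if (modern)
            header.writeShort(streamId); // 2-byte stream id in v3+
        else
            header.writeByte(streamId);  // 1-byte stream id in v1/v2
        header.writeByte(opcode);
        header.writeInt(bodyLength);
        return header;
    }
}
```
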
diff --git a/src/java/org/apache/cassandra/transport/FrameCompressor.java b/src/java/org/apache/cassandra/transport/FrameCompressor.java
index a3c3848..8ab735f 100644
--- a/src/java/org/apache/cassandra/transport/FrameCompressor.java
+++ b/src/java/org/apache/cassandra/transport/FrameCompressor.java
@@ -19,7 +19,8 @@
 
 import java.io.IOException;
 
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.Unpooled;
 import org.xerial.snappy.Snappy;
 import org.xerial.snappy.SnappyError;
 
@@ -74,10 +75,25 @@
         public Frame compress(Frame frame) throws IOException
         {
             byte[] input = CBUtil.readRawBytes(frame.body);
-            byte[] output = new byte[Snappy.maxCompressedLength(input.length)];
+            ByteBuf output = CBUtil.allocator.heapBuffer(Snappy.maxCompressedLength(input.length));
 
-            int written = Snappy.compress(input, 0, input.length, output, 0);
-            return frame.with(ChannelBuffers.wrappedBuffer(output, 0, written));
+            try
+            {
+                int written = Snappy.compress(input, 0, input.length, output.array(), output.arrayOffset());
+                output.writerIndex(written);
+            }
+            catch (final Throwable e)
+            {
+                output.release();
+                throw e;
+            }
+            finally
+            {
+                //release the old frame
+                frame.release();
+            }
+
+            return frame.with(output);
         }
 
         public Frame decompress(Frame frame) throws IOException
@@ -87,9 +103,25 @@
             if (!Snappy.isValidCompressedBuffer(input, 0, input.length))
                 throw new ProtocolException("Provided frame does not appear to be Snappy compressed");
 
-            byte[] output = new byte[Snappy.uncompressedLength(input)];
-            int size = Snappy.uncompress(input, 0, input.length, output, 0);
-            return frame.with(ChannelBuffers.wrappedBuffer(output, 0, size));
+            ByteBuf output = CBUtil.allocator.heapBuffer(Snappy.uncompressedLength(input));
+
+            try
+            {
+                int size = Snappy.uncompress(input, 0, input.length, output.array(), output.arrayOffset());
+                output.writerIndex(size);
+            }
+            catch (final Throwable e)
+            {
+                output.release();
+                throw e;
+            }
+            finally
+            {
+                //release the old frame
+                frame.release();
+            }
+
+            return frame.with(output);
         }
     }
 
@@ -121,21 +153,32 @@
             byte[] input = CBUtil.readRawBytes(frame.body);
 
             int maxCompressedLength = compressor.maxCompressedLength(input.length);
-            byte[] output = new byte[INTEGER_BYTES + maxCompressedLength];
+            ByteBuf outputBuf = CBUtil.allocator.heapBuffer(INTEGER_BYTES + maxCompressedLength);
 
-            output[0] = (byte) (input.length >>> 24);
-            output[1] = (byte) (input.length >>> 16);
-            output[2] = (byte) (input.length >>>  8);
-            output[3] = (byte) (input.length);
+            byte[] output = outputBuf.array();
+            int outputOffset = outputBuf.arrayOffset();
+
+            output[outputOffset + 0] = (byte) (input.length >>> 24);
+            output[outputOffset + 1] = (byte) (input.length >>> 16);
+            output[outputOffset + 2] = (byte) (input.length >>>  8);
+            output[outputOffset + 3] = (byte) (input.length);
 
             try
             {
-                int written = compressor.compress(input, 0, input.length, output, INTEGER_BYTES, maxCompressedLength);
-                return frame.with(ChannelBuffers.wrappedBuffer(output, 0, INTEGER_BYTES + written));
+                int written = compressor.compress(input, 0, input.length, output, outputOffset + INTEGER_BYTES, maxCompressedLength);
+                outputBuf.writerIndex(INTEGER_BYTES + written);
+
+                return frame.with(outputBuf);
             }
-            catch (LZ4Exception e)
+            catch (final Throwable e)
             {
-                throw new IOException(e);
+                outputBuf.release();
+                throw e;
+            }
+            finally
+            {
+                //release the old frame
+                frame.release();
             }
         }
 
@@ -148,19 +191,27 @@
                                    | ((input[2] & 0xFF) <<  8)
                                    | ((input[3] & 0xFF));
 
-            byte[] output = new byte[uncompressedLength];
+            ByteBuf output = CBUtil.allocator.heapBuffer(uncompressedLength);
 
             try
             {
-                int read = decompressor.decompress(input, INTEGER_BYTES, output, 0, uncompressedLength);
+                int read = decompressor.decompress(input, INTEGER_BYTES, output.array(), output.arrayOffset(), uncompressedLength);
                 if (read != input.length - INTEGER_BYTES)
                     throw new IOException("Compressed lengths mismatch");
 
-                return frame.with(ChannelBuffers.wrappedBuffer(output));
+                output.writerIndex(uncompressedLength);
+
+                return frame.with(output);
             }
-            catch (LZ4Exception e)
+            catch (final Throwable e)
             {
-                throw new IOException(e);
+                output.release();
+                throw e;
+            }
+            finally
+            {
+                //release the old frame
+                frame.release();
             }
         }
     }
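
Both LZ4 paths above frame the payload as a 4-byte big-endian uncompressed-length prefix followed by the compressed bytes, so the decompressor can size its output buffer before calling LZ4. A tiny sketch of just that prefix handling, mirroring the shifts in the diff:

```java
// Sketch of the 4-byte big-endian "uncompressed length" prefix written by the
// LZ4 compressor above, ahead of the compressed payload.
final class Lz4PrefixSketch
{
    static void writePrefix(byte[] output, int offset, int uncompressedLength)
    {
        output[offset]     = (byte) (uncompressedLength >>> 24);
        output[offset + 1] = (byte) (uncompressedLength >>> 16);
        output[offset + 2] = (byte) (uncompressedLength >>> 8);
        output[offset + 3] = (byte) uncompressedLength;
    }

    static int readPrefix(byte[] input)
    {
        return ((input[0] & 0xFF) << 24)
             | ((input[1] & 0xFF) << 16)
             | ((input[2] & 0xFF) << 8)
             |  (input[3] & 0xFF);
    }
}
```
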
diff --git a/src/java/org/apache/cassandra/transport/Message.java b/src/java/org/apache/cassandra/transport/Message.java
index eb38558..9efc424 100644
--- a/src/java/org/apache/cassandra/transport/Message.java
+++ b/src/java/org/apache/cassandra/transport/Message.java
@@ -17,23 +17,32 @@
  */
 package org.apache.cassandra.transport;
 
+import java.util.ArrayList;
 import java.io.IOException;
 import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.channel.*;
+import io.netty.handler.codec.MessageToMessageDecoder;
+import io.netty.handler.codec.MessageToMessageEncoder;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.ImmutableSet;
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
-import org.jboss.netty.channel.*;
-import org.jboss.netty.handler.codec.oneone.OneToOneDecoder;
-import org.jboss.netty.handler.codec.oneone.OneToOneEncoder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.transport.messages.*;
 import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * A message from the CQL binary protocol.
@@ -134,8 +143,9 @@
     }
 
     public final Type type;
-    protected volatile Connection connection;
-    private volatile int streamId;
+    protected Connection connection;
+    private int streamId;
+    private Frame sourceFrame;
 
     protected Message(Type type)
     {
@@ -163,6 +173,16 @@
         return streamId;
     }
 
+    public void setSourceFrame(Frame sourceFrame)
+    {
+        this.sourceFrame = sourceFrame;
+    }
+
+    public Frame getSourceFrame()
+    {
+        return sourceFrame;
+    }
+
     public static abstract class Request extends Message
     {
         protected boolean tracingRequested;
@@ -212,13 +232,11 @@
         }
     }
 
-    public static class ProtocolDecoder extends OneToOneDecoder
+    @ChannelHandler.Sharable
+    public static class ProtocolDecoder extends MessageToMessageDecoder<Frame>
     {
-        public Object decode(ChannelHandlerContext ctx, Channel channel, Object msg)
+        public void decode(ChannelHandlerContext ctx, Frame frame, List results)
         {
-            assert msg instanceof Frame : "Expecting frame, got " + msg;
-
-            Frame frame = (Frame)msg;
             boolean isRequest = frame.header.type.direction == Direction.REQUEST;
             boolean isTracing = frame.header.flags.contains(Frame.Header.Flag.TRACING);
 
@@ -228,12 +246,14 @@
             {
                 Message message = frame.header.type.codec.decode(frame.body, frame.header.version);
                 message.setStreamId(frame.header.streamId);
+                message.setSourceFrame(frame);
 
                 if (isRequest)
                 {
                     assert message instanceof Request;
                     Request req = (Request)message;
-                    req.attach((Connection)channel.getAttachment());
+                    Connection connection = ctx.channel().attr(Connection.attributeKey).get();
+                    req.attach(connection);
                     if (isTracing)
                         req.setTracingRequested();
                 }
@@ -244,109 +264,224 @@
                         ((Response)message).setTracingId(tracingId);
                 }
 
-                return message;
+                results.add(message);
             }
-            catch (Exception ex)
+            catch (Throwable ex)
             {
+                frame.release();
                 // Remember the streamId
                 throw ErrorMessage.wrap(ex, frame.header.streamId);
             }
         }
     }
 
-    public static class ProtocolEncoder extends OneToOneEncoder
+    @ChannelHandler.Sharable
+    public static class ProtocolEncoder extends MessageToMessageEncoder<Message>
     {
-        public Object encode(ChannelHandlerContext ctx, Channel channel, Object msg)
+        public void encode(ChannelHandlerContext ctx, Message message, List results)
         {
-            assert msg instanceof Message : "Expecting message, got " + msg;
-
-            Message message = (Message)msg;
-
-            Connection connection = (Connection)channel.getAttachment();
+            Connection connection = ctx.channel().attr(Connection.attributeKey).get();
             // The only case the connection can be null is when we send the initial STARTUP message (client side thus)
             int version = connection == null ? Server.CURRENT_VERSION : connection.getVersion();
 
             EnumSet<Frame.Header.Flag> flags = EnumSet.noneOf(Frame.Header.Flag.class);
 
             Codec<Message> codec = (Codec<Message>)message.type.codec;
-            int messageSize = codec.encodedSize(message, version);
-            ChannelBuffer body;
-            if (message instanceof Response)
+            try
             {
-                UUID tracingId = ((Response)message).getTracingId();
-                if (tracingId != null)
+                int messageSize = codec.encodedSize(message, version);
+                ByteBuf body;
+                if (message instanceof Response)
                 {
-                    body = ChannelBuffers.buffer(CBUtil.sizeOfUUID(tracingId) + messageSize);
-                    CBUtil.writeUUID(tracingId, body);
-                    flags.add(Frame.Header.Flag.TRACING);
+                    UUID tracingId = ((Response)message).getTracingId();
+                    if (tracingId != null)
+                    {
+                        body = CBUtil.allocator.buffer(CBUtil.sizeOfUUID(tracingId) + messageSize);
+                        CBUtil.writeUUID(tracingId, body);
+                        flags.add(Frame.Header.Flag.TRACING);
+                    }
+                    else
+                    {
+                        body = CBUtil.allocator.buffer(messageSize);
+                    }
                 }
                 else
                 {
-                    body = ChannelBuffers.buffer(messageSize);
+                    assert message instanceof Request;
+                    body = CBUtil.allocator.buffer(messageSize);
+                    if (((Request)message).isTracingRequested())
+                        flags.add(Frame.Header.Flag.TRACING);
                 }
-            }
-            else
-            {
-                assert message instanceof Request;
-                body = ChannelBuffers.buffer(messageSize);
-                if (((Request)message).isTracingRequested())
-                    flags.add(Frame.Header.Flag.TRACING);
-            }
 
-            codec.encode(message, body, version);
-            return Frame.create(message.type, message.getStreamId(), version, flags, body);
+                try
+                {
+                    codec.encode(message, body, version);
+                }
+                catch (Throwable e)
+                {
+                    body.release();
+                    throw e;
+                }
+
+                results.add(Frame.create(message.type, message.getStreamId(), version, flags, body));
+            }
+            catch (Throwable e)
+            {
+                throw ErrorMessage.wrap(e, message.getStreamId());
+            }
         }
     }
 
-    public static class Dispatcher extends SimpleChannelUpstreamHandler
+    @ChannelHandler.Sharable
+    public static class Dispatcher extends SimpleChannelInboundHandler<Request>
     {
-        @Override
-        public void messageReceived(ChannelHandlerContext ctx, MessageEvent e)
+        private static class FlushItem
         {
-            assert e.getMessage() instanceof Message : "Expecting message, got " + e.getMessage();
+            final ChannelHandlerContext ctx;
+            final Object response;
+            final Frame sourceFrame;
+            private FlushItem(ChannelHandlerContext ctx, Object response, Frame sourceFrame)
+            {
+                this.ctx = ctx;
+                this.sourceFrame = sourceFrame;
+                this.response = response;
+            }
+        }
 
-            if (e.getMessage() instanceof Response)
-                throw new ProtocolException("Invalid response message received, expecting requests");
+        private final class Flusher implements Runnable
+        {
+            final EventLoop eventLoop;
+            final ConcurrentLinkedQueue<FlushItem> queued = new ConcurrentLinkedQueue<>();
+            final AtomicBoolean running = new AtomicBoolean(false);
+            final HashSet<ChannelHandlerContext> channels = new HashSet<>();
+            final List<FlushItem> flushed = new ArrayList<>();
+            int runsSinceFlush = 0;
+            int runsWithNoWork = 0;
+            private Flusher(EventLoop eventLoop)
+            {
+                this.eventLoop = eventLoop;
+            }
+            void start()
+            {
+                if (!running.get() && running.compareAndSet(false, true))
+                {
+                    this.eventLoop.execute(this);
+                }
+            }
+            public void run()
+            {
 
-            Request request = (Request)e.getMessage();
+                boolean doneWork = false;
+                FlushItem flush;
+                while ( null != (flush = queued.poll()) )
+                {
+                    channels.add(flush.ctx);
+                    flush.ctx.write(flush.response, flush.ctx.voidPromise());
+                    flushed.add(flush);
+                    doneWork = true;
+                }
+
+                runsSinceFlush++;
+
+                if (!doneWork || runsSinceFlush > 2 || flushed.size() > 50)
+                {
+                    for (ChannelHandlerContext channel : channels)
+                        channel.flush();
+                    for (FlushItem item : flushed)
+                        item.sourceFrame.release();
+
+                    channels.clear();
+                    flushed.clear();
+                    runsSinceFlush = 0;
+                }
+
+                if (doneWork)
+                {
+                    runsWithNoWork = 0;
+                }
+                else
+                {
+                    // either reschedule or cancel
+                    if (++runsWithNoWork > 5)
+                    {
+                        running.set(false);
+                        if (queued.isEmpty() || !running.compareAndSet(false, true))
+                            return;
+                    }
+                }
+
+                eventLoop.schedule(this, 10000, TimeUnit.NANOSECONDS);
+            }
+        }
+
+        private static final ConcurrentMap<EventLoop, Flusher> flusherLookup = new ConcurrentHashMap<>();
+
+        public Dispatcher()
+        {
+            super(false);
+        }
+
+        @Override
+        public void channelRead0(ChannelHandlerContext ctx, Request request)
+        {
+
+            final Response response;
+            final ServerConnection connection;
 
             try
             {
                 assert request.connection() instanceof ServerConnection;
-                ServerConnection connection = (ServerConnection)request.connection();
+                connection = (ServerConnection)request.connection();
                 QueryState qstate = connection.validateNewMessage(request.type, connection.getVersion(), request.getStreamId());
 
                 logger.debug("Received: {}, v={}", request, connection.getVersion());
 
-                Response response = request.execute(qstate);
+                response = request.execute(qstate);
                 response.setStreamId(request.getStreamId());
                 response.attach(connection);
                 connection.applyStateTransition(request.type, response.type);
-
-                logger.debug("Responding: {}, v={}", response, connection.getVersion());
-
-                ctx.getChannel().write(response);
             }
-            catch (Throwable ex)
+            catch (Throwable t)
             {
-                // Don't let the exception propagate to exceptionCaught() if we can help it so that we can assign the right streamID.
-                ctx.getChannel().write(ErrorMessage.fromException(ex, new UnexpectedChannelExceptionHandler(ctx.getChannel(), true)).setStreamId(request.getStreamId()));
+                JVMStabilityInspector.inspectThrowable(t);
+                UnexpectedChannelExceptionHandler handler = new UnexpectedChannelExceptionHandler(ctx.channel(), true);
+                flush(new FlushItem(ctx, ErrorMessage.fromException(t, handler).setStreamId(request.getStreamId()), request.getSourceFrame()));
+                return;
             }
+
+            logger.debug("Responding: {}, v={}", response, connection.getVersion());
+            flush(new FlushItem(ctx, response, request.getSourceFrame()));
+        }
+
+        private void flush(FlushItem item)
+        {
+            EventLoop loop = item.ctx.channel().eventLoop();
+            Flusher flusher = flusherLookup.get(loop);
+            if (flusher == null)
+            {
+                Flusher alt = flusherLookup.putIfAbsent(loop, flusher = new Flusher(loop));
+                if (alt != null)
+                    flusher = alt;
+            }
+
+            flusher.queued.add(item);
+            flusher.start();
         }
 
         @Override
-        public void exceptionCaught(final ChannelHandlerContext ctx, ExceptionEvent e)
+        public void exceptionCaught(final ChannelHandlerContext ctx, Throwable cause)
         throws Exception
         {
-            if (ctx.getChannel().isOpen())
+            if (ctx.channel().isOpen())
             {
-                ChannelFuture future = ctx.getChannel().write(ErrorMessage.fromException(e.getCause(), new UnexpectedChannelExceptionHandler(ctx.getChannel(), false)));
+                UnexpectedChannelExceptionHandler handler = new UnexpectedChannelExceptionHandler(ctx.channel(), false);
+                ChannelFuture future = ctx.writeAndFlush(ErrorMessage.fromException(cause, handler));
                 // On protocol exception, close the channel as soon as the message has been sent
-                if (e.getCause() instanceof ProtocolException)
+                if (cause instanceof ProtocolException)
                 {
                     future.addListener(new ChannelFutureListener() {
                         public void operationComplete(ChannelFuture future) {
-                            ctx.getChannel().close();
+                            ctx.close();
                         }
                     });
                 }
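
The new Dispatcher no longer writes and flushes each response individually: channelRead0 hands the response to a per-EventLoop Flusher, which writes it with a void promise, flushes each touched channel once per batch, and then releases the request's source frame. Below is a minimal sketch of that write-coalescing idea, assuming only Netty 4's public EventLoop/ChannelHandlerContext API; the names CoalescingFlusher, Item and enqueue are illustrative and not part of this patch.

    import io.netty.channel.ChannelHandlerContext;
    import io.netty.channel.EventLoop;

    import java.util.HashSet;
    import java.util.Set;
    import java.util.concurrent.ConcurrentLinkedQueue;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicBoolean;

    // Hypothetical illustration of write coalescing: responses are queued from any
    // thread, written on the owning event loop without an immediate flush, and the
    // touched channels are flushed once per drain pass.
    final class CoalescingFlusher implements Runnable
    {
        static final class Item
        {
            final ChannelHandlerContext ctx;
            final Object response;
            Item(ChannelHandlerContext ctx, Object response) { this.ctx = ctx; this.response = response; }
        }

        private final EventLoop loop;
        private final ConcurrentLinkedQueue<Item> queued = new ConcurrentLinkedQueue<Item>();
        private final AtomicBoolean running = new AtomicBoolean(false);

        CoalescingFlusher(EventLoop loop) { this.loop = loop; }

        void enqueue(ChannelHandlerContext ctx, Object response)
        {
            queued.add(new Item(ctx, response));
            if (running.compareAndSet(false, true))
                loop.execute(this);                        // start a drain pass on the event loop
        }

        public void run()
        {
            Set<ChannelHandlerContext> touched = new HashSet<ChannelHandlerContext>();
            Item item;
            while ((item = queued.poll()) != null)
            {
                item.ctx.write(item.response, item.ctx.voidPromise());  // no flush yet
                touched.add(item.ctx);
            }
            for (ChannelHandlerContext ctx : touched)
                ctx.flush();                               // one flush per channel per pass

            running.set(false);
            // If more work arrived while flushing, schedule another short pass.
            if (!queued.isEmpty() && running.compareAndSet(false, true))
                loop.schedule(this, 10, TimeUnit.MICROSECONDS);
        }
    }

The real Flusher above is more involved: it keeps rescheduling itself at a 10 microsecond tick, only flushes after a few runs or once 50 items have accumulated, parks itself after several idle runs, and is looked up per EventLoop through the flusherLookup ConcurrentHashMap.
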
diff --git a/src/java/org/apache/cassandra/transport/OptionCodec.java b/src/java/org/apache/cassandra/transport/OptionCodec.java
index c562889..ec2a1fa 100644
--- a/src/java/org/apache/cassandra/transport/OptionCodec.java
+++ b/src/java/org/apache/cassandra/transport/OptionCodec.java
@@ -21,9 +21,9 @@
 import java.util.EnumMap;
 import java.util.Map;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
+import io.netty.buffer.Unpooled;
 import org.apache.cassandra.utils.Pair;
 
 public class OptionCodec<T extends Enum<T> & OptionCodec.Codecable<T>>
@@ -32,9 +32,9 @@
     {
         public int getId();
 
-        public Object readValue(ChannelBuffer cb);
-        public void writeValue(Object value, ChannelBuffer cb);
-        public int serializedValueSize(Object obj);
+        public Object readValue(ByteBuf cb, int version);
+        public void writeValue(Object value, ByteBuf cb, int version);
+        public int serializedValueSize(Object obj, int version);
     }
 
     private final Class<T> klass;
@@ -66,14 +66,14 @@
         return opt;
     }
 
-    public Map<T, Object> decode(ChannelBuffer body)
+    public Map<T, Object> decode(ByteBuf body, int version)
     {
         EnumMap<T, Object> options = new EnumMap<T, Object>(klass);
         int n = body.readUnsignedShort();
         for (int i = 0; i < n; i++)
         {
             T opt = fromId(body.readUnsignedShort());
-            Object value = opt.readValue(body);
+            Object value = opt.readValue(body, version);
             if (options.containsKey(opt))
                 throw new ProtocolException(String.format("Duplicate option %s in message", opt.name()));
             options.put(opt, value);
@@ -81,41 +81,41 @@
         return options;
     }
 
-    public ChannelBuffer encode(Map<T, Object> options)
+    public ByteBuf encode(Map<T, Object> options, int version)
     {
         int optLength = 2;
         for (Map.Entry<T, Object> entry : options.entrySet())
-            optLength += 2 + entry.getKey().serializedValueSize(entry.getValue());
-        ChannelBuffer cb = ChannelBuffers.buffer(optLength);
+            optLength += 2 + entry.getKey().serializedValueSize(entry.getValue(), version);
+        ByteBuf cb = Unpooled.buffer(optLength);
         cb.writeShort(options.size());
         for (Map.Entry<T, Object> entry : options.entrySet())
         {
             T opt = entry.getKey();
             cb.writeShort(opt.getId());
-            opt.writeValue(entry.getValue(), cb);
+            opt.writeValue(entry.getValue(), cb, version);
         }
         return cb;
     }
 
-    public Pair<T, Object> decodeOne(ChannelBuffer body)
+    public Pair<T, Object> decodeOne(ByteBuf body, int version)
     {
         T opt = fromId(body.readUnsignedShort());
-        Object value = opt.readValue(body);
+        Object value = opt.readValue(body, version);
         return Pair.create(opt, value);
     }
 
-    public void writeOne(Pair<T, Object> option, ChannelBuffer dest)
+    public void writeOne(Pair<T, Object> option, ByteBuf dest, int version)
     {
         T opt = option.left;
         Object obj = option.right;
         dest.writeShort(opt.getId());
-        opt.writeValue(obj, dest);
+        opt.writeValue(obj, dest, version);
     }
 
-    public int oneSerializedSize(Pair<T, Object> option)
+    public int oneSerializedSize(Pair<T, Object> option, int version)
     {
         T opt = option.left;
         Object obj = option.right;
-        return 2 + opt.serializedValueSize(obj);
+        return 2 + opt.serializedValueSize(obj, version);
     }
 }
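
After this change, every method of OptionCodec.Codecable takes a ByteBuf and the protocol version. A hypothetical option enum showing the shape an implementation now takes; ExampleOption and its single NAME constant are illustrative, only the method signatures come from the patch, and CBUtil.sizeOfString is assumed to exist alongside the readString/writeString helpers used elsewhere in this diff.

    import io.netty.buffer.ByteBuf;

    import org.apache.cassandra.transport.CBUtil;
    import org.apache.cassandra.transport.OptionCodec;

    // Hypothetical option enum implementing the version-aware Codecable contract.
    enum ExampleOption implements OptionCodec.Codecable<ExampleOption>
    {
        NAME(1);

        private final int id;

        ExampleOption(int id) { this.id = id; }

        public int getId() { return id; }

        public Object readValue(ByteBuf cb, int version)
        {
            return CBUtil.readString(cb);
        }

        public void writeValue(Object value, ByteBuf cb, int version)
        {
            CBUtil.writeString((String) value, cb);
        }

        public int serializedValueSize(Object obj, int version)
        {
            return CBUtil.sizeOfString((String) obj);
        }
    }

Since OptionCodec keeps the enum Class<T>, a codec for such an enum would presumably be constructed from ExampleOption.class and then used via the decode/encode methods shown in the hunk above.
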
diff --git a/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java b/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java
index c2e6033..9cac645 100644
--- a/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java
+++ b/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java
@@ -20,83 +20,77 @@
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 
-import org.jboss.netty.handler.execution.MemoryAwareThreadPoolExecutor;
-import org.jboss.netty.util.ObjectSizeEstimator;
-import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
-import org.apache.cassandra.concurrent.NamedThreadFactory;
+import io.netty.util.concurrent.AbstractEventExecutor;
+import io.netty.util.concurrent.EventExecutorGroup;
+import io.netty.util.concurrent.Future;
+import org.apache.cassandra.concurrent.TracingAwareExecutorService;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.metrics.ThreadPoolMetrics;
 
-public class RequestThreadPoolExecutor extends MemoryAwareThreadPoolExecutor
+import static org.apache.cassandra.concurrent.JMXEnabledSharedExecutorPool.SHARED;
+
+public class RequestThreadPoolExecutor extends AbstractEventExecutor
 {
-    private final static int CORE_THREAD_TIMEOUT_SEC = 30;
-    // Number of request we accept to queue before blocking. We could allow this to be configured...
     private final static int MAX_QUEUED_REQUESTS = 128;
-
     private final static String THREAD_FACTORY_ID = "Native-Transport-Requests";
+    private final TracingAwareExecutorService wrapped = SHARED.newExecutor(DatabaseDescriptor.getNativeTransportMaxThreads(),
+                                                                           MAX_QUEUED_REQUESTS,
+                                                                           THREAD_FACTORY_ID,
+                                                                           "transport");
 
-    private final ThreadPoolMetrics metrics;
-
-    public RequestThreadPoolExecutor()
+    public boolean isShuttingDown()
     {
-        super(DatabaseDescriptor.getNativeTransportMaxThreads(),
-              0, // We don't use the per-channel limit, only the global one
-              MAX_QUEUED_REQUESTS,
-              CORE_THREAD_TIMEOUT_SEC, TimeUnit.SECONDS,
-              sizeEstimator(),
-              new NamedThreadFactory(THREAD_FACTORY_ID));
-        metrics = new ThreadPoolMetrics(this, "transport", THREAD_FACTORY_ID);
+        return wrapped.isShutdown();
     }
 
-    /*
-     * In theory, the ObjectSizeEstimator should estimate the actual size of a
-     * request, and MemoryAwareThreadPoolExecutor sets a memory limit on how
-     * much memory we allow for request before blocking.
-     *
-     * However, the memory size used by a CQL query is not very intersting and
-     * by no mean reflect the memory size it's execution will use (the interesting part).
-     * Furthermore, we're mainly interested in limiting the number of unhandled requests that
-     * piles up to implement some back-pressure, and for that, there is no real need to do
-     * fancy esimation of request size. So we use a trivial estimator that just count the
-     * number of request.
-     *
-     * We could get more fancy later ...
-     */
-    private static ObjectSizeEstimator sizeEstimator()
+    public Future<?> shutdownGracefully(long l, long l2, TimeUnit timeUnit)
     {
-        return new ObjectSizeEstimator()
-        {
-            public int estimateSize(Object o)
-            {
-                return 1;
-            }
-        };
+        throw new IllegalStateException();
     }
 
-    @Override
-    protected void afterExecute(Runnable r, Throwable t)
+    public Future<?> terminationFuture()
     {
-        super.afterExecute(r, t);
-        DebuggableThreadPoolExecutor.logExceptionsAfterExecute(r, t);
+        throw new IllegalStateException();
     }
 
     @Override
     public void shutdown()
     {
-        if (!isShutdown())
-        {
-            metrics.release();
-        }
-        super.shutdown();
+        wrapped.shutdown();
     }
 
     @Override
     public List<Runnable> shutdownNow()
     {
-        if (!isShutdown())
-        {
-            metrics.release();
-        }
-        return super.shutdownNow();
+        return wrapped.shutdownNow();
+    }
+
+    public boolean isShutdown()
+    {
+        return wrapped.isShutdown();
+    }
+
+    public boolean isTerminated()
+    {
+        return wrapped.isTerminated();
+    }
+
+    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException
+    {
+        return wrapped.awaitTermination(timeout, unit);
+    }
+
+    public EventExecutorGroup parent()
+    {
+        return null;
+    }
+
+    public boolean inEventLoop(Thread thread)
+    {
+        return false;
+    }
+
+    public void execute(Runnable command)
+    {
+        wrapped.execute(command);
     }
 }
diff --git a/src/java/org/apache/cassandra/transport/Server.java b/src/java/org/apache/cassandra/transport/Server.java
index f095776..8af6ee8 100644
--- a/src/java/org/apache/cassandra/transport/Server.java
+++ b/src/java/org/apache/cassandra/transport/Server.java
@@ -23,14 +23,25 @@
 import java.net.UnknownHostException;
 import java.util.EnumMap;
 import java.util.concurrent.Callable;
-import java.util.concurrent.Executors;
 import java.util.concurrent.atomic.AtomicBoolean;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLEngine;
 
+import io.netty.buffer.ByteBufAllocator;
+import io.netty.buffer.PooledByteBufAllocator;
+import io.netty.channel.epoll.Epoll;
+import io.netty.channel.epoll.EpollEventLoopGroup;
+import io.netty.channel.epoll.EpollServerSocketChannel;
+import io.netty.util.Version;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import io.netty.channel.nio.NioEventLoopGroup;
+import io.netty.channel.socket.nio.NioServerSocketChannel;
+import io.netty.util.concurrent.EventExecutor;
+import io.netty.util.concurrent.GlobalEventExecutor;
+import io.netty.util.internal.logging.InternalLoggerFactory;
+import io.netty.util.internal.logging.Slf4JLoggerFactory;
 import org.apache.cassandra.auth.IAuthenticator;
 import org.apache.cassandra.auth.ISaslAwareAuthenticator;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -39,15 +50,11 @@
 import org.apache.cassandra.security.SSLFactory;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.transport.messages.EventMessage;
-import org.jboss.netty.bootstrap.ServerBootstrap;
-import org.jboss.netty.channel.*;
-import org.jboss.netty.channel.group.ChannelGroup;
-import org.jboss.netty.channel.group.DefaultChannelGroup;
-import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory;
-import org.jboss.netty.handler.execution.ExecutionHandler;
-import org.jboss.netty.handler.ssl.SslHandler;
-import org.jboss.netty.logging.InternalLoggerFactory;
-import org.jboss.netty.logging.Slf4JLoggerFactory;
+import io.netty.bootstrap.ServerBootstrap;
+import io.netty.channel.*;
+import io.netty.channel.group.ChannelGroup;
+import io.netty.channel.group.DefaultChannelGroup;
+import io.netty.handler.ssl.SslHandler;
 
 public class Server implements CassandraDaemon.Server
 {
@@ -57,9 +64,10 @@
     }
 
     private static final Logger logger = LoggerFactory.getLogger(Server.class);
+    private static final boolean enableEpoll = Boolean.valueOf(System.getProperty("cassandra.native.epoll.enabled", "true"));
 
-    /** current version of the native protocol we support */
-    public static final int CURRENT_VERSION = 2;
+    public static final int VERSION_3 = 3;
+    public static final int CURRENT_VERSION = VERSION_3;
 
     private final ConnectionTracker connectionTracker = new ConnectionTracker();
 
@@ -74,8 +82,8 @@
     public final InetSocketAddress socket;
     private final AtomicBoolean isRunning = new AtomicBoolean(false);
 
-    private ChannelFactory factory;
-    private ExecutionHandler executionHandler;
+    private EventLoopGroup workerGroup;
+    private EventExecutor eventExecutorGroup;
 
     public Server(InetSocketAddress socket)
     {
@@ -105,7 +113,7 @@
     {
 	    if(!isRunning())
 	    {
-                run();
+            run();
 	    }
     }
 
@@ -133,29 +141,51 @@
         }
 
         // Configure the server.
-        executionHandler = new ExecutionHandler(new RequestThreadPoolExecutor());
-        factory = new NioServerSocketChannelFactory(Executors.newCachedThreadPool(), Executors.newCachedThreadPool());
-        ServerBootstrap bootstrap = new ServerBootstrap(factory);
+        eventExecutorGroup = new RequestThreadPoolExecutor();
 
-        bootstrap.setOption("child.tcpNoDelay", true);
-        bootstrap.setOption("child.keepAlive", DatabaseDescriptor.getRpcKeepAlive());
 
-        // Set up the event pipeline factory.
+        boolean hasEpoll = enableEpoll ? Epoll.isAvailable() : false;
+        if (hasEpoll)
+        {
+            workerGroup = new EpollEventLoopGroup();
+            logger.info("Netty using native Epoll event loop");
+        }
+        else
+        {
+            workerGroup = new NioEventLoopGroup();
+            logger.info("Netty using Java NIO event loop");
+        }
+
+        ServerBootstrap bootstrap = new ServerBootstrap()
+                                    .group(workerGroup)
+                                    .channel(hasEpoll ? EpollServerSocketChannel.class : NioServerSocketChannel.class)
+                                    .childOption(ChannelOption.TCP_NODELAY, true)
+                                    .childOption(ChannelOption.SO_LINGER, 0)
+                                    .childOption(ChannelOption.SO_KEEPALIVE, DatabaseDescriptor.getRpcKeepAlive())
+                                    .childOption(ChannelOption.ALLOCATOR, CBUtil.allocator)
+                                    .childOption(ChannelOption.WRITE_BUFFER_HIGH_WATER_MARK, 32 * 1024)
+                                    .childOption(ChannelOption.WRITE_BUFFER_LOW_WATER_MARK, 8 * 1024);
+
         final EncryptionOptions.ClientEncryptionOptions clientEnc = DatabaseDescriptor.getClientEncryptionOptions();
         if (clientEnc.enabled)
         {
             logger.info("Enabling encrypted CQL connections between client and server");
-            bootstrap.setPipelineFactory(new SecurePipelineFactory(this, clientEnc));
+            bootstrap.childHandler(new SecureInitializer(this, clientEnc));
         }
         else
         {
-            bootstrap.setPipelineFactory(new PipelineFactory(this));
+            bootstrap.childHandler(new Initializer(this));
         }
 
         // Bind and start to accept incoming connections.
+        logger.info("Using Netty Version: {}", Version.identify().entrySet());
         logger.info("Starting listening for CQL clients on {}...", socket);
-        Channel channel = bootstrap.bind(socket);
-        connectionTracker.allChannels.add(channel);
+
+        ChannelFuture bindFuture = bootstrap.bind(socket);
+        if (!bindFuture.awaitUninterruptibly().isSuccess())
+            throw new IllegalStateException(String.format("Failed to bind port %d on %s.", socket.getPort(), socket.getAddress().getHostAddress()));
+
+        connectionTracker.allChannels.add(bindFuture.channel());
         isRunning.set(true);
     }
 
@@ -175,23 +205,25 @@
     {
         // Close opened connections
         connectionTracker.closeAll();
-        factory.releaseExternalResources();
-        factory = null;
-        executionHandler.releaseExternalResources();
-        executionHandler = null;
+        workerGroup.shutdownGracefully();
+        workerGroup = null;
+
+        eventExecutorGroup.shutdown();
+        eventExecutorGroup = null;
         logger.info("Stop listening for CQL clients");
     }
 
 
     public static class ConnectionTracker implements Connection.Tracker
     {
-        public final ChannelGroup allChannels = new DefaultChannelGroup();
+        // TODO: should we be using the GlobalEventExecutor or defining our own?
+        public final ChannelGroup allChannels = new DefaultChannelGroup(GlobalEventExecutor.INSTANCE);
         private final EnumMap<Event.Type, ChannelGroup> groups = new EnumMap<Event.Type, ChannelGroup>(Event.Type.class);
 
         public ConnectionTracker()
         {
             for (Event.Type type : Event.Type.values())
-                groups.put(type, new DefaultChannelGroup(type.toString()));
+                groups.put(type, new DefaultChannelGroup(type.toString(), GlobalEventExecutor.INSTANCE));
         }
 
         public void addConnection(Channel ch, Connection connection)
@@ -212,7 +244,7 @@
 
         public void send(Event event)
         {
-            groups.get(event.type).write(new EventMessage(event));
+            groups.get(event.type).writeAndFlush(new EventMessage(event));
         }
 
         public void closeAll()
@@ -231,7 +263,7 @@
         }
     }
 
-    private static class PipelineFactory implements ChannelPipelineFactory
+    private static class Initializer extends ChannelInitializer
     {
         // Stateless handlers
         private static final Message.ProtocolDecoder messageDecoder = new Message.ProtocolDecoder();
@@ -243,14 +275,14 @@
 
         private final Server server;
 
-        public PipelineFactory(Server server)
+        public Initializer(Server server)
         {
             this.server = server;
         }
 
-        public ChannelPipeline getPipeline() throws Exception
+        protected void initChannel(Channel channel) throws Exception
         {
-            ChannelPipeline pipeline = Channels.pipeline();
+            ChannelPipeline pipeline = channel.pipeline();
 
             //pipeline.addLast("debug", new LoggingHandler());
 
@@ -263,20 +295,16 @@
             pipeline.addLast("messageDecoder", messageDecoder);
             pipeline.addLast("messageEncoder", messageEncoder);
 
-            pipeline.addLast("executor", server.executionHandler);
-
-            pipeline.addLast("dispatcher", dispatcher);
-
-            return pipeline;
+            pipeline.addLast(server.eventExecutorGroup, "executor", dispatcher);
         }
     }
 
-    private static class SecurePipelineFactory extends PipelineFactory
+    private static class SecureInitializer extends Initializer
     {
         private final SSLContext sslContext;
         private final EncryptionOptions encryptionOptions;
 
-        public SecurePipelineFactory(Server server, EncryptionOptions encryptionOptions)
+        public SecureInitializer(Server server, EncryptionOptions encryptionOptions)
         {
             super(server);
             this.encryptionOptions = encryptionOptions;
@@ -290,18 +318,16 @@
             }
         }
 
-        public ChannelPipeline getPipeline() throws Exception
+        protected void initChannel(Channel channel) throws Exception
         {
             SSLEngine sslEngine = sslContext.createSSLEngine();
             sslEngine.setUseClientMode(false);
             sslEngine.setEnabledCipherSuites(encryptionOptions.cipher_suites);
             sslEngine.setNeedClientAuth(encryptionOptions.require_client_auth);
-            
+
             SslHandler sslHandler = new SslHandler(sslEngine);
-            sslHandler.setIssueHandshake(true);
-            ChannelPipeline pipeline = super.getPipeline();
-            pipeline.addFirst("ssl", sslHandler);
-            return pipeline;
+            super.initChannel(channel);
+            channel.pipeline().addFirst("ssl", sslHandler);
         }
     }
 
@@ -332,6 +358,8 @@
                 InetAddress rpcAddress = InetAddress.getByName(StorageService.instance.getRpcaddress(endpoint));
                 // If rpcAddress == 0.0.0.0 (i.e. bound on all addresses), returning that is not very helpful,
                 // so return the internal address (which is ok since "we're bound on all addresses").
+                // Note that after all nodes are running a version that includes CASSANDRA-5899, rpcAddress should
+                // never be 0.0.0.0, so this can eventually be removed.
                 return rpcAddress.equals(bindAll) ? endpoint : rpcAddress;
             }
             catch (UnknownHostException e)
@@ -375,7 +403,12 @@
 
         public void onCreateColumnFamily(String ksName, String cfName)
         {
-            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.CREATED, ksName, cfName));
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, ksName, cfName));
+        }
+
+        public void onCreateUserType(String ksName, String typeName)
+        {
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TYPE, ksName, typeName));
         }
 
         public void onUpdateKeyspace(String ksName)
@@ -385,7 +418,12 @@
 
         public void onUpdateColumnFamily(String ksName, String cfName)
         {
-            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, ksName, cfName));
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, ksName, cfName));
+        }
+
+        public void onUpdateUserType(String ksName, String typeName)
+        {
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TYPE, ksName, typeName));
         }
 
         public void onDropKeyspace(String ksName)
@@ -395,7 +433,12 @@
 
         public void onDropColumnFamily(String ksName, String cfName)
         {
-            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, ksName, cfName));
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TABLE, ksName, cfName));
+        }
+
+        public void onDropUserType(String ksName, String typeName)
+        {
+            server.connectionTracker.send(new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TYPE, ksName, typeName));
         }
     }
 }
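
Server.run() now prefers the native epoll transport when it is available and not disabled through the cassandra.native.epoll.enabled system property, so an operator can force the NIO fallback with -Dcassandra.native.epoll.enabled=false. A minimal sketch of that selection, pulled out into a helper purely for readability; TransportSelection and pick are illustrative names, not from the patch.

    import io.netty.channel.EventLoopGroup;
    import io.netty.channel.ServerChannel;
    import io.netty.channel.epoll.Epoll;
    import io.netty.channel.epoll.EpollEventLoopGroup;
    import io.netty.channel.epoll.EpollServerSocketChannel;
    import io.netty.channel.nio.NioEventLoopGroup;
    import io.netty.channel.socket.nio.NioServerSocketChannel;

    // Hypothetical helper mirroring the selection logic above: use the native epoll
    // transport when the JNI library loads, otherwise fall back to Java NIO.
    final class TransportSelection
    {
        final EventLoopGroup group;
        final Class<? extends ServerChannel> channelClass;

        private TransportSelection(EventLoopGroup group, Class<? extends ServerChannel> channelClass)
        {
            this.group = group;
            this.channelClass = channelClass;
        }

        static TransportSelection pick(boolean epollEnabled)
        {
            if (epollEnabled && Epoll.isAvailable())
                return new TransportSelection(new EpollEventLoopGroup(), EpollServerSocketChannel.class);
            return new TransportSelection(new NioEventLoopGroup(), NioServerSocketChannel.class);
        }
    }

Whichever pair is chosen feeds straight into ServerBootstrap.group(...) and .channel(...), exactly as the hunk above does inline.
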
diff --git a/src/java/org/apache/cassandra/transport/ServerConnection.java b/src/java/org/apache/cassandra/transport/ServerConnection.java
index 9bc07cb..b28866f 100644
--- a/src/java/org/apache/cassandra/transport/ServerConnection.java
+++ b/src/java/org/apache/cassandra/transport/ServerConnection.java
@@ -19,7 +19,7 @@
 
 import java.util.concurrent.ConcurrentMap;
 
-import org.jboss.netty.channel.Channel;
+import io.netty.channel.Channel;
 
 import org.apache.cassandra.auth.IAuthenticator;
 import org.apache.cassandra.auth.ISaslAwareAuthenticator;
@@ -43,7 +43,7 @@
     public ServerConnection(Channel channel, int version, Connection.Tracker tracker)
     {
         super(channel, version, tracker);
-        this.clientState = ClientState.forExternalCalls(channel.getRemoteAddress());
+        this.clientState = ClientState.forExternalCalls(channel.remoteAddress());
         this.state = State.UNINITIALIZED;
     }
 
diff --git a/src/java/org/apache/cassandra/transport/SimpleClient.java b/src/java/org/apache/cassandra/transport/SimpleClient.java
index 5f2efda..3cf9b7b 100644
--- a/src/java/org/apache/cassandra/transport/SimpleClient.java
+++ b/src/java/org/apache/cassandra/transport/SimpleClient.java
@@ -33,6 +33,15 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import io.netty.bootstrap.Bootstrap;
+import io.netty.channel.ChannelHandler;
+import io.netty.channel.ChannelInitializer;
+import io.netty.channel.ChannelOption;
+import io.netty.channel.SimpleChannelInboundHandler;
+import io.netty.channel.nio.NioEventLoopGroup;
+import io.netty.channel.socket.nio.NioServerSocketChannel;
+import io.netty.util.internal.logging.InternalLoggerFactory;
+import io.netty.util.internal.logging.Slf4JLoggerFactory;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.security.SSLFactory;
@@ -44,20 +53,11 @@
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.transport.messages.StartupMessage;
 import org.apache.cassandra.utils.MD5Digest;
-import org.jboss.netty.bootstrap.ClientBootstrap;
-import org.jboss.netty.channel.Channel;
-import org.jboss.netty.channel.ChannelFuture;
-import org.jboss.netty.channel.ChannelHandlerContext;
-import org.jboss.netty.channel.ChannelPipeline;
-import org.jboss.netty.channel.ChannelPipelineFactory;
-import org.jboss.netty.channel.Channels;
-import org.jboss.netty.channel.ExceptionEvent;
-import org.jboss.netty.channel.MessageEvent;
-import org.jboss.netty.channel.SimpleChannelUpstreamHandler;
-import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory;
-import org.jboss.netty.handler.ssl.SslHandler;
-import org.jboss.netty.logging.InternalLoggerFactory;
-import org.jboss.netty.logging.Slf4JLoggerFactory;
+import io.netty.channel.Channel;
+import io.netty.channel.ChannelFuture;
+import io.netty.channel.ChannelHandlerContext;
+import io.netty.channel.ChannelPipeline;
+import io.netty.handler.ssl.SslHandler;
 import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions;
 
 public class SimpleClient
@@ -76,7 +76,7 @@
     protected final Connection.Tracker tracker = new ConnectionTracker();
     // We don't track connection really, so we don't need one Connection per channel
     protected final Connection connection = new Connection(null, Server.CURRENT_VERSION, tracker);
-    protected ClientBootstrap bootstrap;
+    protected Bootstrap bootstrap;
     protected Channel channel;
     protected ChannelFuture lastWriteFuture;
 
@@ -118,30 +118,28 @@
     protected void establishConnection() throws IOException
     {
         // Configure the client.
-        bootstrap = new ClientBootstrap(
-                        new NioClientSocketChannelFactory(
-                            Executors.newCachedThreadPool(),
-                            Executors.newCachedThreadPool()));
-
-        bootstrap.setOption("tcpNoDelay", true);
+        bootstrap = new Bootstrap()
+                    .group(new NioEventLoopGroup())
+                    .channel(io.netty.channel.socket.nio.NioSocketChannel.class)
+                    .option(ChannelOption.TCP_NODELAY, true);
 
         // Configure the pipeline factory.
         if(encryptionOptions.enabled)
         {
-            bootstrap.setPipelineFactory(new SecurePipelineFactory());
+            bootstrap.handler(new SecureInitializer());
         }
         else
         {
-            bootstrap.setPipelineFactory(new PipelineFactory());
+            bootstrap.handler(new Initializer());
         }
         ChannelFuture future = bootstrap.connect(new InetSocketAddress(host, port));
 
         // Wait until the connection attempt succeeds or fails.
-        channel = future.awaitUninterruptibly().getChannel();
+        channel = future.awaitUninterruptibly().channel();
         if (!future.isSuccess())
         {
-            bootstrap.releaseExternalResources();
-            throw new IOException("Connection Error", future.getCause());
+            bootstrap.group().shutdownGracefully();
+            throw new IOException("Connection Error", future.cause());
         }
     }
 
@@ -159,7 +157,7 @@
 
     public ResultMessage execute(String query, List<ByteBuffer> values, ConsistencyLevel consistencyLevel)
     {
-        Message.Response msg = execute(new QueryMessage(query, new QueryOptions(consistencyLevel, values)));
+        Message.Response msg = execute(new QueryMessage(query, QueryOptions.forInternalCalls(consistencyLevel, values)));
         assert msg instanceof ResultMessage;
         return (ResultMessage)msg;
     }
@@ -173,7 +171,7 @@
 
     public ResultMessage executePrepared(byte[] statementId, List<ByteBuffer> values, ConsistencyLevel consistency)
     {
-        Message.Response msg = execute(new ExecuteMessage(MD5Digest.wrap(statementId), new QueryOptions(consistency, values)));
+        Message.Response msg = execute(new ExecuteMessage(MD5Digest.wrap(statementId), QueryOptions.forInternalCalls(consistency, values)));
         assert msg instanceof ResultMessage;
         return (ResultMessage)msg;
     }
@@ -189,7 +187,7 @@
         channel.close().awaitUninterruptibly();
 
         // Shut down all thread pools to exit.
-        bootstrap.releaseExternalResources();
+        bootstrap.group().shutdownGracefully();
     }
 
     protected Message.Response execute(Message.Request request)
@@ -197,7 +195,7 @@
         try
         {
             request.attach(connection);
-            lastWriteFuture = channel.write(request);
+            lastWriteFuture = channel.writeAndFlush(request);
             Message.Response msg = responseHandler.responses.take();
             if (msg instanceof ErrorMessage)
                 throw new RuntimeException((Throwable)((ErrorMessage)msg).error);
@@ -222,14 +220,11 @@
         public void closeAll() {}
     }
 
-    private class PipelineFactory implements ChannelPipelineFactory
+    private class Initializer extends ChannelInitializer<Channel>
     {
-        public ChannelPipeline getPipeline() throws Exception
+        protected void initChannel(Channel channel) throws Exception
         {
-            ChannelPipeline pipeline = Channels.pipeline();
-
-            //pipeline.addLast("debug", new LoggingHandler());
-
+            ChannelPipeline pipeline = channel.pipeline();
             pipeline.addLast("frameDecoder", new Frame.Decoder(connectionFactory));
             pipeline.addLast("frameEncoder", frameEncoder);
 
@@ -240,43 +235,39 @@
             pipeline.addLast("messageEncoder", messageEncoder);
 
             pipeline.addLast("handler", responseHandler);
-
-            return pipeline;
         }
     }
 
-    private class SecurePipelineFactory extends PipelineFactory
+    private class SecureInitializer extends Initializer
     {
         private final SSLContext sslContext;
 
-        public SecurePipelineFactory() throws IOException
+        public SecureInitializer() throws IOException
         {
             this.sslContext = SSLFactory.createSSLContext(encryptionOptions, true);
         }
 
-        public ChannelPipeline getPipeline() throws Exception
+        protected void initChannel(Channel channel) throws Exception
         {
+            super.initChannel(channel);
             SSLEngine sslEngine = sslContext.createSSLEngine();
             sslEngine.setUseClientMode(true);
             sslEngine.setEnabledCipherSuites(encryptionOptions.cipher_suites);
-            ChannelPipeline pipeline = super.getPipeline();
-
-            pipeline.addFirst("ssl", new SslHandler(sslEngine));
-            return pipeline;
+            channel.pipeline().addFirst("ssl", new SslHandler(sslEngine));
         }
     }
 
-    private static class ResponseHandler extends SimpleChannelUpstreamHandler
+    @ChannelHandler.Sharable
+    private static class ResponseHandler extends SimpleChannelInboundHandler<Message.Response>
     {
         public final BlockingQueue<Message.Response> responses = new SynchronousQueue<Message.Response>(true);
 
         @Override
-        public void messageReceived(ChannelHandlerContext ctx, MessageEvent e)
+        public void channelRead0(ChannelHandlerContext ctx, Message.Response r)
         {
-            assert e.getMessage() instanceof Message.Response;
             try
             {
-                responses.put((Message.Response)e.getMessage());
+                responses.put(r);
             }
             catch (InterruptedException ie)
             {
@@ -284,11 +275,11 @@
             }
         }
 
-        public void exceptionCaught(ChannelHandlerContext ctx, ExceptionEvent e) throws Exception
+        public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception
         {
-            if (this == ctx.getPipeline().getLast())
-                logger.error("Exception in response", e.getCause());
-            ctx.sendUpstream(e);
+            if (this == ctx.pipeline().last())
+                logger.error("Exception in response", cause);
+            ctx.fireExceptionCaught(cause);
         }
     }
 }
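
SimpleClient's connection setup moves from ClientBootstrap with a NioClientSocketChannelFactory to Netty 4's Bootstrap over an NioEventLoopGroup, and teardown becomes group().shutdownGracefully(). A stripped-down sketch of the same connect/close flow under those assumptions; the host, port and empty initializer body are placeholders, not values from the patch.

    import java.io.IOException;
    import java.net.InetSocketAddress;

    import io.netty.bootstrap.Bootstrap;
    import io.netty.channel.Channel;
    import io.netty.channel.ChannelFuture;
    import io.netty.channel.ChannelInitializer;
    import io.netty.channel.ChannelOption;
    import io.netty.channel.nio.NioEventLoopGroup;
    import io.netty.channel.socket.nio.NioSocketChannel;

    // Hypothetical minimal client showing the Netty 4 connect and shutdown pattern.
    final class MiniClient
    {
        public static void main(String[] args) throws Exception
        {
            NioEventLoopGroup group = new NioEventLoopGroup();
            Bootstrap bootstrap = new Bootstrap()
                                  .group(group)
                                  .channel(NioSocketChannel.class)
                                  .option(ChannelOption.TCP_NODELAY, true)
                                  .handler(new ChannelInitializer<Channel>()
                                  {
                                      protected void initChannel(Channel ch)
                                      {
                                          // frame and message codecs would be added to ch.pipeline() here
                                      }
                                  });

            ChannelFuture future = bootstrap.connect(new InetSocketAddress("127.0.0.1", 9042)).awaitUninterruptibly();
            if (!future.isSuccess())
            {
                group.shutdownGracefully();
                throw new IOException("Connection Error", future.cause());
            }

            Channel channel = future.channel();
            channel.close().awaitUninterruptibly();
            group.shutdownGracefully();   // releases the event loop threads, replacing releaseExternalResources()
        }
    }
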
diff --git a/src/java/org/apache/cassandra/transport/messages/AuthChallenge.java b/src/java/org/apache/cassandra/transport/messages/AuthChallenge.java
index b6634ad..15a9a9a 100644
--- a/src/java/org/apache/cassandra/transport/messages/AuthChallenge.java
+++ b/src/java/org/apache/cassandra/transport/messages/AuthChallenge.java
@@ -19,7 +19,7 @@
 
 import org.apache.cassandra.transport.CBUtil;
 import org.apache.cassandra.transport.Message;
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import java.nio.ByteBuffer;
 
@@ -30,7 +30,7 @@
 {
     public static final Message.Codec<AuthChallenge> codec = new Message.Codec<AuthChallenge>()
     {
-        public AuthChallenge decode(ChannelBuffer body, int version)
+        public AuthChallenge decode(ByteBuf body, int version)
         {
             ByteBuffer b = CBUtil.readValue(body);
             byte[] token = new byte[b.remaining()];
@@ -38,7 +38,7 @@
             return new AuthChallenge(token);
         }
 
-        public void encode(AuthChallenge challenge, ChannelBuffer dest, int version)
+        public void encode(AuthChallenge challenge, ByteBuf dest, int version)
         {
             CBUtil.writeValue(challenge.token, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/AuthResponse.java b/src/java/org/apache/cassandra/transport/messages/AuthResponse.java
index 37245a7..3f3f774 100644
--- a/src/java/org/apache/cassandra/transport/messages/AuthResponse.java
+++ b/src/java/org/apache/cassandra/transport/messages/AuthResponse.java
@@ -26,7 +26,7 @@
 import org.apache.cassandra.transport.ProtocolException;
 import org.apache.cassandra.transport.ServerConnection;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import java.nio.ByteBuffer;
 
@@ -39,7 +39,7 @@
 {
     public static final Message.Codec<AuthResponse> codec = new Message.Codec<AuthResponse>()
     {
-        public AuthResponse decode(ChannelBuffer body, int version)
+        public AuthResponse decode(ByteBuf body, int version)
         {
             if (version == 1)
                 throw new ProtocolException("SASL Authentication is not supported in version 1 of the protocol");
@@ -50,7 +50,7 @@
             return new AuthResponse(token);
         }
 
-        public void encode(AuthResponse response, ChannelBuffer dest, int version)
+        public void encode(AuthResponse response, ByteBuf dest, int version)
         {
             CBUtil.writeValue(response.token, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/AuthSuccess.java b/src/java/org/apache/cassandra/transport/messages/AuthSuccess.java
index 2595f28..98f50db 100644
--- a/src/java/org/apache/cassandra/transport/messages/AuthSuccess.java
+++ b/src/java/org/apache/cassandra/transport/messages/AuthSuccess.java
@@ -19,7 +19,7 @@
 
 import org.apache.cassandra.transport.CBUtil;
 import org.apache.cassandra.transport.Message;
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import java.nio.ByteBuffer;
 
@@ -33,7 +33,7 @@
 {
     public static final Message.Codec<AuthSuccess> codec = new Message.Codec<AuthSuccess>()
     {
-        public AuthSuccess decode(ChannelBuffer body, int version)
+        public AuthSuccess decode(ByteBuf body, int version)
         {
             ByteBuffer b = CBUtil.readValue(body);
             byte[] token = new byte[b.remaining()];
@@ -41,7 +41,7 @@
             return new AuthSuccess(token);
         }
 
-        public void encode(AuthSuccess success, ChannelBuffer dest, int version)
+        public void encode(AuthSuccess success, ByteBuf dest, int version)
         {
             CBUtil.writeValue(success.token, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/AuthenticateMessage.java b/src/java/org/apache/cassandra/transport/messages/AuthenticateMessage.java
index 22207ab..230f0f2 100644
--- a/src/java/org/apache/cassandra/transport/messages/AuthenticateMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/AuthenticateMessage.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.transport.messages;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.transport.CBUtil;
 import org.apache.cassandra.transport.Message;
@@ -29,13 +29,13 @@
 {
     public static final Message.Codec<AuthenticateMessage> codec = new Message.Codec<AuthenticateMessage>()
     {
-        public AuthenticateMessage decode(ChannelBuffer body, int version)
+        public AuthenticateMessage decode(ByteBuf body, int version)
         {
             String authenticator = CBUtil.readString(body);
             return new AuthenticateMessage(authenticator);
         }
 
-        public void encode(AuthenticateMessage msg, ChannelBuffer dest, int version)
+        public void encode(AuthenticateMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeString(msg.authenticator, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
index 34dd8fe..19fa6aa 100644
--- a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
@@ -23,14 +23,15 @@
 import java.util.List;
 import java.util.UUID;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.statements.BatchStatement;
 import org.apache.cassandra.cql3.statements.ModificationStatement;
-import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.PreparedQueryNotFoundException;
+import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.*;
@@ -41,7 +42,7 @@
 {
     public static final Message.Codec<BatchMessage> codec = new Message.Codec<BatchMessage>()
     {
-        public BatchMessage decode(ChannelBuffer body, int version)
+        public BatchMessage decode(ByteBuf body, int version)
         {
             if (version == 1)
                 throw new ProtocolException("BATCH messages are not support in version 1 of the protocol");
@@ -61,11 +62,14 @@
                     throw new ProtocolException("Invalid query kind in BATCH messages. Must be 0 or 1 but got " + kind);
                 variables.add(CBUtil.readValueList(body));
             }
-            ConsistencyLevel consistency = CBUtil.readConsistencyLevel(body);
-            return new BatchMessage(toType(type), queryOrIds, variables, consistency);
+            QueryOptions options = version < 3
+                                 ? QueryOptions.fromPreV3Batch(CBUtil.readConsistencyLevel(body))
+                                 : QueryOptions.codec.decode(body, version);
+
+            return new BatchMessage(toType(type), queryOrIds, variables, options);
         }
 
-        public void encode(BatchMessage msg, ChannelBuffer dest, int version)
+        public void encode(BatchMessage msg, ByteBuf dest, int version)
         {
             int queries = msg.queryOrIdList.size();
 
@@ -84,7 +88,10 @@
                 CBUtil.writeValueList(msg.values.get(i), dest);
             }
 
-            CBUtil.writeConsistencyLevel(msg.consistency, dest);
+            if (version < 3)
+                CBUtil.writeConsistencyLevel(msg.options.getConsistency(), dest);
+            else
+                QueryOptions.codec.encode(msg.options, dest, version);
         }
 
         public int encodedSize(BatchMessage msg, int version)
@@ -99,7 +106,9 @@
 
                 size += CBUtil.sizeOfValueList(msg.values.get(i));
             }
-            size += CBUtil.sizeOfConsistencyLevel(msg.consistency);
+            size += version < 3
+                  ? CBUtil.sizeOfConsistencyLevel(msg.options.getConsistency())
+                  : QueryOptions.codec.encodedSize(msg.options, version);
             return size;
         }
 
@@ -131,15 +140,15 @@
     public final BatchStatement.Type type;
     public final List<Object> queryOrIdList;
     public final List<List<ByteBuffer>> values;
-    public final ConsistencyLevel consistency;
+    public final QueryOptions options;
 
-    public BatchMessage(BatchStatement.Type type, List<Object> queryOrIdList, List<List<ByteBuffer>> values, ConsistencyLevel consistency)
+    public BatchMessage(BatchStatement.Type type, List<Object> queryOrIdList, List<List<ByteBuffer>> values, QueryOptions options)
     {
         super(Message.Type.BATCH);
         this.type = type;
         this.queryOrIdList = queryOrIdList;
         this.values = values;
-        this.consistency = consistency;
+        this.options = options;
     }
 
     public Message.Response execute(QueryState state)
@@ -160,51 +169,49 @@
                 Tracing.instance.begin("Execute batch of CQL3 queries", Collections.<String, String>emptyMap());
             }
 
-            QueryHandler handler = state.getClientState().getCQLQueryHandler();
-            List<ModificationStatement> statements = new ArrayList<ModificationStatement>(queryOrIdList.size());
-            boolean hasConditions = false;
+            QueryHandler handler = ClientState.getCQLQueryHandler();
+            List<ParsedStatement.Prepared> prepared = new ArrayList<>(queryOrIdList.size());
             for (int i = 0; i < queryOrIdList.size(); i++)
             {
                 Object query = queryOrIdList.get(i);
-                CQLStatement statement;
+                ParsedStatement.Prepared p;
                 if (query instanceof String)
                 {
-                    statement = QueryProcessor.parseStatement((String)query, state);
+                    p = QueryProcessor.parseStatement((String)query, state);
                 }
                 else
                 {
-                    statement = handler.getPrepared((MD5Digest)query);
-                    if (statement == null)
+                    p = handler.getPrepared((MD5Digest)query);
+                    if (p == null)
                         throw new PreparedQueryNotFoundException((MD5Digest)query);
                 }
 
                 List<ByteBuffer> queryValues = values.get(i);
-                if (queryValues.size() != statement.getBoundTerms())
+                if (queryValues.size() != p.statement.getBoundTerms())
                     throw new InvalidRequestException(String.format("There were %d markers(?) in CQL but %d bound variables",
-                                                                    statement.getBoundTerms(),
+                                                                    p.statement.getBoundTerms(),
                                                                     queryValues.size()));
-                if (!(statement instanceof ModificationStatement))
+
+                prepared.add(p);
+            }
+
+            BatchQueryOptions batchOptions = BatchQueryOptions.withPerStatementVariables(options, values, queryOrIdList);
+            List<ModificationStatement> statements = new ArrayList<>(prepared.size());
+            for (int i = 0; i < prepared.size(); i++)
+            {
+                ParsedStatement.Prepared p = prepared.get(i);
+                batchOptions.forStatement(i).prepare(p.boundNames);
+
+                if (!(p.statement instanceof ModificationStatement))
                     throw new InvalidRequestException("Invalid statement in batch: only UPDATE, INSERT and DELETE statements are allowed.");
 
-                ModificationStatement mst = (ModificationStatement)statement;
-                hasConditions |= mst.hasConditions();
-                if (mst.isCounter())
-                {
-                    if (type != BatchStatement.Type.COUNTER)
-                        throw new InvalidRequestException("Cannot include counter statement in a non-counter batch");
-                }
-                else
-                {
-                    if (type == BatchStatement.Type.COUNTER)
-                        throw new InvalidRequestException("Cannot include non-counter statement in a counter batch");
-                }
-                statements.add(mst);
+                statements.add((ModificationStatement)p.statement);
             }
 
             // Note: it's OK at this point to pass a bogus value for the number of bound terms in the BatchStatement ctor
             // (no value would really be correct, so we prefer passing one that is clearly wrong).
-            BatchStatement batch = new BatchStatement(-1, type, statements, Attributes.none(), hasConditions);
-            Message.Response response = handler.processBatch(batch, state, new BatchQueryOptions(consistency, values, queryOrIdList));
+            BatchStatement batch = new BatchStatement(-1, type, statements, Attributes.none());
+            Message.Response response = handler.processBatch(batch, state, batchOptions);
 
             if (tracingId != null)
                 response.setTracingId(tracingId);
@@ -231,7 +238,7 @@
             if (i > 0) sb.append(", ");
             sb.append(queryOrIdList.get(i)).append(" with ").append(values.get(i).size()).append(" values");
         }
-        sb.append("] at consistency ").append(consistency);
+        sb.append("] at consistency ").append(options.getConsistency());
         return sb.toString();
     }
 }
diff --git a/src/java/org/apache/cassandra/transport/messages/CredentialsMessage.java b/src/java/org/apache/cassandra/transport/messages/CredentialsMessage.java
index 2c93afd..eb39e30 100644
--- a/src/java/org/apache/cassandra/transport/messages/CredentialsMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/CredentialsMessage.java
@@ -23,8 +23,7 @@
 import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.transport.ProtocolException;
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.exceptions.AuthenticationException;
 import org.apache.cassandra.service.QueryState;
@@ -38,7 +37,7 @@
 {
     public static final Message.Codec<CredentialsMessage> codec = new Message.Codec<CredentialsMessage>()
     {
-        public CredentialsMessage decode(ChannelBuffer body, int version)
+        public CredentialsMessage decode(ByteBuf body, int version)
         {
             if (version > 1)
                 throw new ProtocolException("Legacy credentials authentication is not supported in " +
@@ -48,7 +47,7 @@
             return new CredentialsMessage(credentials);
         }
 
-        public void encode(CredentialsMessage msg, ChannelBuffer dest, int version)
+        public void encode(CredentialsMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeStringMap(msg.credentials, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java b/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java
index 4d60a1f..7e4a3a9 100644
--- a/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java
@@ -17,8 +17,9 @@
  */
 package org.apache.cassandra.transport.messages;
 
+import io.netty.buffer.ByteBuf;
+import io.netty.handler.codec.CodecException;
 import com.google.common.base.Predicate;
-import org.jboss.netty.buffer.ChannelBuffer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -40,7 +41,7 @@
 
     public static final Message.Codec<ErrorMessage> codec = new Message.Codec<ErrorMessage>()
     {
-        public ErrorMessage decode(ChannelBuffer body, int version)
+        public ErrorMessage decode(ByteBuf body, int version)
         {
             ExceptionCode code = ExceptionCode.fromValue(body.readInt());
             String msg = CBUtil.readString(body);
@@ -120,7 +121,7 @@
             return new ErrorMessage(te);
         }
 
-        public void encode(ErrorMessage msg, ChannelBuffer dest, int version)
+        public void encode(ErrorMessage msg, ByteBuf dest, int version)
         {
             dest.writeInt(msg.error.code().value);
             CBUtil.writeString(msg.error.getMessage(), dest);
@@ -216,7 +217,19 @@
     public static ErrorMessage fromException(Throwable e, Predicate<Throwable> unexpectedExceptionHandler)
     {
         int streamId = 0;
-        if (e instanceof WrappedException)
+
+        // Netty will wrap exceptions during decoding in a CodecException. If the cause was one of our ProtocolExceptions
+        // or some other internal exception, extract that and use it.
+        if (e instanceof CodecException)
+        {
+            Throwable cause = e.getCause();
+            if (cause != null && cause instanceof WrappedException)
+            {
+                streamId = ((WrappedException)cause).streamId;
+                e = cause.getCause();
+            }
+        }
+        else if (e instanceof WrappedException)
         {
             streamId = ((WrappedException)e).streamId;
             e = e.getCause();
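
The new branch in fromException exists because netty 4's decoder pipeline wraps anything thrown during decode in a CodecException, so the stream id carried by the transport's internal WrappedException would otherwise be lost. A minimal sketch of the unwrapping order, using a hypothetical stand-in for WrappedException since that class is internal to the transport package:

    import io.netty.handler.codec.CodecException;

    public final class UnwrapSketch
    {
        // Stand-in for the transport's WrappedException, which pairs a cause
        // with the stream id of the failing request.
        static final class StreamTagged extends RuntimeException
        {
            final int streamId;
            StreamTagged(Throwable cause, int streamId) { super(cause); this.streamId = streamId; }
        }

        static int streamIdOf(Throwable e)
        {
            // Netty wraps decode-time failures, so look one level down first...
            if (e instanceof CodecException && e.getCause() instanceof StreamTagged)
                return ((StreamTagged) e.getCause()).streamId;
            // ...and fall back to the direct case for failures raised elsewhere.
            if (e instanceof StreamTagged)
                return ((StreamTagged) e).streamId;
            return 0; // unknown stream, same default the patch uses
        }

        public static void main(String[] args)
        {
            Throwable decodeFailure = new CodecException(new StreamTagged(new IllegalArgumentException("bad frame"), 7));
            System.out.println(streamIdOf(decodeFailure)); // prints 7
        }
    }
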
diff --git a/src/java/org/apache/cassandra/transport/messages/EventMessage.java b/src/java/org/apache/cassandra/transport/messages/EventMessage.java
index 9acb401..f3ab526 100644
--- a/src/java/org/apache/cassandra/transport/messages/EventMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/EventMessage.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.transport.messages;
 
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.Message;
@@ -26,19 +26,19 @@
 {
     public static final Message.Codec<EventMessage> codec = new Message.Codec<EventMessage>()
     {
-        public EventMessage decode(ChannelBuffer body, int version)
+        public EventMessage decode(ByteBuf body, int version)
         {
-            return new EventMessage(Event.deserialize(body));
+            return new EventMessage(Event.deserialize(body, version));
         }
 
-        public void encode(EventMessage msg, ChannelBuffer dest, int version)
+        public void encode(EventMessage msg, ByteBuf dest, int version)
         {
-            msg.event.serialize(dest);
+            msg.event.serialize(dest, version);
         }
 
         public int encodedSize(EventMessage msg, int version)
         {
-            return msg.event.serializedSize();
+            return msg.event.serializedSize(version);
         }
     };
 
diff --git a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
index 0a2b26d..78182b2 100644
--- a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
@@ -18,17 +18,16 @@
 package org.apache.cassandra.transport.messages;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.CQLStatement;
 import org.apache.cassandra.cql3.QueryHandler;
 import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.exceptions.PreparedQueryNotFoundException;
 import org.apache.cassandra.service.QueryState;
@@ -41,7 +40,7 @@
 {
     public static final Message.Codec<ExecuteMessage> codec = new Message.Codec<ExecuteMessage>()
     {
-        public ExecuteMessage decode(ChannelBuffer body, int version)
+        public ExecuteMessage decode(ByteBuf body, int version)
         {
             byte[] id = CBUtil.readBytes(body);
             if (version == 1)
@@ -56,7 +55,7 @@
             }
         }
 
-        public void encode(ExecuteMessage msg, ChannelBuffer dest, int version)
+        public void encode(ExecuteMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeBytes(msg.statementId.bytes, dest);
             if (version == 1)
@@ -102,11 +101,13 @@
         try
         {
             QueryHandler handler = state.getClientState().getCQLQueryHandler();
-            CQLStatement statement = handler.getPrepared(statementId);
-
-            if (statement == null)
+            ParsedStatement.Prepared prepared = handler.getPrepared(statementId);
+            if (prepared == null)
                 throw new PreparedQueryNotFoundException(statementId);
 
+            options.prepare(prepared.boundNames);
+            CQLStatement statement = prepared.statement;
+
             if (options.getPageSize() == 0)
                 throw new ProtocolException("The page size cannot be 0");
 
diff --git a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
index dd99c60..2f6e3da 100644
--- a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
@@ -22,8 +22,7 @@
 import java.util.List;
 import java.util.Map;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.service.QueryState;
@@ -37,12 +36,12 @@
 {
     public static final Message.Codec<OptionsMessage> codec = new Message.Codec<OptionsMessage>()
     {
-        public OptionsMessage decode(ChannelBuffer body, int version)
+        public OptionsMessage decode(ByteBuf body, int version)
         {
             return new OptionsMessage();
         }
 
-        public void encode(OptionsMessage msg, ChannelBuffer dest, int version)
+        public void encode(OptionsMessage msg, ByteBuf dest, int version)
         {
         }
 
diff --git a/src/java/org/apache/cassandra/transport/messages/PrepareMessage.java b/src/java/org/apache/cassandra/transport/messages/PrepareMessage.java
index 4b00f19..2899cf8 100644
--- a/src/java/org/apache/cassandra/transport/messages/PrepareMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/PrepareMessage.java
@@ -20,7 +20,7 @@
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
-import org.jboss.netty.buffer.ChannelBuffer;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.service.QueryState;
@@ -32,13 +32,13 @@
 {
     public static final Message.Codec<PrepareMessage> codec = new Message.Codec<PrepareMessage>()
     {
-        public PrepareMessage decode(ChannelBuffer body, int version)
+        public PrepareMessage decode(ByteBuf body, int version)
         {
             String query = CBUtil.readLongString(body);
             return new PrepareMessage(query);
         }
 
-        public void encode(PrepareMessage msg, ChannelBuffer dest, int version)
+        public void encode(PrepareMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeLongString(msg.query, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
index b0a48e7..377cdaf 100644
--- a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
@@ -22,8 +22,7 @@
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.QueryHandler;
 import org.apache.cassandra.cql3.QueryOptions;
@@ -41,7 +40,7 @@
 {
     public static final Message.Codec<QueryMessage> codec = new Message.Codec<QueryMessage>()
     {
-        public QueryMessage decode(ChannelBuffer body, int version)
+        public QueryMessage decode(ByteBuf body, int version)
         {
             String query = CBUtil.readLongString(body);
             if (version == 1)
@@ -55,7 +54,7 @@
             }
         }
 
-        public void encode(QueryMessage msg, ChannelBuffer dest, int version)
+        public void encode(QueryMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeLongString(msg.query, dest);
             if (version == 1)
diff --git a/src/java/org/apache/cassandra/transport/messages/ReadyMessage.java b/src/java/org/apache/cassandra/transport/messages/ReadyMessage.java
index 10bff86..f0a4681 100644
--- a/src/java/org/apache/cassandra/transport/messages/ReadyMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ReadyMessage.java
@@ -17,8 +17,7 @@
  */
 package org.apache.cassandra.transport.messages;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.transport.Message;
 
@@ -29,12 +28,12 @@
 {
     public static final Message.Codec<ReadyMessage> codec = new Message.Codec<ReadyMessage>()
     {
-        public ReadyMessage decode(ChannelBuffer body, int version)
+        public ReadyMessage decode(ByteBuf body, int version)
         {
             return new ReadyMessage();
         }
 
-        public void encode(ReadyMessage msg, ChannelBuffer dest, int version)
+        public void encode(ReadyMessage msg, ByteBuf dest, int version)
         {
         }
 
diff --git a/src/java/org/apache/cassandra/transport/messages/RegisterMessage.java b/src/java/org/apache/cassandra/transport/messages/RegisterMessage.java
index fc6ade7..ee410bb 100644
--- a/src/java/org/apache/cassandra/transport/messages/RegisterMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/RegisterMessage.java
@@ -20,8 +20,7 @@
 import java.util.ArrayList;
 import java.util.List;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.*;
@@ -30,7 +29,7 @@
 {
     public static final Message.Codec<RegisterMessage> codec = new Message.Codec<RegisterMessage>()
     {
-        public RegisterMessage decode(ChannelBuffer body, int version)
+        public RegisterMessage decode(ByteBuf body, int version)
         {
             int length = body.readUnsignedShort();
             List<Event.Type> eventTypes = new ArrayList<Event.Type>(length);
@@ -39,7 +38,7 @@
             return new RegisterMessage(eventTypes);
         }
 
-        public void encode(RegisterMessage msg, ChannelBuffer dest, int version)
+        public void encode(RegisterMessage msg, ByteBuf dest, int version)
         {
             dest.writeShort(msg.eventTypes.size());
             for (Event.Type type : msg.eventTypes)
diff --git a/src/java/org/apache/cassandra/transport/messages/ResultMessage.java b/src/java/org/apache/cassandra/transport/messages/ResultMessage.java
index 3d41793..723beed 100644
--- a/src/java/org/apache/cassandra/transport/messages/ResultMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ResultMessage.java
@@ -19,14 +19,14 @@
 
 import java.util.*;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.cql3.ColumnSpecification;
 import org.apache.cassandra.cql3.CQLStatement;
 import org.apache.cassandra.cql3.ResultSet;
 import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
+import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.transport.*;
 import org.apache.cassandra.thrift.CqlPreparedResult;
 import org.apache.cassandra.thrift.CqlResult;
@@ -37,13 +37,13 @@
 {
     public static final Message.Codec<ResultMessage> codec = new Message.Codec<ResultMessage>()
     {
-        public ResultMessage decode(ChannelBuffer body, int version)
+        public ResultMessage decode(ByteBuf body, int version)
         {
             Kind kind = Kind.fromId(body.readInt());
             return kind.subcodec.decode(body, version);
         }
 
-        public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+        public void encode(ResultMessage msg, ByteBuf dest, int version)
         {
             dest.writeInt(msg.kind.id);
             msg.kind.subcodec.encode(msg, dest, version);
@@ -117,12 +117,12 @@
 
         public static final Message.Codec<ResultMessage> subcodec = new Message.Codec<ResultMessage>()
         {
-            public ResultMessage decode(ChannelBuffer body, int version)
+            public ResultMessage decode(ByteBuf body, int version)
             {
                 return new Void();
             }
 
-            public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+            public void encode(ResultMessage msg, ByteBuf dest, int version)
             {
                 assert msg instanceof Void;
             }
@@ -157,13 +157,13 @@
 
         public static final Message.Codec<ResultMessage> subcodec = new Message.Codec<ResultMessage>()
         {
-            public ResultMessage decode(ChannelBuffer body, int version)
+            public ResultMessage decode(ByteBuf body, int version)
             {
                 String keyspace = CBUtil.readString(body);
                 return new SetKeyspace(keyspace);
             }
 
-            public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+            public void encode(ResultMessage msg, ByteBuf dest, int version)
             {
                 assert msg instanceof SetKeyspace;
                 CBUtil.writeString(((SetKeyspace)msg).keyspace, dest);
@@ -192,12 +192,12 @@
     {
         public static final Message.Codec<ResultMessage> subcodec = new Message.Codec<ResultMessage>()
         {
-            public ResultMessage decode(ChannelBuffer body, int version)
+            public ResultMessage decode(ByteBuf body, int version)
             {
                 return new Rows(ResultSet.codec.decode(body, version));
             }
 
-            public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+            public void encode(ResultMessage msg, ByteBuf dest, int version)
             {
                 assert msg instanceof Rows;
                 Rows rowMsg = (Rows)msg;
@@ -230,14 +230,13 @@
         {
             return "ROWS " + result;
         }
-
     }
 
     public static class Prepared extends ResultMessage
     {
         public static final Message.Codec<ResultMessage> subcodec = new Message.Codec<ResultMessage>()
         {
-            public ResultMessage decode(ChannelBuffer body, int version)
+            public ResultMessage decode(ByteBuf body, int version)
             {
                 MD5Digest id = MD5Digest.wrap(CBUtil.readBytes(body));
                 ResultSet.Metadata metadata = ResultSet.Metadata.codec.decode(body, version);
@@ -249,7 +248,7 @@
                 return new Prepared(id, -1, metadata, resultMetadata);
             }
 
-            public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+            public void encode(ResultMessage msg, ByteBuf dest, int version)
             {
                 assert msg instanceof Prepared;
                 Prepared prepared = (Prepared)msg;
@@ -277,7 +276,11 @@
         };
 
         public final MD5Digest statementId;
+
+        /** Describes the variables to be bound in the prepared statement */
         public final ResultSet.Metadata metadata;
+
+        /** Describes the results of executing this prepared statement */
         public final ResultSet.Metadata resultMetadata;
 
         // statement id for CQL-over-thrift compatibility. The binary protocol ignore that.
@@ -336,56 +339,33 @@
 
     public static class SchemaChange extends ResultMessage
     {
-        public enum Change { CREATED, UPDATED, DROPPED }
+        public final Event.SchemaChange change;
 
-        public final Change change;
-        public final String keyspace;
-        public final String columnFamily;
-
-        public SchemaChange(Change change, String keyspace)
-        {
-            this(change, keyspace, "");
-        }
-
-        public SchemaChange(Change change, String keyspace, String columnFamily)
+        public SchemaChange(Event.SchemaChange change)
         {
             super(Kind.SCHEMA_CHANGE);
             this.change = change;
-            this.keyspace = keyspace;
-            this.columnFamily = columnFamily;
         }
 
         public static final Message.Codec<ResultMessage> subcodec = new Message.Codec<ResultMessage>()
         {
-            public ResultMessage decode(ChannelBuffer body, int version)
+            public ResultMessage decode(ByteBuf body, int version)
             {
-                Change change = CBUtil.readEnumValue(Change.class, body);
-                String keyspace = CBUtil.readString(body);
-                String columnFamily = CBUtil.readString(body);
-                return new SchemaChange(change, keyspace, columnFamily);
-
+                return new SchemaChange(Event.SchemaChange.deserializeEvent(body, version));
             }
 
-            public void encode(ResultMessage msg, ChannelBuffer dest, int version)
+            public void encode(ResultMessage msg, ByteBuf dest, int version)
             {
                 assert msg instanceof SchemaChange;
                 SchemaChange scm = (SchemaChange)msg;
-
-                CBUtil.writeEnumValue(scm.change, dest);
-                CBUtil.writeString(scm.keyspace, dest);
-                CBUtil.writeString(scm.columnFamily, dest);
+                scm.change.serializeEvent(dest, version);
             }
 
             public int encodedSize(ResultMessage msg, int version)
             {
                 assert msg instanceof SchemaChange;
                 SchemaChange scm = (SchemaChange)msg;
-
-                int size = 0;
-                size += CBUtil.sizeOfEnumValue(scm.change);
-                size += CBUtil.sizeOfString(scm.keyspace);
-                size += CBUtil.sizeOfString(scm.columnFamily);
-                return size;
+                return scm.change.eventSerializedSize(version);
             }
         };
 
@@ -397,7 +377,7 @@
         @Override
         public String toString()
         {
-            return "RESULT schema change " + change + " on " + keyspace + (columnFamily.isEmpty() ? "" : "." + columnFamily);
+            return "RESULT schema change " + change;
         }
     }
 }
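
The SCHEMA_CHANGE result no longer carries its own change/keyspace/columnFamily fields; it wraps an Event.SchemaChange and defers to that event's serialization, so pushed schema-change events and query results share one wire format. A simplified standalone sketch of a single shared payload serializer (illustrative field set and encoding, not the actual protocol layout):

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;

    // Simplified sketch: one serializer for a schema-change payload, reusable
    // by both an "event" shape and a "result" shape, as the patch now does by
    // having ResultMessage.SchemaChange wrap Event.SchemaChange.
    final class SchemaChangePayload
    {
        enum Change { CREATED, UPDATED, DROPPED }

        final Change change;
        final String keyspace;
        final String table; // empty when the change targets the keyspace itself

        SchemaChangePayload(Change change, String keyspace, String table)
        {
            this.change = change;
            this.keyspace = keyspace;
            this.table = table;
        }

        void serialize(ByteBuffer dest)
        {
            writeString(dest, change.name());
            writeString(dest, keyspace);
            writeString(dest, table);
        }

        private static void writeString(ByteBuffer dest, String s)
        {
            byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
            dest.putShort((short) bytes.length); // [string] = short length + UTF-8 bytes
            dest.put(bytes);
        }
    }
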
diff --git a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
index d56a037..1a5071f 100644
--- a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
@@ -20,8 +20,7 @@
 import java.util.HashMap;
 import java.util.Map;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -41,12 +40,12 @@
 
     public static final Message.Codec<StartupMessage> codec = new Message.Codec<StartupMessage>()
     {
-        public StartupMessage decode(ChannelBuffer body, int version)
+        public StartupMessage decode(ByteBuf body, int version)
         {
             return new StartupMessage(upperCaseKeys(CBUtil.readStringMap(body)));
         }
 
-        public void encode(StartupMessage msg, ChannelBuffer dest, int version)
+        public void encode(StartupMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeStringMap(msg.options, dest);
         }
diff --git a/src/java/org/apache/cassandra/transport/messages/SupportedMessage.java b/src/java/org/apache/cassandra/transport/messages/SupportedMessage.java
index 44a95e7..539085f 100644
--- a/src/java/org/apache/cassandra/transport/messages/SupportedMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/SupportedMessage.java
@@ -20,8 +20,7 @@
 import java.util.List;
 import java.util.Map;
 
-import org.jboss.netty.buffer.ChannelBuffer;
-import org.jboss.netty.buffer.ChannelBuffers;
+import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.transport.CBUtil;
 import org.apache.cassandra.transport.Message;
@@ -33,12 +32,12 @@
 {
     public static final Message.Codec<SupportedMessage> codec = new Message.Codec<SupportedMessage>()
     {
-        public SupportedMessage decode(ChannelBuffer body, int version)
+        public SupportedMessage decode(ByteBuf body, int version)
         {
             return new SupportedMessage(CBUtil.readStringToStringListMap(body));
         }
 
-        public void encode(SupportedMessage msg, ChannelBuffer dest, int version)
+        public void encode(SupportedMessage msg, ByteBuf dest, int version)
         {
             CBUtil.writeStringToStringListMap(msg.supported, dest);
         }
diff --git a/src/java/org/apache/cassandra/triggers/ITrigger.java b/src/java/org/apache/cassandra/triggers/ITrigger.java
index 15ed7ba..4701b61 100644
--- a/src/java/org/apache/cassandra/triggers/ITrigger.java
+++ b/src/java/org/apache/cassandra/triggers/ITrigger.java
@@ -25,7 +25,7 @@
 import java.util.Collection;
 
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.Mutation;
 
 /**
  * Trigger interface, For every Mutation received by the coordinator {@link #augment(ByteBuffer, ColumnFamily)}
@@ -44,9 +44,9 @@
     /**
      * Called exactly once per CF update, returned mutations are atomically updated.
      *
-     * @param key - Row Key for the update.
-     * @param update - Update received for the CF
+     * @param partitionKey - partition key for the update.
+     * @param update - update received for the CF
      * @return modifications to be applied, null if no action to be performed.
      */
-    public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update);
+    public Collection<Mutation> augment(ByteBuffer partitionKey, ColumnFamily update);
 }
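
With the rename from RowMutation to Mutation, trigger implementations written against 2.0 only need their imports and augment signature updated. A minimal sketch of a trigger compiled against the interface as shown above (it performs no augmentation, which the contract allows by returning null); this assumes the 2.1 jars on the classpath and that ITrigger declares no other abstract methods:

    import java.nio.ByteBuffer;
    import java.util.Collection;

    import org.apache.cassandra.db.ColumnFamily;
    import org.apache.cassandra.db.Mutation;
    import org.apache.cassandra.triggers.ITrigger;

    // Minimal trigger against the updated interface: same augment() contract,
    // but the returned collection is now of Mutation rather than RowMutation.
    public class NoOpTrigger implements ITrigger
    {
        public Collection<Mutation> augment(ByteBuffer partitionKey, ColumnFamily update)
        {
            // Returning null means "no additional mutations", per the javadoc above.
            return null;
        }
    }
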
diff --git a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java
index 988c6a7..4416a57 100644
--- a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java
+++ b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java
@@ -28,14 +28,11 @@
 
 import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.cql.QueryProcessor;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.CounterMutation;
-import org.apache.cassandra.db.IMutation;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.HeapAllocator;
 import org.apache.cassandra.utils.Pair;
 
 public class TriggerExecutor
@@ -66,23 +63,26 @@
 
     public ColumnFamily execute(ByteBuffer key, ColumnFamily updates) throws InvalidRequestException
     {
-        List<RowMutation> intermediate = executeInternal(key, updates);
+        List<Mutation> intermediate = executeInternal(key, updates);
         if (intermediate == null || intermediate.isEmpty())
             return updates;
 
         validateForSinglePartition(updates.metadata().getKeyValidator(), updates.id(), key, intermediate);
 
-        for (RowMutation mutation : intermediate)
+        for (Mutation mutation : intermediate)
+        {
             for (ColumnFamily cf : mutation.getColumnFamilies())
-                updates.addAll(cf, HeapAllocator.instance);
-
+            {
+                updates.addAll(cf);
+            }
+        }
         return updates;
     }
 
-    public Collection<RowMutation> execute(Collection<? extends IMutation> mutations) throws InvalidRequestException
+    public Collection<Mutation> execute(Collection<? extends IMutation> mutations) throws InvalidRequestException
     {
         boolean hasCounters = false;
-        List<RowMutation> augmentedMutations = null;
+        List<Mutation> augmentedMutations = null;
 
         for (IMutation mutation : mutations)
         {
@@ -91,7 +91,7 @@
 
             for (ColumnFamily cf : mutation.getColumnFamilies())
             {
-                List<RowMutation> augmentations = executeInternal(mutation.key(), cf);
+                List<Mutation> augmentations = executeInternal(mutation.key(), cf);
                 if (augmentations == null || augmentations.isEmpty())
                     continue;
 
@@ -110,19 +110,19 @@
             throw new InvalidRequestException("Counter mutations and trigger mutations cannot be applied together atomically.");
 
         @SuppressWarnings("unchecked")
-        Collection<RowMutation> originalMutations = (Collection<RowMutation>) mutations;
+        Collection<Mutation> originalMutations = (Collection<Mutation>) mutations;
 
         return mergeMutations(Iterables.concat(originalMutations, augmentedMutations));
     }
 
-    private Collection<RowMutation> mergeMutations(Iterable<RowMutation> mutations)
+    private Collection<Mutation> mergeMutations(Iterable<Mutation> mutations)
     {
-        Map<Pair<String, ByteBuffer>, RowMutation> groupedMutations = new HashMap<>();
+        Map<Pair<String, ByteBuffer>, Mutation> groupedMutations = new HashMap<>();
 
-        for (RowMutation mutation : mutations)
+        for (Mutation mutation : mutations)
         {
             Pair<String, ByteBuffer> key = Pair.create(mutation.getKeyspaceName(), mutation.key());
-            RowMutation current = groupedMutations.get(key);
+            Mutation current = groupedMutations.get(key);
             if (current == null)
             {
                 // copy in case the mutation's modifications map is backed by an immutable Collections#singletonMap().
@@ -140,28 +140,30 @@
     private void validateForSinglePartition(AbstractType<?> keyValidator,
                                             UUID cfId,
                                             ByteBuffer key,
-                                            Collection<RowMutation> tmutations)
+                                            Collection<Mutation> tmutations)
     throws InvalidRequestException
     {
-        for (RowMutation mutation : tmutations)
+        for (Mutation mutation : tmutations)
         {
             if (keyValidator.compare(mutation.key(), key) != 0)
                 throw new InvalidRequestException("Partition key of additional mutation does not match primary update key");
 
             for (ColumnFamily cf : mutation.getColumnFamilies())
-                if (!cf.id().equals(cfId))
+            {
+                if (!cf.id().equals(cfId))
                     throw new InvalidRequestException("Column family of additional mutation does not match primary update cf");
+            }
         }
         validate(tmutations);
     }
 
-    private void validate(Collection<RowMutation> tmutations) throws InvalidRequestException
+    private void validate(Collection<Mutation> tmutations) throws InvalidRequestException
     {
-        for (RowMutation mutation : tmutations)
+        for (Mutation mutation : tmutations)
         {
             QueryProcessor.validateKey(mutation.key());
             for (ColumnFamily tcf : mutation.getColumnFamilies())
-                for (ByteBuffer tName : tcf.getColumnNames())
+                for (CellName tName : tcf.getColumnNames())
                     QueryProcessor.validateColumn(tcf.metadata(), tName, tcf.getColumn(tName).value());
         }
     }
@@ -170,12 +172,12 @@
      * Switch class loader before using the triggers for the column family, if
      * not loaded them with the custom class loader.
      */
-    private List<RowMutation> executeInternal(ByteBuffer key, ColumnFamily columnFamily)
+    private List<Mutation> executeInternal(ByteBuffer key, ColumnFamily columnFamily)
     {
         Map<String, TriggerDefinition> triggers = columnFamily.metadata().getTriggers();
         if (triggers.isEmpty())
             return null;
-        List<RowMutation> tmutations = Lists.newLinkedList();
+        List<Mutation> tmutations = Lists.newLinkedList();
         Thread.currentThread().setContextClassLoader(customClassLoader);
         try
         {
@@ -187,7 +189,7 @@
                     trigger = loadTriggerInstance(td.classOption);
                     cachedTriggers.put(td.classOption, trigger);
                 }
-                Collection<RowMutation> temp = trigger.augment(key, columnFamily);
+                Collection<Mutation> temp = trigger.augment(key, columnFamily);
                 if (temp != null)
                     tmutations.addAll(temp);
             }
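
mergeMutations collapses the trigger output so that at most one Mutation is applied per (keyspace, partition key) pair, with later entries folded into the first one seen for that key. The grouping idea in isolation, using plain Java collections and a hypothetical payload type rather than Mutation:

    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    // Sketch of the merge step: fold a flat list of (keyspace, key, payload)
    // entries into one entry per (keyspace, key), as mergeMutations does for
    // Mutation objects keyed by Pair.create(keyspaceName, key).
    final class MergeByPartitionSketch
    {
        static final class Entry
        {
            final String keyspace;
            final String key;
            final List<String> payload = new ArrayList<String>();
            Entry(String keyspace, String key) { this.keyspace = keyspace; this.key = key; }
        }

        static Map<String, Entry> merge(List<String[]> rows) // each row: {keyspace, key, payload}
        {
            Map<String, Entry> grouped = new LinkedHashMap<String, Entry>();
            for (String[] row : rows)
            {
                String groupKey = row[0] + "/" + row[1];
                Entry current = grouped.get(groupKey);
                if (current == null)
                {
                    current = new Entry(row[0], row[1]);
                    grouped.put(groupKey, current);
                }
                current.payload.add(row[2]); // fold into the existing group
            }
            return grouped;
        }
    }
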
diff --git a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
index 0f5136b..83d8f3a 100644
--- a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
+++ b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.IOException;
 import java.nio.ByteBuffer;
 
 public class AlwaysPresentFilter implements IFilter
@@ -31,7 +30,7 @@
 
     public void clear() { }
 
-    public void close() throws IOException { }
+    public void close() { }
 
     public long serializedSize() { return 0; }
 }
diff --git a/src/java/org/apache/cassandra/utils/BloomCalculations.java b/src/java/org/apache/cassandra/utils/BloomCalculations.java
index 17966e8..b73f531 100644
--- a/src/java/org/apache/cassandra/utils/BloomCalculations.java
+++ b/src/java/org/apache/cassandra/utils/BloomCalculations.java
@@ -35,7 +35,7 @@
 
     /**
      * In the following keyspaceName, the row 'i' shows false positive rates if i buckets
-     * per element are used.  Column 'j' shows false positive rates if j hash
+     * per element are used.  Cell 'j' shows false positive rates if j hash
      * functions are used.  The first row is 'i=0', the first column is 'j=0'.
      * Each cell (i,j) the false positive rate determined by using i buckets per
      * element and j hash functions.
diff --git a/src/java/org/apache/cassandra/utils/BloomFilter.java b/src/java/org/apache/cassandra/utils/BloomFilter.java
index 9fbb38e..ceba89b 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilter.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilter.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.IOException;
 import java.nio.ByteBuffer;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -112,7 +111,7 @@
         bitset.clear();
     }
 
-    public void close() throws IOException
+    public void close()
     {
         bitset.close();
     }
diff --git a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
index 1ec1459..b95544c 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
@@ -18,18 +18,18 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.obs.IBitSet;
 import org.apache.cassandra.utils.obs.OffHeapBitSet;
 import org.apache.cassandra.utils.obs.OpenBitSet;
 
 abstract class BloomFilterSerializer implements ISerializer<BloomFilter>
 {
-    public void serialize(BloomFilter bf, DataOutput out) throws IOException
+    public void serialize(BloomFilter bf, DataOutputPlus out) throws IOException
     {
         out.writeInt(bf.hashCount);
         bf.bitset.serialize(out);
@@ -51,7 +51,7 @@
 
     /**
      * Calculates a serialized size of the given Bloom Filter
-     * @see BloomFilterSerializer#serialize(BloomFilter, DataOutput)
+     * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus)
      *
      * @param bf Bloom filter to calculate serialized size
      *
diff --git a/src/java/org/apache/cassandra/utils/BooleanSerializer.java b/src/java/org/apache/cassandra/utils/BooleanSerializer.java
index 0c37f67..f1707c3 100644
--- a/src/java/org/apache/cassandra/utils/BooleanSerializer.java
+++ b/src/java/org/apache/cassandra/utils/BooleanSerializer.java
@@ -22,12 +22,13 @@
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class BooleanSerializer implements IVersionedSerializer<Boolean>
 {
     public static BooleanSerializer serializer = new BooleanSerializer();
 
-    public void serialize(Boolean b, DataOutput out, int version) throws IOException
+    public void serialize(Boolean b, DataOutputPlus out, int version) throws IOException
     {
         out.writeBoolean(b);
     }
diff --git a/src/java/org/apache/cassandra/utils/BoundedStatsDeque.java b/src/java/org/apache/cassandra/utils/BoundedStatsDeque.java
index 3983b74..2ad28c4 100644
--- a/src/java/org/apache/cassandra/utils/BoundedStatsDeque.java
+++ b/src/java/org/apache/cassandra/utils/BoundedStatsDeque.java
@@ -18,12 +18,8 @@
 package org.apache.cassandra.utils;
 
 import java.util.Iterator;
-import java.util.NoSuchElementException;
 import java.util.concurrent.LinkedBlockingDeque;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.atomic.AtomicReference;
-
-import com.google.common.util.concurrent.AtomicDouble;
 
 /**
  * bounded threadsafe deque
diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index 4970fe6..e41069f 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
@@ -32,6 +32,8 @@
 import java.util.Arrays;
 import java.util.UUID;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileUtils;
 
@@ -77,39 +79,22 @@
 {
     public static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new byte[0]);
 
+    @Inline
     public static int compareUnsigned(ByteBuffer o1, ByteBuffer o2)
     {
-        assert o1 != null;
-        assert o2 != null;
-        if (o1 == o2)
-            return 0;
-
-        if (o1.hasArray() && o2.hasArray())
-        {
-            return FBUtilities.compareUnsigned(o1.array(), o2.array(), o1.position() + o1.arrayOffset(),
-                    o2.position() + o2.arrayOffset(), o1.remaining(), o2.remaining());
-        }
-
-        int end1 = o1.position() + o1.remaining();
-        int end2 = o2.position() + o2.remaining();
-        for (int i = o1.position(), j = o2.position(); i < end1 && j < end2; i++, j++)
-        {
-            int a = (o1.get(i) & 0xff);
-            int b = (o2.get(j) & 0xff);
-            if (a != b)
-                return a - b;
-        }
-        return o1.remaining() - o2.remaining();
+        return FastByteOperations.compareUnsigned(o1, o2);
     }
 
+    @Inline
     public static int compare(byte[] o1, ByteBuffer o2)
     {
-        return compareUnsigned(ByteBuffer.wrap(o1), o2);
+        return FastByteOperations.compareUnsigned(o1, 0, o1.length, o2);
     }
 
+    @Inline
     public static int compare(ByteBuffer o1, byte[] o2)
     {
-        return compareUnsigned(o1, ByteBuffer.wrap(o2));
+        return FastByteOperations.compareUnsigned(o1, o2, 0, o2.length);
     }
 
     /**
@@ -177,10 +162,7 @@
         if (buffer.hasArray())
         {
             int boff = buffer.arrayOffset() + buffer.position();
-            if (boff == 0 && length == buffer.array().length)
-                return buffer.array();
-            else
-                return Arrays.copyOfRange(buffer.array(), boff, boff + length);
+            return Arrays.copyOfRange(buffer.array(), boff, boff + length);
         }
         // else, DirectByteBuffer.get() is the fastest route
         byte[] bytes = new byte[length];
@@ -190,7 +172,7 @@
     }
 
     /**
-     * ByteBuffer adaptation of org.apache.commons.lang.ArrayUtils.lastIndexOf method
+     * ByteBuffer adaptation of org.apache.commons.lang3.ArrayUtils.lastIndexOf method
      *
      * @param buffer the array to traverse for looking for the object, may be <code>null</code>
      * @param valueToFind the value to find
@@ -270,12 +252,9 @@
         return clone;
     }
 
-    public static void arrayCopy(ByteBuffer buffer, int position, byte[] bytes, int offset, int length)
+    public static void arrayCopy(ByteBuffer src, int srcPos, byte[] dst, int dstPos, int length)
     {
-        if (buffer.hasArray())
-            System.arraycopy(buffer.array(), buffer.arrayOffset() + position, bytes, offset, length);
-        else
-            ((ByteBuffer) buffer.duplicate().position(position)).get(bytes, offset, length);
+        FastByteOperations.copy(src, srcPos, dst, dstPos, length);
     }
 
     /**
@@ -290,29 +269,13 @@
      */
     public static void arrayCopy(ByteBuffer src, int srcPos, ByteBuffer dst, int dstPos, int length)
     {
-        if (src.hasArray() && dst.hasArray())
-        {
-            System.arraycopy(src.array(),
-                             src.arrayOffset() + srcPos,
-                             dst.array(),
-                             dst.arrayOffset() + dstPos,
-                             length);
-        }
-        else
-        {
-            if (src.limit() - srcPos < length || dst.limit() - dstPos < length)
-                throw new IndexOutOfBoundsException();
-
-            for (int i = 0; i < length; i++)
-                // TODO: ByteBuffer.put is polymorphic, and might be slow here
-                dst.put(dstPos++, src.get(srcPos++));
-        }
+        FastByteOperations.copy(src, srcPos, dst, dstPos, length);
     }
 
-    public static void writeWithLength(ByteBuffer bytes, DataOutput out) throws IOException
+    public static void writeWithLength(ByteBuffer bytes, DataOutputPlus out) throws IOException
     {
         out.writeInt(bytes.remaining());
-        write(bytes, out); // writing data bytes to output source
+        out.write(bytes);
     }
 
     public static void writeWithLength(byte[] bytes, DataOutput out) throws IOException
@@ -321,27 +284,20 @@
         out.write(bytes);
     }
 
-    public static void write(ByteBuffer buffer, DataOutput out) throws IOException
-    {
-        if (buffer.hasArray())
-        {
-            out.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
-        }
-        else
-        {
-            for (int i = buffer.position(); i < buffer.limit(); i++)
-            {
-                out.writeByte(buffer.get(i));
-            }
-        }
-    }
-
-    public static void writeWithShortLength(ByteBuffer buffer, DataOutput out) throws IOException
+    public static void writeWithShortLength(ByteBuffer buffer, DataOutputPlus out) throws IOException
     {
         int length = buffer.remaining();
         assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT : length;
         out.writeShort(length);
-        write(buffer, out); // writing data bytes to output source
+        out.write(buffer);
+    }
+
+    public static void writeWithShortLength(byte[] buffer, DataOutput out) throws IOException
+    {
+        int length = buffer.length;
+        assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT : length;
+        out.writeShort(length);
+        out.write(buffer);
     }
 
     public static ByteBuffer readWithLength(DataInput in) throws IOException
@@ -556,7 +512,7 @@
     /** trims size of bytebuffer to exactly number of bytes in it, to do not hold too much memory */
     public static ByteBuffer minimalBufferFor(ByteBuffer buf)
     {
-        return buf.capacity() > buf.remaining() ? ByteBuffer.wrap(getArray(buf)) : buf;
+        return buf.capacity() > buf.remaining() || !buf.hasArray() ? ByteBuffer.wrap(getArray(buf)) : buf;
     }
 
     // Doesn't change bb position
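
The comparison and copy helpers now delegate to FastByteOperations, but the contract they must preserve is the one spelled out by the removed fallback loop: bytes compared as unsigned values from position to limit, with the shorter buffer ordering first on a tie. A reference version of that contract in plain Java (the semantics, not the optimized implementation):

    import java.nio.ByteBuffer;

    // Reference semantics for the unsigned, lexicographic ByteBuffer comparison
    // that the FastByteOperations delegation must preserve.
    final class UnsignedCompareSketch
    {
        static int compareUnsigned(ByteBuffer o1, ByteBuffer o2)
        {
            int end1 = o1.limit();
            int end2 = o2.limit();
            for (int i = o1.position(), j = o2.position(); i < end1 && j < end2; i++, j++)
            {
                int a = o1.get(i) & 0xff; // mask to treat the byte as unsigned
                int b = o2.get(j) & 0xff;
                if (a != b)
                    return a - b;
            }
            // Equal prefixes: the buffer with fewer remaining bytes sorts first.
            return o1.remaining() - o2.remaining();
        }

        public static void main(String[] args)
        {
            ByteBuffer a = ByteBuffer.wrap(new byte[]{ 0x01, (byte) 0xff });
            ByteBuffer b = ByteBuffer.wrap(new byte[]{ 0x01, 0x02 });
            System.out.println(compareUnsigned(a, b) > 0); // true: 0xff > 0x02 unsigned
        }
    }
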
diff --git a/src/java/org/apache/cassandra/utils/CLibrary.java b/src/java/org/apache/cassandra/utils/CLibrary.java
index 6711098..1927578 100644
--- a/src/java/org/apache/cassandra/utils/CLibrary.java
+++ b/src/java/org/apache/cassandra/utils/CLibrary.java
@@ -18,6 +18,9 @@
 package org.apache.cassandra.utils;
 
 import java.io.FileDescriptor;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.lang.reflect.Field;
 
 import org.slf4j.Logger;
@@ -47,8 +50,8 @@
     private static final int POSIX_FADV_WILLNEED   = 3; /* fadvise.h */
     private static final int POSIX_FADV_DONTNEED   = 4; /* fadvise.h */
     private static final int POSIX_FADV_NOREUSE    = 5; /* fadvise.h */
-    
-    static boolean jnaAvailable = false;
+
+    static boolean jnaAvailable = true;
     static boolean jnaLockable = false;
 
     static
@@ -56,35 +59,31 @@
         try
         {
             Native.register("c");
-            jnaAvailable = true;
         }
         catch (NoClassDefFoundError e)
         {
-            logger.info("JNA not found. Native methods will be disabled.");
+            logger.warn("JNA not found. Native methods will be disabled.");
+            jnaAvailable = false;
         }
         catch (UnsatisfiedLinkError e)
         {
-            logger.info("JNA link failure, one or more native method will be unavailable.");
-            logger.debug("JNA link failure details: " + e.getMessage());
+            logger.warn("JNA link failure, one or more native method will be unavailable.");
+            logger.debug("JNA link failure details: {}", e.getMessage());
         }
         catch (NoSuchMethodError e)
         {
             logger.warn("Obsolete version of JNA present; unable to register C library. Upgrade to JNA 3.2.7 or later");
+            jnaAvailable = false;
         }
     }
 
     private static native int mlockall(int flags) throws LastErrorException;
     private static native int munlockall() throws LastErrorException;
-
-    // fcntl - manipulate file descriptor, `man 2 fcntl`
-    public static native int fcntl(int fd, int command, long flags) throws LastErrorException;
-
-    // fadvice
-    public static native int posix_fadvise(int fd, long offset, int len, int flag) throws LastErrorException;
-
-    public static native int open(String path, int flags) throws LastErrorException;
-    public static native int fsync(int fd) throws LastErrorException;
-    public static native int close(int fd) throws LastErrorException;
+    private static native int fcntl(int fd, int command, long flags) throws LastErrorException;
+    private static native int posix_fadvise(int fd, long offset, int len, int flag) throws LastErrorException;
+    private static native int open(String path, int flags) throws LastErrorException;
+    private static native int fsync(int fd) throws LastErrorException;
+    private static native int close(int fd) throws LastErrorException;
 
     private static int errno(RuntimeException e)
     {
@@ -101,12 +100,12 @@
     }
 
     private CLibrary() {}
-    
+
     public static boolean jnaAvailable()
     {
         return jnaAvailable;
     }
-    
+
     public static boolean jnaMemoryLockable()
     {
         return jnaLockable;
@@ -128,20 +127,40 @@
         {
             if (!(e instanceof LastErrorException))
                 throw e;
+
             if (errno(e) == ENOMEM && System.getProperty("os.name").toLowerCase().contains("linux"))
             {
                 logger.warn("Unable to lock JVM memory (ENOMEM)."
-                             + " This can result in part of the JVM being swapped out, especially with mmapped I/O enabled."
-                             + " Increase RLIMIT_MEMLOCK or run Cassandra as root.");
+                        + " This can result in part of the JVM being swapped out, especially with mmapped I/O enabled."
+                        + " Increase RLIMIT_MEMLOCK or run Cassandra as root.");
             }
             else if (!System.getProperty("os.name").toLowerCase().contains("mac"))
             {
                 // OS X allows mlockall to be called, but always returns an error
-                logger.warn("Unknown mlockall error " + errno(e));
+                logger.warn("Unknown mlockall error {}", errno(e));
             }
         }
     }
 
+    public static void trySkipCache(String path, long offset, long len)
+    {
+        trySkipCache(getfd(path), offset, len);
+    }
+
+    public static void trySkipCache(int fd, long offset, long len)
+    {
+        if (len == 0)
+            trySkipCache(fd, 0, 0);
+
+        while (len > 0)
+        {
+            int sublen = (int) Math.min(Integer.MAX_VALUE, len);
+            trySkipCache(fd, offset, sublen);
+            len -= sublen;
+            offset += sublen;
+        }
+    }
+
     public static void trySkipCache(int fd, long offset, int len)
     {
         if (fd < 0)
@@ -159,6 +178,13 @@
             // if JNA is unavailable just skipping Direct I/O
             // instance of this class will act like normal RandomAccessFile
         }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            logger.warn(String.format("posix_fadvise(%d, %d) failed, errno (%d).", fd, offset, errno(e)));
+        }
     }
 
     public static int tryFcntl(int fd, int command, int flags)
@@ -168,15 +194,18 @@
 
         try
         {
-            result = CLibrary.fcntl(fd, command, flags);
+            result = fcntl(fd, command, flags);
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // if JNA is unavailable just skipping
         }
         catch (RuntimeException e)
         {
             if (!(e instanceof LastErrorException))
                 throw e;
 
-            logger.warn(String.format("fcntl(%d, %d, %d) failed, errno (%d).",
-                                      fd, command, flags, CLibrary.errno(e)));
+            logger.warn(String.format("fcntl(%d, %d, %d) failed, errno (%d).", fd, command, flags, errno(e)));
         }
 
         return result;
@@ -199,7 +228,7 @@
             if (!(e instanceof LastErrorException))
                 throw e;
 
-            logger.warn(String.format("open(%s, O_RDONLY) failed, errno (%d).", path, CLibrary.errno(e)));
+            logger.warn(String.format("open(%s, O_RDONLY) failed, errno (%d).", path, errno(e)));
         }
 
         return fd;
@@ -223,7 +252,7 @@
             if (!(e instanceof LastErrorException))
                 throw e;
 
-            logger.warn(String.format("fsync(%d) failed, errno (%d).", fd, CLibrary.errno(e)));
+            logger.warn(String.format("fsync(%d) failed, errno (%d).", fd, errno(e)));
         }
     }
 
@@ -245,7 +274,7 @@
             if (!(e instanceof LastErrorException))
                 throw e;
 
-            logger.warn(String.format("close(%d) failed, errno (%d).", fd, CLibrary.errno(e)));
+            logger.warn(String.format("close(%d) failed, errno (%d).", fd, errno(e)));
         }
     }
 
@@ -273,24 +302,31 @@
         return -1;
     }
 
-    /**
-     * Suggest kernel to preheat one page for the given file.
-     *
-     * @param fd The file descriptor of file to preheat.
-     * @param position The offset of the block.
-     *
-     * @return On success, zero is returned. On error, an error number is returned.
-     */
-    public static int preheatPage(int fd, long position)
+    public static int getfd(String path)
     {
+        RandomAccessFile file = null;
         try
         {
-            // 4096 is good for SSD because they operate on "Pages" 4KB in size
-            return posix_fadvise(fd, position, 4096, POSIX_FADV_WILLNEED);
+            file = new RandomAccessFile(path, "r");
+            return getfd(file.getFD());
         }
-        catch (UnsatisfiedLinkError e)
+        catch (Throwable t)
         {
+            JVMStabilityInspector.inspectThrowable(t);
+            // ignore
             return -1;
         }
+        finally
+        {
+            try
+            {
+                if (file != null)
+                    file.close();
+            }
+            catch (Throwable t)
+            {
+                // ignore
+            }
+        }
     }
 }
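
The new long-length overload of trySkipCache has to split the range into chunks because posix_fadvise is exposed here with an int length; each iteration issues one chunk and then advances the offset past it. A standalone illustration of the chunking arithmetic (it only records the calls it would make, matching the loop above):

    import java.util.ArrayList;
    import java.util.List;

    // Standalone illustration of splitting a long byte range into chunks that
    // fit the int length parameter taken by posix_fadvise, advancing the
    // offset by each chunk as it is issued.
    final class FadviseChunkingSketch
    {
        static List<long[]> chunks(long offset, long len)
        {
            List<long[]> calls = new ArrayList<long[]>();
            while (len > 0)
            {
                int sublen = (int) Math.min(Integer.MAX_VALUE, len);
                calls.add(new long[]{ offset, sublen });
                len -= sublen;
                offset += sublen; // move forward to the start of the next chunk
            }
            return calls;
        }

        public static void main(String[] args)
        {
            // A 5 GiB range becomes three calls: two full int-sized chunks plus the remainder.
            for (long[] call : chunks(0, 5L << 30))
                System.out.println(call[0] + " + " + call[1]);
        }
    }
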
diff --git a/src/java/org/apache/cassandra/utils/CloseableIterator.java b/src/java/org/apache/cassandra/utils/CloseableIterator.java
index 399c6d1..7474f3d 100644
--- a/src/java/org/apache/cassandra/utils/CloseableIterator.java
+++ b/src/java/org/apache/cassandra/utils/CloseableIterator.java
@@ -21,6 +21,6 @@
 import java.util.Iterator;
 
 // so we can instantiate anonymous classes implementing both interfaces
-public interface CloseableIterator<T> extends Iterator<T>, Closeable
+public interface CloseableIterator<T> extends Iterator<T>, AutoCloseable, Closeable
 {
 }
diff --git a/src/java/org/apache/cassandra/utils/CounterId.java b/src/java/org/apache/cassandra/utils/CounterId.java
index 4b6fd46..2552178 100644
--- a/src/java/org/apache/cassandra/utils/CounterId.java
+++ b/src/java/org/apache/cassandra/utils/CounterId.java
@@ -18,65 +18,31 @@
 package org.apache.cassandra.utils;
 
 import java.nio.ByteBuffer;
-import java.util.List;
 import java.util.concurrent.atomic.AtomicReference;
-import java.util.concurrent.CopyOnWriteArrayList;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import com.google.common.base.Objects;
-
-import org.apache.cassandra.db.CounterColumn;
 import org.apache.cassandra.db.SystemKeyspace;
 
 public class CounterId implements Comparable<CounterId>
 {
-    private static final Logger logger = LoggerFactory.getLogger(CounterId.class);
-
     public static final int LENGTH = 16; // we assume a fixed length size for all CounterIds
 
     // Lazy holder because this opens the system keyspace and we want to avoid
     // having this triggered during class initialization
-    private static class LocalIds
+    private static class LocalId
     {
-        static final LocalCounterIdHistory instance = new LocalCounterIdHistory();
+        static final LocalCounterIdHolder instance = new LocalCounterIdHolder();
     }
 
     private final ByteBuffer id;
 
-    private static LocalCounterIdHistory localIds()
+    private static LocalCounterIdHolder localId()
     {
-        return LocalIds.instance;
+        return LocalId.instance;
     }
 
     public static CounterId getLocalId()
     {
-        return localIds().current.get();
-    }
-
-    /**
-     * Renew the local counter id.
-     * To use only when this strictly necessary, as using this will make all
-     * counter context grow with time.
-     */
-    public static void renewLocalId()
-    {
-        renewLocalId(FBUtilities.timestampMicros());
-    }
-
-    public static synchronized void renewLocalId(long now)
-    {
-        localIds().renewCurrent(now);
-    }
-
-    /**
-     * Return the list of old local counter id of this node.
-     * It is guaranteed that the returned list is sorted by growing counter id
-     * (and hence the first item will be the oldest counter id for this host)
-     */
-    public static List<CounterIdRecord> getOldLocalCounterIds()
-    {
-        return localIds().olds;
+        return localId().get();
     }
 
     /**
@@ -163,94 +129,18 @@
         return id.hashCode();
     }
 
-    public static class OneShotRenewer
-    {
-        private boolean renewed;
-        private final CounterId initialId;
-
-        public OneShotRenewer()
-        {
-            renewed = false;
-            initialId = getLocalId();
-        }
-
-        public void maybeRenew(CounterColumn column)
-        {
-            if (!renewed && column.hasCounterId(initialId))
-            {
-                renewLocalId();
-                renewed = true;
-            }
-        }
-    }
-
-    private static class LocalCounterIdHistory
+    private static class LocalCounterIdHolder
     {
         private final AtomicReference<CounterId> current;
-        private final List<CounterIdRecord> olds;
 
-        LocalCounterIdHistory()
+        LocalCounterIdHolder()
         {
-            CounterId id = SystemKeyspace.getCurrentLocalCounterId();
-            if (id == null)
-            {
-                // no recorded local counter id, generating a new one and saving it
-                id = generate();
-                logger.info("No saved local counter id, using newly generated: {}", id);
-                SystemKeyspace.writeCurrentLocalCounterId(id, FBUtilities.timestampMicros());
-                current = new AtomicReference<>(id);
-                olds = new CopyOnWriteArrayList<>();
-            }
-            else
-            {
-                logger.info("Saved local counter id: {}", id);
-                current = new AtomicReference<>(id);
-                olds = new CopyOnWriteArrayList<>(SystemKeyspace.getOldLocalCounterIds());
-            }
+            current = new AtomicReference<>(wrap(ByteBufferUtil.bytes(SystemKeyspace.getLocalHostId())));
         }
 
-        synchronized void renewCurrent(long now)
+        CounterId get()
         {
-            CounterId newCounterId = generate();
-            CounterId old = current.get();
-            SystemKeyspace.writeCurrentLocalCounterId(newCounterId, now);
-            current.set(newCounterId);
-            olds.add(new CounterIdRecord(old, now));
-        }
-    }
-
-    public static class CounterIdRecord
-    {
-        public final CounterId id;
-        public final long timestamp;
-
-        public CounterIdRecord(CounterId id, long timestamp)
-        {
-            this.id = id;
-            this.timestamp = timestamp;
-        }
-
-        @Override
-        public boolean equals(Object o)
-        {
-            if (this == o)
-                return true;
-            if (o == null || getClass() != o.getClass())
-                return false;
-
-            CounterIdRecord otherRecord = (CounterIdRecord)o;
-            return id.equals(otherRecord.id) && timestamp == otherRecord.timestamp;
-        }
-
-        @Override
-        public int hashCode()
-        {
-            return Objects.hashCode(id, timestamp);
-        }
-
-        public String toString()
-        {
-            return String.format("(%s, %d)", id.toString(), timestamp);
+            return current.get();
         }
     }
 }
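
The local counter id is now simply the node's host id wrapped as 16 bytes, which is why the renewal and history machinery disappears. A standalone sketch of that wrapping, assuming the usual UUID layout of most significant bits followed by least significant bits (which is what ByteBufferUtil.bytes of the local host id produces in the new holder):

    import java.nio.ByteBuffer;
    import java.util.UUID;

    // Sketch: serialize a host id (UUID) into the fixed 16-byte buffer that
    // CounterId.LENGTH expects, as the new LocalCounterIdHolder does via
    // ByteBufferUtil.bytes(SystemKeyspace.getLocalHostId()).
    final class HostIdCounterIdSketch
    {
        static ByteBuffer bytes(UUID hostId)
        {
            ByteBuffer buf = ByteBuffer.allocate(16);
            buf.putLong(hostId.getMostSignificantBits());
            buf.putLong(hostId.getLeastSignificantBits());
            buf.flip();
            return buf;
        }

        public static void main(String[] args)
        {
            System.out.println(bytes(UUID.randomUUID()).remaining()); // 16
        }
    }
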
diff --git a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
index 5941057..196a3b9 100644
--- a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
+++ b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLongArray;
@@ -27,6 +26,8 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
 import org.slf4j.Logger;
 
 public class EstimatedHistogram
@@ -317,7 +318,7 @@
 
     public static class EstimatedHistogramSerializer implements ISerializer<EstimatedHistogram>
     {
-        public void serialize(EstimatedHistogram eh, DataOutput out) throws IOException
+        public void serialize(EstimatedHistogram eh, DataOutputPlus out) throws IOException
         {
             long[] offsets = eh.getBucketOffsets();
             long[] buckets = eh.getBuckets(false);
@@ -342,9 +343,19 @@
             return new EstimatedHistogram(offsets, buckets);
         }
 
-        public long serializedSize(EstimatedHistogram object, TypeSizes typeSizes)
+        public long serializedSize(EstimatedHistogram eh, TypeSizes typeSizes)
         {
-            throw new UnsupportedOperationException();
+            int size = 0;
+
+            long[] offsets = eh.getBucketOffsets();
+            long[] buckets = eh.getBuckets(false);
+            size += typeSizes.sizeof(buckets.length);
+            for (int i = 0; i < buckets.length; i++)
+            {
+                size += typeSizes.sizeof(offsets[i == 0 ? 0 : i - 1]);
+                size += typeSizes.sizeof(buckets[i]);
+            }
+            return size;
         }
     }
 }
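
The new serializedSize implementation mirrors the write order of serialize(): a bucket count followed by (offset, bucket) pairs. A minimal standalone sketch of the same accounting, assuming fixed-width encodings (4-byte int, 8-byte long) rather than the project's TypeSizes; names are illustrative:

// Illustrative sketch only: the size computation follows the serialized layout,
// an int count followed by one (offset, bucket) long pair per bucket.
public class HistogramSizeSketch
{
    static long serializedSize(long[] offsets, long[] buckets)
    {
        long size = 4;                       // int: number of buckets
        for (int i = 0; i < buckets.length; i++)
        {
            size += 8;                       // long: offsets[i == 0 ? 0 : i - 1]
            size += 8;                       // long: buckets[i]
        }
        return size;
    }

    public static void main(String[] args)
    {
        long[] offsets = { 1, 2, 4, 8 };
        long[] buckets = { 0, 3, 5, 2, 1 };  // one more bucket than offsets, as in the histogram
        System.out.println(serializedSize(offsets, buckets)); // 4 + 5 * 16 = 84
    }
}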
diff --git a/src/java/org/apache/cassandra/utils/ExpiringMap.java b/src/java/org/apache/cassandra/utils/ExpiringMap.java
index 7eec40e..e7b626c 100644
--- a/src/java/org/apache/cassandra/utils/ExpiringMap.java
+++ b/src/java/org/apache/cassandra/utils/ExpiringMap.java
@@ -91,10 +91,12 @@
                 {
                     if (entry.getValue().isReadyToDieAt(start))
                     {
-                        cache.remove(entry.getKey());
-                        n++;
-                        if (postExpireHook != null)
-                            postExpireHook.apply(Pair.create(entry.getKey(), entry.getValue()));
+                        if (cache.remove(entry.getKey()) != null)
+                        {
+                            n++;
+                            if (postExpireHook != null)
+                                postExpireHook.apply(Pair.create(entry.getKey(), entry.getValue()));
+                        }
                     }
                 }
                 logger.trace("Expired {} entries", n);
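
The change above makes the sweep count an entry and fire the post-expire hook only when remove() actually removed it, so a concurrent removal of the same key cannot be reported or hooked twice. A standalone sketch of the guarded-remove pattern (illustrative only, not the project's ExpiringMap):

// Illustrative sketch only: checking the return value of remove() guarantees the
// expiry hook runs at most once per entry even under concurrent removal.
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ExpirySweepSketch
{
    public static void main(String[] args)
    {
        Map<String, Long> cache = new ConcurrentHashMap<>();
        cache.put("a", 1L);

        int expired = 0;
        for (Map.Entry<String, Long> entry : cache.entrySet())
        {
            if (cache.remove(entry.getKey()) != null)   // null => another thread already removed it
            {
                expired++;
                // post-expire hook would run here, exactly once per entry
            }
        }
        System.out.println("Expired " + expired + " entries");
    }
}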
diff --git a/src/java/org/apache/cassandra/utils/FBUtilities.java b/src/java/org/apache/cassandra/utils/FBUtilities.java
index 9f09868..7e9adab 100644
--- a/src/java/org/apache/cassandra/utils/FBUtilities.java
+++ b/src/java/org/apache/cassandra/utils/FBUtilities.java
@@ -100,15 +100,6 @@
         }
     };
 
-    private static final ThreadLocal<Random> localRandom = new ThreadLocal<Random>()
-    {
-        @Override
-        protected Random initialValue()
-        {
-            return new Random();
-        }
-    };
-
     public static final int MAX_UNSIGNED_SHORT = 0xFFFF;
 
     public static MessageDigest threadLocalMD5Digest()
@@ -128,11 +119,6 @@
         }
     }
 
-    public static Random threadLocalRandom()
-    {
-        return localRandom.get();
-    }
-
     /**
      * Please use getBroadcastAddress instead. You need this only when you have to listen/connect.
      */
@@ -213,7 +199,7 @@
 
     public static int compareUnsigned(byte[] bytes1, byte[] bytes2, int offset1, int offset2, int len1, int len2)
     {
-        return FastByteComparisons.compareTo(bytes1, offset1, len1, bytes2, offset2, len2);
+        return FastByteOperations.compareUnsigned(bytes1, offset1, len1, bytes2, offset2, len2);
     }
 
     public static int compareUnsigned(byte[] bytes1, byte[] bytes2)
@@ -245,13 +231,6 @@
         return out;
     }
 
-    public static BigInteger hashToBigInteger(ByteBuffer data)
-    {
-        byte[] result = hash(data);
-        BigInteger hash = new BigInteger(result);
-        return hash.abs();
-    }
-
     public static byte[] hash(ByteBuffer... data)
     {
         MessageDigest messageDigest = localMD5Digest.get();
@@ -266,6 +245,11 @@
         return messageDigest.digest();
     }
 
+    public static BigInteger hashToBigInteger(ByteBuffer data)
+    {
+        return new BigInteger(hash(data)).abs();
+    }
+
     @Deprecated
     public static void serialize(TSerializer serializer, TBase struct, DataOutput out)
     throws IOException
@@ -315,8 +299,8 @@
             {
                 public int compare(DecoratedKey o1, DecoratedKey o2)
                 {
-                    if ((right.compareTo(o1.token) < 0 && right.compareTo(o2.token) < 0)
-                        || (right.compareTo(o1.token) > 0 && right.compareTo(o2.token) > 0))
+                    if ((right.compareTo(o1.getToken()) < 0 && right.compareTo(o2.getToken()) < 0)
+                        || (right.compareTo(o1.getToken()) > 0 && right.compareTo(o2.getToken()) > 0))
                     {
                         // both tokens are on the same side of the wrap point
                         return o1.compareTo(o2);
@@ -372,7 +356,7 @@
             in = FBUtilities.class.getClassLoader().getResourceAsStream("org/apache/cassandra/config/version.properties");
             if (in == null)
             {
-                return "Unknown";
+                return System.getProperty("cassandra.releaseVersion", "Unknown");
             }
             Properties props = new Properties();
             props.load(in);
@@ -504,9 +488,9 @@
         }
     }
 
-    public static <T> SortedSet<T> singleton(T column, Comparator<T> comparator)
+    public static <T> SortedSet<T> singleton(T column, Comparator<? super T> comparator)
     {
-        TreeSet<T> s = new TreeSet<>(comparator);
+        SortedSet<T> s = new TreeSet<T>(comparator);
         s.add(column);
         return s;
     }
@@ -695,4 +679,35 @@
     {
         return OPERATING_SYSTEM.contains("nix") || OPERATING_SYSTEM.contains("nux") || OPERATING_SYSTEM.contains("aix");
     }
+
+    public static void updateWithShort(MessageDigest digest, int val)
+    {
+        digest.update((byte) ((val >> 8) & 0xFF));
+        digest.update((byte) (val & 0xFF));
+    }
+
+    public static void updateWithByte(MessageDigest digest, int val)
+    {
+        digest.update((byte) (val & 0xFF));
+    }
+
+    public static void updateWithInt(MessageDigest digest, int val)
+    {
+        digest.update((byte) ((val >>> 24) & 0xFF));
+        digest.update((byte) ((val >>> 16) & 0xFF));
+        digest.update((byte) ((val >>>  8) & 0xFF));
+        digest.update((byte) ((val >>>  0) & 0xFF));
+    }
+
+    public static void updateWithLong(MessageDigest digest, long val)
+    {
+        digest.update((byte) ((val >>> 56) & 0xFF));
+        digest.update((byte) ((val >>> 48) & 0xFF));
+        digest.update((byte) ((val >>> 40) & 0xFF));
+        digest.update((byte) ((val >>> 32) & 0xFF));
+        digest.update((byte) ((val >>> 24) & 0xFF));
+        digest.update((byte) ((val >>> 16) & 0xFF));
+        digest.update((byte) ((val >>>  8) & 0xFF));
+        digest.update((byte) ((val >>>  0) & 0xFF));
+    }
 }
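
The new updateWith* helpers feed primitives into a MessageDigest one byte at a time in big-endian order. A standalone sketch showing that the int variant produces the same digest as writing the value through a ByteBuffer (big-endian by default); the class name is illustrative:

// Illustrative sketch only: big-endian, byte-at-a-time digest updates are equivalent
// to digesting the bytes of a default-order ByteBuffer holding the same value.
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class DigestUpdateSketch
{
    static void updateWithInt(MessageDigest digest, int val)
    {
        digest.update((byte) ((val >>> 24) & 0xFF));
        digest.update((byte) ((val >>> 16) & 0xFF));
        digest.update((byte) ((val >>>  8) & 0xFF));
        digest.update((byte) (val & 0xFF));
    }

    public static void main(String[] args) throws NoSuchAlgorithmException
    {
        MessageDigest a = MessageDigest.getInstance("MD5");
        MessageDigest b = MessageDigest.getInstance("MD5");

        updateWithInt(a, 42);
        b.update(ByteBuffer.allocate(4).putInt(42).array()); // ByteBuffer writes big-endian by default

        System.out.println(MessageDigest.isEqual(a.digest(), b.digest())); // true
    }
}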
diff --git a/src/java/org/apache/cassandra/utils/FastByteComparisons.java b/src/java/org/apache/cassandra/utils/FastByteComparisons.java
deleted file mode 100644
index 4be6cd4..0000000
--- a/src/java/org/apache/cassandra/utils/FastByteComparisons.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.lang.reflect.Field;
-import java.nio.ByteOrder;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-
-import sun.misc.Unsafe;
-
-import com.google.common.primitives.Longs;
-import com.google.common.primitives.UnsignedBytes;
-
-/**
- * Utility code to do optimized byte-array comparison.
- * This is borrowed and slightly modified from Guava's {@link UnsignedBytes}
- * class to be able to compare arrays that start at non-zero offsets.
- */
-abstract class FastByteComparisons {
-
-  /**
-   * Lexicographically compare two byte arrays.
-   */
-  public static int compareTo(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-    return LexicographicalComparerHolder.BEST_COMPARER.compareTo(
-        b1, s1, l1, b2, s2, l2);
-  }
-
-  private interface Comparer<T> {
-    abstract public int compareTo(T buffer1, int offset1, int length1,
-        T buffer2, int offset2, int length2);
-  }
-
-  private static Comparer<byte[]> lexicographicalComparerJavaImpl() {
-    return LexicographicalComparerHolder.PureJavaComparer.INSTANCE;
-  }
-
-
-  /**
-   * Provides a lexicographical comparer implementation; either a Java
-   * implementation or a faster implementation based on {@link Unsafe}.
-   *
-   * <p>Uses reflection to gracefully fall back to the Java implementation if
-   * {@code Unsafe} isn't available.
-   */
-  private static class LexicographicalComparerHolder {
-    static final String UNSAFE_COMPARER_NAME =
-        LexicographicalComparerHolder.class.getName() + "$UnsafeComparer";
-
-    static final Comparer<byte[]> BEST_COMPARER = getBestComparer();
-    /**
-     * Returns the Unsafe-using Comparer, or falls back to the pure-Java
-     * implementation if unable to do so.
-     */
-    static Comparer<byte[]> getBestComparer() {
-      String arch = System.getProperty("os.arch");
-      boolean unaligned = arch.equals("i386") || arch.equals("x86")
-                    || arch.equals("amd64") || arch.equals("x86_64");
-      if (!unaligned)
-        return lexicographicalComparerJavaImpl();
-      try {
-        Class<?> theClass = Class.forName(UNSAFE_COMPARER_NAME);
-
-        // yes, UnsafeComparer does implement Comparer<byte[]>
-        @SuppressWarnings("unchecked")
-        Comparer<byte[]> comparer =
-          (Comparer<byte[]>) theClass.getEnumConstants()[0];
-        return comparer;
-      } catch (Throwable t) { // ensure we really catch *everything*
-        return lexicographicalComparerJavaImpl();
-      }
-    }
-
-    private enum PureJavaComparer implements Comparer<byte[]> {
-      INSTANCE;
-
-      @Override
-      public int compareTo(byte[] buffer1, int offset1, int length1,
-          byte[] buffer2, int offset2, int length2) {
-        // Short circuit equal case
-        if (buffer1 == buffer2 &&
-            offset1 == offset2 &&
-            length1 == length2) {
-          return 0;
-        }
-        int end1 = offset1 + length1;
-        int end2 = offset2 + length2;
-        for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
-          int a = (buffer1[i] & 0xff);
-          int b = (buffer2[j] & 0xff);
-          if (a != b) {
-            return a - b;
-          }
-        }
-        return length1 - length2;
-      }
-    }
-
-    @SuppressWarnings("unused") // used via reflection
-    private enum UnsafeComparer implements Comparer<byte[]> {
-      INSTANCE;
-
-      static final Unsafe theUnsafe;
-
-      /** The offset to the first element in a byte array. */
-      static final int BYTE_ARRAY_BASE_OFFSET;
-
-      static {
-        theUnsafe = (Unsafe) AccessController.doPrivileged(
-            new PrivilegedAction<Object>() {
-              @Override
-              public Object run() {
-                try {
-                  Field f = Unsafe.class.getDeclaredField("theUnsafe");
-                  f.setAccessible(true);
-                  return f.get(null);
-                } catch (NoSuchFieldException e) {
-                  // It doesn't matter what we throw;
-                  // it's swallowed in getBestComparer().
-                  throw new Error();
-                } catch (IllegalAccessException e) {
-                  throw new Error();
-                }
-              }
-            });
-
-        BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class);
-
-        // sanity check - this should never fail
-        if (theUnsafe.arrayIndexScale(byte[].class) != 1) {
-          throw new AssertionError();
-        }
-      }
-
-      static final boolean littleEndian =
-        ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN);
-
-      /**
-       * Returns true if x1 is less than x2, when both values are treated as
-       * unsigned.
-       */
-      static boolean lessThanUnsigned(long x1, long x2) {
-        return (x1 + Long.MIN_VALUE) < (x2 + Long.MIN_VALUE);
-      }
-
-      /**
-       * Lexicographically compare two arrays.
-       *
-       * @param buffer1 left operand
-       * @param buffer2 right operand
-       * @param offset1 Where to start comparing in the left buffer
-       * @param offset2 Where to start comparing in the right buffer
-       * @param length1 How much to compare from the left buffer
-       * @param length2 How much to compare from the right buffer
-       * @return 0 if equal, < 0 if left is less than right, etc.
-       */
-      @Override
-      public int compareTo(byte[] buffer1, int offset1, int length1,
-          byte[] buffer2, int offset2, int length2) {
-        // Short circuit equal case
-        if (buffer1 == buffer2 &&
-            offset1 == offset2 &&
-            length1 == length2) {
-          return 0;
-        }
-        int minLength = Math.min(length1, length2);
-        int minWords = minLength / Longs.BYTES;
-        int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET;
-        int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET;
-
-        /*
-         * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
-         * time is no slower than comparing 4 bytes at a time even on 32-bit.
-         * On the other hand, it is substantially faster on 64-bit.
-         */
-        for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) {
-          long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i);
-          long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i);
-          long diff = lw ^ rw;
-
-          if (diff != 0) {
-            if (!littleEndian) {
-              return lessThanUnsigned(lw, rw) ? -1 : 1;
-            }
-
-            // Use binary search
-            int n = 0;
-            int y;
-            int x = (int) diff;
-            if (x == 0) {
-              x = (int) (diff >>> 32);
-              n = 32;
-            }
-
-            y = x << 16;
-            if (y == 0) {
-              n += 16;
-            } else {
-              x = y;
-            }
-
-            y = x << 8;
-            if (y == 0) {
-              n += 8;
-            }
-            return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL));
-          }
-        }
-
-        // The epilogue to cover the last (minLength % 8) elements.
-        for (int i = minWords * Longs.BYTES; i < minLength; i++) {
-          int result = UnsignedBytes.compare(
-              buffer1[offset1 + i],
-              buffer2[offset2 + i]);
-          if (result != 0) {
-            return result;
-          }
-        }
-        return length1 - length2;
-      }
-
-    }
-  }
-}
diff --git a/src/java/org/apache/cassandra/utils/FastByteOperations.java b/src/java/org/apache/cassandra/utils/FastByteOperations.java
new file mode 100644
index 0000000..6e25492
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/FastByteOperations.java
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.lang.reflect.Field;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+
+import com.google.common.primitives.*;
+
+import net.nicoulaj.compilecommand.annotations.Inline;
+import sun.misc.Unsafe;
+
+/**
+ * Utility code to do optimized byte-array comparison.
+ * This is borrowed and slightly modified from Guava's {@link UnsignedBytes}
+ * class to be able to compare arrays that start at non-zero offsets.
+ */
+public class FastByteOperations
+{
+
+    /**
+     * Lexicographically compare two byte arrays.
+     */
+    public static int compareUnsigned(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
+    {
+        return BestHolder.BEST.compare(b1, s1, l1, b2, s2, l2);
+    }
+
+    public static int compareUnsigned(ByteBuffer b1, byte[] b2, int s2, int l2)
+    {
+        return BestHolder.BEST.compare(b1, b2, s2, l2);
+    }
+
+    public static int compareUnsigned(byte[] b1, int s1, int l1, ByteBuffer b2)
+    {
+        return -BestHolder.BEST.compare(b2, b1, s1, l1);
+    }
+
+    public static int compareUnsigned(ByteBuffer b1, ByteBuffer b2)
+    {
+        return BestHolder.BEST.compare(b1, b2);
+    }
+
+    public static void copy(ByteBuffer src, int srcPosition, byte[] trg, int trgPosition, int length)
+    {
+        BestHolder.BEST.copy(src, srcPosition, trg, trgPosition, length);
+    }
+
+    public static void copy(ByteBuffer src, int srcPosition, ByteBuffer trg, int trgPosition, int length)
+    {
+        BestHolder.BEST.copy(src, srcPosition, trg, trgPosition, length);
+    }
+
+    public interface ByteOperations
+    {
+        abstract public int compare(byte[] buffer1, int offset1, int length1,
+                                    byte[] buffer2, int offset2, int length2);
+
+        abstract public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2);
+
+        abstract public int compare(ByteBuffer buffer1, ByteBuffer buffer2);
+
+        abstract public void copy(ByteBuffer src, int srcPosition, byte[] trg, int trgPosition, int length);
+
+        abstract public void copy(ByteBuffer src, int srcPosition, ByteBuffer trg, int trgPosition, int length);
+    }
+
+    /**
+     * Provides a lexicographical comparer implementation; either a Java
+     * implementation or a faster implementation based on {@link Unsafe}.
+     * <p/>
+     * <p>Uses reflection to gracefully fall back to the Java implementation if
+     * {@code Unsafe} isn't available.
+     */
+    private static class BestHolder
+    {
+        static final String UNSAFE_COMPARER_NAME = FastByteOperations.class.getName() + "$UnsafeOperations";
+        static final ByteOperations BEST = getBest();
+
+        /**
+         * Returns the Unsafe-using Comparer, or falls back to the pure-Java
+         * implementation if unable to do so.
+         */
+        static ByteOperations getBest()
+        {
+            String arch = System.getProperty("os.arch");
+            boolean unaligned = arch.equals("i386") || arch.equals("x86")
+                                || arch.equals("amd64") || arch.equals("x86_64");
+            if (!unaligned)
+                return new PureJavaOperations();
+            try
+            {
+                Class<?> theClass = Class.forName(UNSAFE_COMPARER_NAME);
+
+                // yes, UnsafeOperations does implement ByteOperations
+                @SuppressWarnings("unchecked")
+                ByteOperations comparer = (ByteOperations) theClass.getConstructor().newInstance();
+                return comparer;
+            }
+            catch (Throwable t)
+            {
+                JVMStabilityInspector.inspectThrowable(t);
+                // ensure we really catch *everything*
+                return new PureJavaOperations();
+            }
+        }
+
+    }
+
+    @SuppressWarnings("unused") // used via reflection
+    public static final class UnsafeOperations implements ByteOperations
+    {
+        static final Unsafe theUnsafe;
+        /**
+         * The offset to the first element in a byte array.
+         */
+        static final long BYTE_ARRAY_BASE_OFFSET;
+        static final long DIRECT_BUFFER_ADDRESS_OFFSET;
+
+        static
+        {
+            theUnsafe = (Unsafe) AccessController.doPrivileged(
+                      new PrivilegedAction<Object>()
+                      {
+                          @Override
+                          public Object run()
+                          {
+                              try
+                              {
+                                  Field f = Unsafe.class.getDeclaredField("theUnsafe");
+                                  f.setAccessible(true);
+                                  return f.get(null);
+                              }
+                              catch (NoSuchFieldException e)
+                              {
+                                  // It doesn't matter what we throw;
+                                  // it's swallowed in getBest().
+                                  throw new Error();
+                              }
+                              catch (IllegalAccessException e)
+                              {
+                                  throw new Error();
+                              }
+                          }
+                      });
+
+            try
+            {
+                BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class);
+                DIRECT_BUFFER_ADDRESS_OFFSET = theUnsafe.objectFieldOffset(Buffer.class.getDeclaredField("address"));
+            }
+            catch (Exception e)
+            {
+                throw new AssertionError(e);
+            }
+
+            // sanity check - this should never fail
+            if (theUnsafe.arrayIndexScale(byte[].class) != 1)
+            {
+                throw new AssertionError();
+            }
+        }
+
+        static final boolean BIG_ENDIAN = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
+        public int compare(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2)
+        {
+            return compareTo(buffer1, BYTE_ARRAY_BASE_OFFSET + offset1, length1,
+                             buffer2, BYTE_ARRAY_BASE_OFFSET + offset2, length2);
+        }
+
+        public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2)
+        {
+            Object obj1;
+            long offset1;
+            if (buffer1.hasArray())
+            {
+                obj1 = buffer1.array();
+                offset1 = BYTE_ARRAY_BASE_OFFSET + buffer1.arrayOffset();
+            }
+            else
+            {
+                obj1 = null;
+                offset1 = theUnsafe.getLong(buffer1, DIRECT_BUFFER_ADDRESS_OFFSET);
+            }
+            int length1;
+            {
+                int position = buffer1.position();
+                int limit = buffer1.limit();
+                length1 = limit - position;
+                offset1 += position;
+            }
+            return compareTo(obj1, offset1, length1, buffer2, BYTE_ARRAY_BASE_OFFSET + offset2, length2);
+        }
+
+        public int compare(ByteBuffer buffer1, ByteBuffer buffer2)
+        {
+            return compareTo(buffer1, buffer2);
+        }
+
+        public void copy(ByteBuffer src, int srcPosition, byte[] trg, int trgPosition, int length)
+        {
+            if (src.hasArray())
+                System.arraycopy(src.array(), src.arrayOffset() + srcPosition, trg, trgPosition, length);
+            else
+                copy(null, srcPosition + theUnsafe.getLong(src, DIRECT_BUFFER_ADDRESS_OFFSET), trg, trgPosition, length);
+        }
+
+        public void copy(ByteBuffer srcBuf, int srcPosition, ByteBuffer trgBuf, int trgPosition, int length)
+        {
+            Object src;
+            long srcOffset;
+            if (srcBuf.hasArray())
+            {
+                src = srcBuf.array();
+                srcOffset = BYTE_ARRAY_BASE_OFFSET + srcBuf.arrayOffset();
+            }
+            else
+            {
+                src = null;
+                srcOffset = theUnsafe.getLong(srcBuf, DIRECT_BUFFER_ADDRESS_OFFSET);
+            }
+            copy(src, srcOffset + srcPosition, trgBuf, trgPosition, length);
+        }
+
+        public static void copy(Object src, long srcOffset, ByteBuffer trgBuf, int trgPosition, int length)
+        {
+            if (trgBuf.hasArray())
+                copy(src, srcOffset, trgBuf.array(), trgBuf.arrayOffset() + trgPosition, length);
+            else
+                copy(src, srcOffset, null, trgPosition + theUnsafe.getLong(trgBuf, DIRECT_BUFFER_ADDRESS_OFFSET), length);
+        }
+
+        public static void copy(Object src, long srcOffset, byte[] trg, int trgPosition, int length)
+        {
+            if (length <= MIN_COPY_THRESHOLD)
+            {
+                for (int i = 0 ; i < length ; i++)
+                    trg[trgPosition + i] = theUnsafe.getByte(src, srcOffset + i);
+            }
+            else
+            {
+                copy(src, srcOffset, trg, BYTE_ARRAY_BASE_OFFSET + trgPosition, length);
+            }
+        }
+
+        // 1M, copied from java.nio.Bits (unfortunately a package-private class)
+        private static final long UNSAFE_COPY_THRESHOLD = 1 << 20;
+        private static final long MIN_COPY_THRESHOLD = 6;
+
+        public static void copy(Object src, long srcOffset, Object dst, long dstOffset, long length)
+        {
+            while (length > 0) {
+                long size = (length > UNSAFE_COPY_THRESHOLD) ? UNSAFE_COPY_THRESHOLD : length;
+                // if src or dst are null, the offsets are absolute base addresses:
+                theUnsafe.copyMemory(src, srcOffset, dst, dstOffset, size);
+                length -= size;
+                srcOffset += size;
+                dstOffset += size;
+            }
+        }
+
+        @Inline
+        public static int compareTo(ByteBuffer buffer1, ByteBuffer buffer2)
+        {
+            Object obj1;
+            long offset1;
+            int length1;
+            if (buffer1.hasArray())
+            {
+                obj1 = buffer1.array();
+                offset1 = BYTE_ARRAY_BASE_OFFSET + buffer1.arrayOffset();
+            }
+            else
+            {
+                obj1 = null;
+                offset1 = theUnsafe.getLong(buffer1, DIRECT_BUFFER_ADDRESS_OFFSET);
+            }
+            offset1 += buffer1.position();
+            length1 = buffer1.remaining();
+            return compareTo(obj1, offset1, length1, buffer2);
+        }
+
+        @Inline
+        public static int compareTo(Object buffer1, long offset1, int length1, ByteBuffer buffer)
+        {
+            Object obj2;
+            long offset2;
+
+            int position = buffer.position();
+            int limit = buffer.limit();
+            if (buffer.hasArray())
+            {
+                obj2 = buffer.array();
+                offset2 = BYTE_ARRAY_BASE_OFFSET + buffer.arrayOffset();
+            }
+            else
+            {
+                obj2 = null;
+                offset2 = theUnsafe.getLong(buffer, DIRECT_BUFFER_ADDRESS_OFFSET);
+            }
+            int length2 = limit - position;
+            offset2 += position;
+
+            return compareTo(buffer1, offset1, length1, obj2, offset2, length2);
+        }
+
+        /**
+         * Lexicographically compare two arrays.
+         *
+         * @param buffer1 left operand: a byte[] or null
+         * @param buffer2 right operand: a byte[] or null
+         * @param memoryOffset1 Where to start comparing in the left buffer (pure memory address if buffer1 is null, or relative otherwise)
+         * @param memoryOffset2 Where to start comparing in the right buffer (pure memory address if buffer2 is null, or relative otherwise)
+         * @param length1 How much to compare from the left buffer
+         * @param length2 How much to compare from the right buffer
+         * @return 0 if equal, < 0 if left is less than right, etc.
+         */
+        @Inline
+        public static int compareTo(Object buffer1, long memoryOffset1, int length1,
+                             Object buffer2, long memoryOffset2, int length2)
+        {
+            int minLength = Math.min(length1, length2);
+
+            /*
+             * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
+             * time is no slower than comparing 4 bytes at a time even on 32-bit.
+             * On the other hand, it is substantially faster on 64-bit.
+             */
+            int wordComparisons = minLength & ~7;
+            for (int i = 0; i < wordComparisons ; i += Longs.BYTES)
+            {
+                long lw = theUnsafe.getLong(buffer1, memoryOffset1 + (long) i);
+                long rw = theUnsafe.getLong(buffer2, memoryOffset2 + (long) i);
+
+                if (lw != rw)
+                {
+                    if (BIG_ENDIAN)
+                        return UnsignedLongs.compare(lw, rw);
+
+                    return UnsignedLongs.compare(Long.reverseBytes(lw), Long.reverseBytes(rw));
+                }
+            }
+
+            for (int i = wordComparisons ; i < minLength ; i++)
+            {
+                int b1 = theUnsafe.getByte(buffer1, memoryOffset1 + i) & 0xFF;
+                int b2 = theUnsafe.getByte(buffer2, memoryOffset2 + i) & 0xFF;
+                if (b1 != b2)
+                    return b1 - b2;
+            }
+
+            return length1 - length2;
+        }
+
+    }
+
+    @SuppressWarnings("unused")
+    public static final class PureJavaOperations implements ByteOperations
+    {
+        @Override
+        public int compare(byte[] buffer1, int offset1, int length1,
+                           byte[] buffer2, int offset2, int length2)
+        {
+            // Short circuit equal case
+            if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2)
+                return 0;
+
+            int end1 = offset1 + length1;
+            int end2 = offset2 + length2;
+            for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++)
+            {
+                int a = (buffer1[i] & 0xff);
+                int b = (buffer2[j] & 0xff);
+                if (a != b)
+                {
+                    return a - b;
+                }
+            }
+            return length1 - length2;
+        }
+
+        public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2)
+        {
+            if (buffer1.hasArray())
+                return compare(buffer1.array(), buffer1.arrayOffset() + buffer1.position(), buffer1.remaining(),
+                               buffer2, offset2, length2);
+            return compare(buffer1, ByteBuffer.wrap(buffer2, offset2, length2));
+        }
+
+        public int compare(ByteBuffer buffer1, ByteBuffer buffer2)
+        {
+            int end1 = buffer1.limit();
+            int end2 = buffer2.limit();
+            for (int i = buffer1.position(), j = buffer2.position(); i < end1 && j < end2; i++, j++)
+            {
+                int a = (buffer1.get(i) & 0xff);
+                int b = (buffer2.get(j) & 0xff);
+                if (a != b)
+                {
+                    return a - b;
+                }
+            }
+            return buffer1.remaining() - buffer2.remaining();
+        }
+
+        public void copy(ByteBuffer src, int srcPosition, byte[] trg, int trgPosition, int length)
+        {
+            if (src.hasArray())
+            {
+                System.arraycopy(src.array(), src.arrayOffset() + srcPosition, trg, trgPosition, length);
+                return;
+            }
+            src = src.duplicate();
+            src.position(srcPosition);
+            src.get(trg, trgPosition, length);
+        }
+
+        public void copy(ByteBuffer src, int srcPosition, ByteBuffer trg, int trgPosition, int length)
+        {
+            if (src.hasArray() && trg.hasArray())
+            {
+                System.arraycopy(src.array(), src.arrayOffset() + srcPosition, trg.array(), trg.arrayOffset() + trgPosition, length);
+                return;
+            }
+            src = src.duplicate();
+            src.position(srcPosition).limit(srcPosition + length);
+            trg = trg.duplicate();
+            trg.position(trgPosition);
+            trg.put(src);
+        }
+    }
+}
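
The new class exposes unsigned lexicographic comparison over byte[] ranges and ByteBuffers, choosing the Unsafe-backed implementation when the platform allows and falling back to pure Java otherwise. A standalone sketch of the comparison semantics the pure-Java path provides (illustrative only, not the project's implementation):

// Illustrative sketch only: lexicographic comparison of byte ranges treating each
// byte as unsigned, with the shorter input ordering first on a common prefix.
public class UnsignedCompareSketch
{
    static int compareUnsigned(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
    {
        int min = Math.min(l1, l2);
        for (int i = 0; i < min; i++)
        {
            int a = b1[s1 + i] & 0xFF;   // mask to read the byte as an unsigned value
            int b = b2[s2 + i] & 0xFF;
            if (a != b)
                return a - b;
        }
        return l1 - l2;
    }

    public static void main(String[] args)
    {
        byte[] x = { 0x01, (byte) 0xFF };
        byte[] y = { 0x01, 0x02 };
        // 0xFF is 255 unsigned, so x sorts after y
        System.out.println(compareUnsigned(x, 0, x.length, y, 0, y.length) > 0); // true
    }
}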
diff --git a/src/java/org/apache/cassandra/utils/FilterFactory.java b/src/java/org/apache/cassandra/utils/FilterFactory.java
index e50cbda..757e8dd 100644
--- a/src/java/org/apache/cassandra/utils/FilterFactory.java
+++ b/src/java/org/apache/cassandra/utils/FilterFactory.java
@@ -18,10 +18,9 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.obs.IBitSet;
 import org.apache.cassandra.utils.obs.OffHeapBitSet;
 import org.apache.cassandra.utils.obs.OpenBitSet;
@@ -36,7 +35,7 @@
     private static final Logger logger = LoggerFactory.getLogger(FilterFactory.class);
     private static final long BITSET_EXCESS = 20;
 
-    public static void serialize(IFilter bf, DataOutput output) throws IOException
+    public static void serialize(IFilter bf, DataOutputPlus output) throws IOException
     {
         Murmur3BloomFilter.serializer.serialize((Murmur3BloomFilter) bf, output);
     }
diff --git a/src/java/org/apache/cassandra/utils/Hex.java b/src/java/org/apache/cassandra/utils/Hex.java
index 463c49a..5ba0b1a 100644
--- a/src/java/org/apache/cassandra/utils/Hex.java
+++ b/src/java/org/apache/cassandra/utils/Hex.java
@@ -18,11 +18,16 @@
 package org.apache.cassandra.utils;
 
 import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class Hex
 {
     private static final Constructor<String> stringConstructor = getProtectedConstructor(String.class, int.class, int.class, char[].class);
     private final static byte[] charToByte = new byte[256];
+    private static final Logger logger = LoggerFactory.getLogger(Hex.class);
 
     // package protected for use by ByteBufferUtil. Do not modify this array !!
     static final char[] byteToChar = new char[16];
@@ -91,6 +96,12 @@
             try
             {
                 s = stringConstructor.newInstance(0, c.length, c);
+            }
+            catch (InvocationTargetException ite)
+            {
+                // The underlying constructor failed. Unwrapping and logging the cause.
+                logger.error("Underlying string constructor threw an error: {}",
+                             ite.getCause() == null ? ite.getMessage() : ite.getCause().getMessage());
             }
             catch (Exception e)
             {
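
The added catch handles the case where the reflectively invoked String constructor throws: the real failure arrives wrapped in InvocationTargetException and is recovered via getCause(). A standalone sketch of that unwrapping pattern (illustrative names, using a deliberately failing Integer(String) constructor):

// Illustrative sketch only: a reflective constructor call wraps the underlying
// exception in InvocationTargetException; getCause() exposes the original failure.
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;

public class UnwrapInvocationTargetSketch
{
    public static void main(String[] args) throws Exception
    {
        Constructor<Integer> ctor = Integer.class.getConstructor(String.class);
        try
        {
            ctor.newInstance("not a number");     // NumberFormatException inside the constructor
        }
        catch (InvocationTargetException ite)
        {
            Throwable cause = ite.getCause();     // the real NumberFormatException
            System.out.println("Underlying failure: " + cause);
        }
    }
}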
diff --git a/src/java/org/apache/cassandra/utils/IFilter.java b/src/java/org/apache/cassandra/utils/IFilter.java
index 10f6df2..91c0e36 100644
--- a/src/java/org/apache/cassandra/utils/IFilter.java
+++ b/src/java/org/apache/cassandra/utils/IFilter.java
@@ -29,4 +29,6 @@
     void clear();
 
     long serializedSize();
+
+    void close();
 }
diff --git a/src/java/org/apache/cassandra/utils/IntervalTree.java b/src/java/org/apache/cassandra/utils/IntervalTree.java
index 2b81516..3755c54 100644
--- a/src/java/org/apache/cassandra/utils/IntervalTree.java
+++ b/src/java/org/apache/cassandra/utils/IntervalTree.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
@@ -34,6 +33,7 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class IntervalTree<C, D, I extends Interval<C, D>> implements Iterable<I>
 {
@@ -91,7 +91,7 @@
 
     public static <C, D, I extends Interval<C, D>> Serializer<C, D, I> serializer(ISerializer<C> pointSerializer, ISerializer<D> dataSerializer, Constructor<I> constructor)
     {
-        return new Serializer(pointSerializer, dataSerializer, constructor);
+        return new Serializer<>(pointSerializer, dataSerializer, constructor);
     }
 
     @SuppressWarnings("unchecked")
@@ -390,7 +390,7 @@
             this.constructor = constructor;
         }
 
-        public void serialize(IntervalTree<C, D, I> it, DataOutput out, int version) throws IOException
+        public void serialize(IntervalTree<C, D, I> it, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(it.count);
             for (Interval<C, D> interval : it)
diff --git a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
new file mode 100644
index 0000000..9fdc5ea
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.cassandra.service.StorageService;
+
+public class JVMStabilityInspector
+{
+    private static final Logger logger = LoggerFactory.getLogger(JVMStabilityInspector.class);
+    /**
+     * Certain Throwables and Exceptions represent "Stop" conditions for the server.
+     * @param t
+     *      The Throwable to check for server-stop conditions
+     */
+    public static void inspectThrowable(Throwable t)
+    {
+        boolean isUnstable = false;
+        if (t instanceof OutOfMemoryError)
+            isUnstable = true;
+        if (isUnstable)
+        {
+            t.printStackTrace(System.err);
+            logger.error("JVM state determined to be unstable.  Exiting forcefully due to:", t);
+            StorageService.instance.removeShutdownHook();
+            System.exit(100);
+        }
+    }
+}
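
The new inspector centralises the "is this error fatal to the JVM?" decision so broad catch blocks can consult it before ordinary logging. A standalone sketch of the catch-inspect-log pattern, assuming the same OutOfMemoryError check (names illustrative, not project code):

// Illustrative sketch only: every broad catch first asks the inspector whether the
// error is fatal (e.g. OutOfMemoryError) and only then falls back to normal handling.
public class StabilityInspectorSketch
{
    static void inspectThrowable(Throwable t)
    {
        if (t instanceof OutOfMemoryError)
        {
            t.printStackTrace(System.err);
            System.exit(100);           // force the JVM down rather than limp along
        }
    }

    public static void main(String[] args)
    {
        try
        {
            riskyOperation();
        }
        catch (Throwable t)
        {
            inspectThrowable(t);        // exits here if the JVM is deemed unstable
            System.err.println("Timed run failed: " + t);
        }
    }

    static void riskyOperation()
    {
        throw new RuntimeException("recoverable failure");
    }
}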
diff --git a/src/java/org/apache/cassandra/utils/LatencyTracker.java b/src/java/org/apache/cassandra/utils/LatencyTracker.java
deleted file mode 100644
index 02a07e9..0000000
--- a/src/java/org/apache/cassandra/utils/LatencyTracker.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.atomic.AtomicLong;
-
-public class LatencyTracker
-{
-    private final AtomicLong opCount = new AtomicLong(0);
-    private final AtomicLong totalLatency = new AtomicLong(0);
-    private long lastLatency = 0;
-    private long lastOpCount = 0;
-    private final EstimatedHistogram totalHistogram = new EstimatedHistogram();
-    private final EstimatedHistogram recentHistogram = new EstimatedHistogram();
-
-    /** takes nanoseconds **/
-    public void addNano(long nanos)
-    {
-        // convert to microseconds.  1 millionth
-        addMicro(nanos / 1000);
-    }
-
-    public void addMicro(long micros)
-    {
-        opCount.incrementAndGet();
-        totalLatency.addAndGet(micros);
-        totalHistogram.add(micros);
-        recentHistogram.add(micros);
-    }
-
-    public long getOpCount()
-    {
-        return opCount.get();
-    }
-
-    /** returns  microseconds */
-    public long getTotalLatencyMicros()
-    {
-        return totalLatency.get();
-    }
-
-    /** returns microseconds */
-    public double getRecentLatencyMicros()
-    {
-        long ops = opCount.get();
-        long n = totalLatency.get();
-        try
-        {
-            return ((double)n - lastLatency) / (ops - lastOpCount);
-        }
-        finally
-        {
-            lastLatency = n;
-            lastOpCount = ops;
-        }
-    }
-
-    public long[] getTotalLatencyHistogramMicros()
-    {
-        return totalHistogram.getBuckets(false);
-    }
-
-    public long[] getRecentLatencyHistogramMicros()
-    {
-        return recentHistogram.getBuckets(true);
-    }
-}
diff --git a/src/java/org/apache/cassandra/utils/MergeIterator.java b/src/java/org/apache/cassandra/utils/MergeIterator.java
index 4c89edb..e61326e 100644
--- a/src/java/org/apache/cassandra/utils/MergeIterator.java
+++ b/src/java/org/apache/cassandra/utils/MergeIterator.java
@@ -35,15 +35,17 @@
         this.reducer = reducer;
     }
 
-    public static <In, Out> IMergeIterator<In, Out> get(final List<? extends Iterator<In>> sources,
-                                                    Comparator<In> comparator,
-                                                    final Reducer<In, Out> reducer)
+    public static <In, Out> IMergeIterator<In, Out> get(List<? extends Iterator<In>> sources,
+                                                        Comparator<In> comparator,
+                                                        Reducer<In, Out> reducer)
     {
         if (sources.size() == 1)
+        {
             return reducer.trivialReduceIsTrivial()
-                   ? new TrivialOneToOne<In, Out>(sources, reducer)
-                   : new OneToOne<In, Out>(sources, reducer);
-        return new ManyToOne<In, Out>(sources, comparator, reducer);
+                 ? new TrivialOneToOne<>(sources, reducer)
+                 : new OneToOne<>(sources, reducer);
+        }
+        return new ManyToOne<>(sources, comparator, reducer);
     }
 
     public Iterable<? extends Iterator<In>> iterators()
@@ -80,16 +82,16 @@
         public ManyToOne(List<? extends Iterator<In>> iters, Comparator<In> comp, Reducer<In, Out> reducer)
         {
             super(iters, reducer);
-            this.queue = new PriorityQueue<Candidate<In>>(Math.max(1, iters.size()));
+            this.queue = new PriorityQueue<>(Math.max(1, iters.size()));
             for (Iterator<In> iter : iters)
             {
-                Candidate<In> candidate = new Candidate<In>(iter, comp);
+                Candidate<In> candidate = new Candidate<>(iter, comp);
                 if (!candidate.advance())
                     // was empty
                     continue;
                 this.queue.add(candidate);
             }
-            this.candidates = new ArrayDeque<Candidate<In>>(queue.size());
+            this.candidates = new ArrayDeque<>(queue.size());
         }
 
         protected final Out computeNext()
@@ -174,8 +176,8 @@
         protected abstract Out getReduced();
 
         /**
-         * Called at the begining of each new key, before any reduce is called.
-         * To be overriden by implementing classes.
+         * Called at the beginning of each new key, before any reduce is called.
+         * To be overridden by implementing classes.
          */
         protected void onKeyChange() {}
 
@@ -215,6 +217,7 @@
             source = sources.get(0);
         }
 
+        @SuppressWarnings("unchecked")
         protected Out computeNext()
         {
             if (!source.hasNext())
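
The ManyToOne path touched above merges several sorted sources through a priority queue of per-source cursors. A standalone sketch of that k-way merge shape (illustrative only, not the project's Reducer-based implementation):

// Illustrative sketch only: each source contributes its head element to a priority
// queue; polling the queue and re-advancing the consumed source yields a global merge.
import java.util.*;

public class KWayMergeSketch
{
    public static List<Integer> merge(List<Iterator<Integer>> sources)
    {
        PriorityQueue<int[]> queue = new PriorityQueue<>(Comparator.comparingInt((int[] c) -> c[0])); // [value, sourceIndex]
        for (int i = 0; i < sources.size(); i++)
            if (sources.get(i).hasNext())
                queue.add(new int[]{ sources.get(i).next(), i });

        List<Integer> out = new ArrayList<>();
        while (!queue.isEmpty())
        {
            int[] head = queue.poll();
            out.add(head[0]);
            Iterator<Integer> src = sources.get(head[1]);
            if (src.hasNext())
                queue.add(new int[]{ src.next(), head[1] });     // re-advance the consumed source
        }
        return out;
    }

    public static void main(String[] args)
    {
        List<Iterator<Integer>> sources = Arrays.asList(
            Arrays.asList(1, 4, 7).iterator(),
            Arrays.asList(2, 5, 8).iterator(),
            Arrays.asList(3, 6, 9).iterator());
        System.out.println(merge(sources)); // [1, 2, 3, 4, 5, 6, 7, 8, 9]
    }
}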
diff --git a/src/java/org/apache/cassandra/utils/MerkleTree.java b/src/java/org/apache/cassandra/utils/MerkleTree.java
index ce71ec4..8e6d5c0 100644
--- a/src/java/org/apache/cassandra/utils/MerkleTree.java
+++ b/src/java/org/apache/cassandra/utils/MerkleTree.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.*;
@@ -33,6 +32,7 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
  * A MerkleTree implemented as a binary tree.
@@ -78,7 +78,7 @@
 
     public static class MerkleTreeSerializer implements IVersionedSerializer<MerkleTree>
     {
-        public void serialize(MerkleTree mt, DataOutput out, int version) throws IOException
+        public void serialize(MerkleTree mt, DataOutputPlus out, int version) throws IOException
         {
             out.writeByte(mt.hashdepth);
             out.writeLong(mt.maxsize);
@@ -813,7 +813,7 @@
 
         private static class InnerSerializer implements IVersionedSerializer<Inner>
         {
-            public void serialize(Inner inner, DataOutput out, int version) throws IOException
+            public void serialize(Inner inner, DataOutputPlus out, int version) throws IOException
             {
                 if (inner.hash == null)
                     out.writeInt(-1);
@@ -894,7 +894,7 @@
 
         private static class LeafSerializer implements IVersionedSerializer<Leaf>
         {
-            public void serialize(Leaf leaf, DataOutput out, int version) throws IOException
+            public void serialize(Leaf leaf, DataOutputPlus out, int version) throws IOException
             {
                 if (leaf.hash == null)
                 {
@@ -1035,7 +1035,7 @@
 
         private static class HashableSerializer implements IVersionedSerializer<Hashable>
         {
-            public void serialize(Hashable h, DataOutput out, int version) throws IOException
+            public void serialize(Hashable h, DataOutputPlus out, int version) throws IOException
             {
                 if (h instanceof Inner)
                 {
diff --git a/src/java/org/apache/cassandra/utils/MurmurHash.java b/src/java/org/apache/cassandra/utils/MurmurHash.java
index 9dcde6d..8d17ea9 100644
--- a/src/java/org/apache/cassandra/utils/MurmurHash.java
+++ b/src/java/org/apache/cassandra/utils/MurmurHash.java
@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.utils;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
+
 import java.nio.ByteBuffer;
 
 /**
diff --git a/src/java/org/apache/cassandra/utils/ObjectSizes.java b/src/java/org/apache/cassandra/utils/ObjectSizes.java
index acee919..3720385 100644
--- a/src/java/org/apache/cassandra/utils/ObjectSizes.java
+++ b/src/java/org/apache/cassandra/utils/ObjectSizes.java
@@ -21,229 +21,143 @@
  */
 
 
-import java.lang.management.ManagementFactory;
-import java.lang.management.MemoryPoolMXBean;
 import java.nio.ByteBuffer;
 
+import org.github.jamm.MemoryLayoutSpecification;
 import org.github.jamm.MemoryMeter;
 
 /**
- * Modified version of the code from.
- * https://github.com/twitter/commons/blob/master
- * /src/java/com/twitter/common/objectsize/ObjectSizeCalculator.java
- *
- * Difference is that we don't use reflection.
+ * A convenience class for wrapping access to MemoryMeter
  */
 public class ObjectSizes
 {
-    public static final MemoryLayoutSpecification SPEC = getEffectiveMemoryLayoutSpecification();
-    private static final MemoryMeter meter = new MemoryMeter().omitSharedBufferOverhead();
+    private static final MemoryMeter meter = new MemoryMeter()
+                                             .omitSharedBufferOverhead()
+                                             .withGuessing(MemoryMeter.Guess.FALLBACK_UNSAFE);
 
-    /**
-     * Describes constant memory overheads for various constructs in a JVM
-     * implementation.
-     */
-    public interface MemoryLayoutSpecification
-    {
-        int getArrayHeaderSize();
-
-        int getObjectHeaderSize();
-
-        int getObjectPadding();
-
-        int getReferenceSize();
-
-        int getSuperclassFieldPadding();
-    }
-
-    /**
-     * Memory a class consumes, including the object header and the size of the fields.
-     * @param fieldsSize Total size of the primitive fields of a class
-     * @return Total in-memory size of the class
-     */
-    public static long getFieldSize(long fieldsSize)
-    {
-        return roundTo(SPEC.getObjectHeaderSize() + fieldsSize, SPEC.getObjectPadding());
-    }
-
-    /**
-     * Memory a super class consumes, given the primitive field sizes
-     * @param fieldsSize Total size of the primitive fields of the super class
-     * @return Total additional in-memory that the super class takes up
-     */
-    public static long getSuperClassFieldSize(long fieldsSize)
-    {
-        return roundTo(fieldsSize, SPEC.getSuperclassFieldPadding());
-    }
-
-    /**
-     * Memory an array will consume
-     * @param length Number of elements in the array
-     * @param elementSize In-memory size of each element's primitive stored
-     * @return In-memory size of the array
-     */
-    public static long getArraySize(int length, long elementSize)
-    {
-        return roundTo(SPEC.getArrayHeaderSize() + length * elementSize, SPEC.getObjectPadding());
-    }
+    private static final long BUFFER_EMPTY_SIZE = measure(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+    private static final long STRING_EMPTY_SIZE = measure("");
 
     /**
      * Memory a byte array consumes
      * @param bytes byte array to get memory size
-     * @return In-memory size of the array
+     * @return heap-size of the array
      */
-    public static long getArraySize(byte[] bytes)
+    public static long sizeOfArray(byte[] bytes)
     {
-        return getArraySize(bytes.length, 1);
+        return sizeOfArray(bytes.length, 1);
     }
 
     /**
+     * Memory a long array consumes
+     * @param longs long array to get memory size
+     * @return heap-size of the array
+     */
+    public static long sizeOfArray(long[] longs)
+    {
+        return sizeOfArray(longs.length, 8);
+    }
+
+    /**
+     * Memory an int array consumes
+     * @param ints int array to get memory size
+     * @return heap-size of the array
+     */
+    public static long sizeOfArray(int[] ints)
+    {
+        return sizeOfArray(ints.length, 4);
+    }
+
+    /**
+     * Memory a reference array consumes
+     * @param length the length of the reference array
+     * @return heap-size of the array
+     */
+    public static long sizeOfReferenceArray(int length)
+    {
+        return sizeOfArray(length, MemoryLayoutSpecification.SPEC.getReferenceSize());
+    }
+
+    /**
+     * Memory a reference array itself consumes
+     * @param objects the array to size
+     * @return heap-size of the array (excluding memory retained by referenced objects)
+     */
+    public static long sizeOfArray(Object[] objects)
+    {
+        return sizeOfReferenceArray(objects.length);
+    }
+
+    private static long sizeOfArray(int length, long elementSize)
+    {
+        return MemoryLayoutSpecification.sizeOfArray(length, elementSize);
+    }
+
+    /**
+     * Memory a ByteBuffer array consumes.
+     */
+    public static long sizeOnHeapOf(ByteBuffer[] array)
+    {
+        long allElementsSize = 0;
+        for (int i = 0; i < array.length; i++)
+            if (array[i] != null)
+                allElementsSize += sizeOnHeapOf(array[i]);
+
+        return allElementsSize + sizeOfArray(array);
+    }
+
+    public static long sizeOnHeapExcludingData(ByteBuffer[] array)
+    {
+        return BUFFER_EMPTY_SIZE * array.length + sizeOfArray(array);
+    }
+    /**
      * Memory a byte buffer consumes
      * @param buffer ByteBuffer to calculate in memory size
      * @return Total in-memory size of the byte buffer
      */
-    public static long getSize(ByteBuffer buffer)
+    public static long sizeOnHeapOf(ByteBuffer buffer)
     {
-        long size = 0;
-        /* BB Class */
-        // final byte[] hb;
-        // final int offset;
-        // boolean isReadOnly;
-        size += ObjectSizes.getFieldSize(1L + 4 + ObjectSizes.getReferenceSize() + ObjectSizes.getArraySize(buffer.capacity(), 1));
-        /* Super Class */
-        // private int mark;
-        // private int position;
-        // private int limit;
-        // private int capacity;
-        size += ObjectSizes.getSuperClassFieldSize(4L + 4 + 4 + 4 + 8);
-        return size;
+        if (buffer.isDirect())
+            return BUFFER_EMPTY_SIZE;
+        // if we're only referencing a sub-portion of the ByteBuffer, don't count the array overhead (assume it's slab
+        // allocated, so amortized over all the allocations the overhead is negligible and better to undercount than over)
+        if (buffer.capacity() > buffer.remaining())
+            return buffer.remaining();
+        return BUFFER_EMPTY_SIZE + sizeOfArray(buffer.capacity(), 1);
     }
 
-    public static long roundTo(long x, int multiple)
+    public static long sizeOnHeapExcludingData(ByteBuffer buffer)
     {
-        return ((x + multiple - 1) / multiple) * multiple;
+        return BUFFER_EMPTY_SIZE;
     }
 
     /**
-     * @return Memory a reference consumes on the current architecture.
+     * Memory a String consumes
+     * @param str String to calculate memory size of
+     * @return Total in-memory size of the String
      */
-    public static int getReferenceSize()
+    public static long sizeOf(String str)
     {
-        return SPEC.getReferenceSize();
+        return STRING_EMPTY_SIZE + sizeOfArray(str.length(), 2);
     }
 
-    private static MemoryLayoutSpecification getEffectiveMemoryLayoutSpecification()
-    {
-        final String dataModel = System.getProperty("sun.arch.data.model");
-        if ("32".equals(dataModel))
-        {
-            // Running with 32-bit data model
-            return new MemoryLayoutSpecification()
-            {
-                public int getArrayHeaderSize()
-                {
-                    return 12;
-                }
-
-                public int getObjectHeaderSize()
-                {
-                    return 8;
-                }
-
-                public int getObjectPadding()
-                {
-                    return 8;
-                }
-
-                public int getReferenceSize()
-                {
-                    return 4;
-                }
-
-                public int getSuperclassFieldPadding()
-                {
-                    return 4;
-                }
-            };
-        }
-
-        final String strVmVersion = System.getProperty("java.vm.version");
-        final int vmVersion = Integer.parseInt(strVmVersion.substring(0, strVmVersion.indexOf('.')));
-        if (vmVersion >= 17)
-        {
-            long maxMemory = 0;
-            for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans())
-            {
-                maxMemory += mp.getUsage().getMax();
-            }
-            if (maxMemory < 30L * 1024 * 1024 * 1024)
-            {
-                // HotSpot 17.0 and above use compressed OOPs below 30GB of RAM
-                // total for all memory pools (yes, including code cache).
-                return new MemoryLayoutSpecification()
-                {
-                    public int getArrayHeaderSize()
-                    {
-                        return 16;
-                    }
-
-                    public int getObjectHeaderSize()
-                    {
-                        return 12;
-                    }
-
-                    public int getObjectPadding()
-                    {
-                        return 8;
-                    }
-
-                    public int getReferenceSize()
-                    {
-                        return 4;
-                    }
-
-                    public int getSuperclassFieldPadding()
-                    {
-                        return 4;
-                    }
-                };
-            }
-        }
-
-        /* Worst case we over count. */
-
-        // In other cases, it's a 64-bit uncompressed OOPs object model
-        return new MemoryLayoutSpecification()
-        {
-            public int getArrayHeaderSize()
-            {
-                return 24;
-            }
-
-            public int getObjectHeaderSize()
-            {
-                return 16;
-            }
-
-            public int getObjectPadding()
-            {
-                return 8;
-            }
-
-            public int getReferenceSize()
-            {
-                return 8;
-            }
-
-            public int getSuperclassFieldPadding()
-            {
-                return 8;
-            }
-        };
-    }
-
+    /**
+     * @param pojo the object to measure
+     * @return the size on the heap of the instance and all retained heap referenced by it, excluding portions of
+     * ByteBuffer that are not directly referenced by it but including any other references that may also be retained
+     * by other objects.
+     */
     public static long measureDeep(Object pojo)
     {
         return meter.measureDeep(pojo);
     }
+
+    /**
+     * @param pojo the object to measure
+     * @return the size on the heap of the instance only, excluding any referenced objects
+     */
+    public static long measure(Object pojo)
+    {
+        return meter.measure(pojo);
+    }
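+
+    /*
+     * Illustrative sketch (not part of this patch) of how the accessors above might be used together;
+     * "partition" and "byteBuffer" are hypothetical variables, not names from this codebase:
+     *
+     *   long shallow  = ObjectSizes.measure(partition);        // the instance alone
+     *   long retained = ObjectSizes.measureDeep(partition);    // instance plus referenced heap
+     *   long name     = ObjectSizes.sizeOf("ks.table");        // String object + backing char[]
+     *   long buf      = ObjectSizes.sizeOnHeapOf(byteBuffer);  // buffer overhead + on-heap array, if any
+     */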
 }
diff --git a/src/java/org/apache/cassandra/utils/PureJavaCrc32.java b/src/java/org/apache/cassandra/utils/PureJavaCrc32.java
index da03986..9a1ac02 100644
--- a/src/java/org/apache/cassandra/utils/PureJavaCrc32.java
+++ b/src/java/org/apache/cassandra/utils/PureJavaCrc32.java
Binary files differ
diff --git a/src/java/org/apache/cassandra/utils/ResourceWatcher.java b/src/java/org/apache/cassandra/utils/ResourceWatcher.java
index ac695a7..2dfab95 100644
--- a/src/java/org/apache/cassandra/utils/ResourceWatcher.java
+++ b/src/java/org/apache/cassandra/utils/ResourceWatcher.java
@@ -60,6 +60,7 @@
             }
             catch (Throwable t)
             {
+                JVMStabilityInspector.inspectThrowable(t);
                 logger.error(String.format("Timed run of %s failed.", callback.getClass()), t);
             }
         }
diff --git a/src/java/org/apache/cassandra/utils/SimpleCondition.java b/src/java/org/apache/cassandra/utils/SimpleCondition.java
deleted file mode 100644
index 4d5f896..0000000
--- a/src/java/org/apache/cassandra/utils/SimpleCondition.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.Date;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.locks.Condition;
-
-// fulfils the Condition interface without spurious wakeup problems
-// (or lost notify problems either: that is, even if you call await()
-// _after_ signal(), it will work as desired.)
-public class SimpleCondition implements Condition
-{
-    private boolean set;
-
-    public synchronized void await() throws InterruptedException
-    {
-        while (!set)
-            wait();
-    }
-
-    public synchronized void reset()
-    {
-        set = false;
-    }
-
-    public synchronized boolean await(long time, TimeUnit unit) throws InterruptedException
-    {
-        long start = System.nanoTime();
-        long timeout = unit.toNanos(time);
-        long elapsed;
-        while (!set && (elapsed = System.nanoTime() - start) < timeout)
-        {
-            TimeUnit.NANOSECONDS.timedWait(this, timeout - elapsed);
-        }
-        return set;
-    }
-
-    public void signal()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public synchronized void signalAll()
-    {
-        set = true;
-        notifyAll();
-    }
-
-    public synchronized boolean isSignaled()
-    {
-        return set;
-    }
-
-    public void awaitUninterruptibly()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public long awaitNanos(long nanosTimeout) throws InterruptedException
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean awaitUntil(Date deadline) throws InterruptedException
-    {
-        throw new UnsupportedOperationException();
-    }
-}
diff --git a/src/java/org/apache/cassandra/utils/SkipNullRepresenter.java b/src/java/org/apache/cassandra/utils/SkipNullRepresenter.java
deleted file mode 100644
index 2845bc3..0000000
--- a/src/java/org/apache/cassandra/utils/SkipNullRepresenter.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import org.yaml.snakeyaml.introspector.Property;
-import org.yaml.snakeyaml.nodes.NodeTuple;
-import org.yaml.snakeyaml.nodes.Tag;
-import org.yaml.snakeyaml.representer.Representer;
-
-
-/* used to prevent null values from being included in generated YAML */
-public class SkipNullRepresenter extends Representer
-{
-    protected NodeTuple representJavaBeanProperty(Object javaBean, Property property, Object propertyValue, Tag customTag)
-    {
-        if (propertyValue == null)
-        {
-            return null;
-        }
-        else
-        {
-            return super.representJavaBeanProperty(javaBean, property, propertyValue, customTag);
-        }
-    }
-}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/utils/StatusLogger.java b/src/java/org/apache/cassandra/utils/StatusLogger.java
index 7df94aa..66ff985 100644
--- a/src/java/org/apache/cassandra/utils/StatusLogger.java
+++ b/src/java/org/apache/cassandra/utils/StatusLogger.java
@@ -19,8 +19,6 @@
 
 import java.lang.management.ManagementFactory;
 import java.util.Set;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.ThreadPoolExecutor;
 import javax.management.JMX;
 import javax.management.MBeanServer;
 import javax.management.MalformedObjectNameException;
@@ -36,9 +34,7 @@
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutorMBean;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.CacheService;
@@ -78,8 +74,6 @@
         // one offs
         logger.info(String.format("%-25s%10s%10s",
                                   "CompactionManager", CompactionManager.instance.getActiveCompactions(), CompactionManager.instance.getPendingTasks()));
-        logger.info(String.format("%-25s%10s%10s",
-                                  "Commitlog", "n/a", CommitLog.instance.getPendingTasks()));
         int pendingCommands = 0;
         for (int n : MessagingService.instance().getCommandPendingTasks().values())
         {
diff --git a/src/java/org/apache/cassandra/utils/StreamingHistogram.java b/src/java/org/apache/cassandra/utils/StreamingHistogram.java
index c4ba956..3f5a715 100644
--- a/src/java/org/apache/cassandra/utils/StreamingHistogram.java
+++ b/src/java/org/apache/cassandra/utils/StreamingHistogram.java
@@ -17,15 +17,16 @@
  */
 package org.apache.cassandra.utils;
 
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.io.ISerializer;
-
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.*;
+
 import com.google.common.base.Objects;
 
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
 /**
  * Histogram that can be constructed from streaming of data.
  *
@@ -50,13 +51,13 @@
     public StreamingHistogram(int maxBinSize)
     {
         this.maxBinSize = maxBinSize;
-        bin = new TreeMap<Double, Long>();
+        bin = new TreeMap<>();
     }
 
     private StreamingHistogram(int maxBinSize, Map<Double, Long> bin)
     {
         this.maxBinSize = maxBinSize;
-        this.bin = new TreeMap<Double, Long>(bin);
+        this.bin = new TreeMap<>(bin);
     }
 
     /**
@@ -169,7 +170,7 @@
 
     public static class StreamingHistogramSerializer implements ISerializer<StreamingHistogram>
     {
-        public void serialize(StreamingHistogram histogram, DataOutput out) throws IOException
+        public void serialize(StreamingHistogram histogram, DataOutputPlus out) throws IOException
         {
             out.writeInt(histogram.maxBinSize);
             Map<Double, Long> entries = histogram.getAsMap();
@@ -185,7 +186,7 @@
         {
             int maxBinSize = in.readInt();
             int size = in.readInt();
-            Map<Double, Long> tmp = new HashMap<Double, Long>(size);
+            Map<Double, Long> tmp = new HashMap<>(size);
             for (int i = 0; i < size; i++)
             {
                 tmp.put(in.readDouble(), in.readLong());
@@ -196,7 +197,12 @@
 
         public long serializedSize(StreamingHistogram histogram, TypeSizes typeSizes)
         {
-            throw new UnsupportedOperationException();
+            long size = typeSizes.sizeof(histogram.maxBinSize);
+            Map<Double, Long> entries = histogram.getAsMap();
+            size += typeSizes.sizeof(entries.size());
+            // size of entries = size * (8(double) + 8(long))
+            size += entries.size() * (8 + 8);
+            return size;
         }
     }
 
diff --git a/src/java/org/apache/cassandra/utils/UUIDGen.java b/src/java/org/apache/cassandra/utils/UUIDGen.java
index f385744..53293b2 100644
--- a/src/java/org/apache/cassandra/utils/UUIDGen.java
+++ b/src/java/org/apache/cassandra/utils/UUIDGen.java
@@ -25,6 +25,8 @@
 import java.util.Random;
 import java.util.UUID;
 
+import com.google.common.annotations.VisibleForTesting;
+
 
 /**
  * The goods are here: www.ietf.org/rfc/rfc4122.txt.
@@ -80,6 +82,12 @@
         return new UUID(createTime(fromUnixTimestamp(when)), clockSeqAndNode);
     }
 
+    @VisibleForTesting
+    public static UUID getTimeUUID(long when, long clockSeqAndNode)
+    {
+        return new UUID(createTime(fromUnixTimestamp(when)), clockSeqAndNode);
+    }
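+
+    // Illustrative note (not part of this patch): the overload above presumably lets tests build
+    // reproducible time-UUIDs from a fixed timestamp and clock-sequence/node value, e.g.
+    //   UUID u = UUIDGen.getTimeUUID(1400000000000L, 0L);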
+
     /** creates a type 1 uuid from raw bytes. */
     public static UUID getUUID(ByteBuffer raw)
     {
diff --git a/src/java/org/apache/cassandra/utils/UUIDSerializer.java b/src/java/org/apache/cassandra/utils/UUIDSerializer.java
index afaed92..2aa2b4e 100644
--- a/src/java/org/apache/cassandra/utils/UUIDSerializer.java
+++ b/src/java/org/apache/cassandra/utils/UUIDSerializer.java
@@ -18,18 +18,18 @@
 package org.apache.cassandra.utils;
 
 import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.util.UUID;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class UUIDSerializer implements IVersionedSerializer<UUID>
 {
     public static UUIDSerializer serializer = new UUIDSerializer();
 
-    public void serialize(UUID uuid, DataOutput out, int version) throws IOException
+    public void serialize(UUID uuid, DataOutputPlus out, int version) throws IOException
     {
         out.writeLong(uuid.getMostSignificantBits());
         out.writeLong(uuid.getLeastSignificantBits());
diff --git a/src/java/org/apache/cassandra/utils/btree/BTree.java b/src/java/org/apache/cassandra/utils/btree/BTree.java
new file mode 100644
index 0000000..1145d12
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/BTree.java
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Queue;
+
+import org.apache.cassandra.utils.ObjectSizes;
+
+import static org.apache.cassandra.utils.btree.UpdateFunction.NoOp;
+
+public class BTree
+{
+    /**
+     * Leaf Nodes are a raw array of values: Object[V1, V2, ...].
+     *
+     * Branch Nodes: Object[V1, V2, ..., child[&lt;V1.key], child[&lt;V2.key], ..., child[&lt; Inf]], where
+     * each child is another node, i.e., an Object[].  Thus, the value elements in a branch node are the
+     * first half of the array, rounding down.  In our implementation, each value must include its own key;
+     * we access these via Comparator, rather than directly. 
+     *
+     * So we can quickly distinguish between leaves and branches, we require that leaf nodes always have an even
+     * number of elements (padded with a null, if necessary), and branches always have an odd number of elements.
+     *
+     * BTrees are immutable; updating one returns a new tree that reuses unmodified nodes.
+     *
+     * There are no references back to a parent node from its children.  (This would make it impossible to re-use
+     * subtrees when modifying the tree, since the modified tree would need new parent references.)
+     * Instead, we store these references in a Path as needed when navigating the tree.
+     */
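+
+    // Illustrative sketch (not part of this patch): with a fan factor of 4, a tree holding the
+    // integers 1..6 could be laid out as the branch [3, [1, 2], [4, 5, 6, null]] -- the branch has
+    // odd length (one key plus two children), and each child leaf is padded to an even length.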
+
+    // The maximum fan factor used for B-Trees
+    static final int FAN_SHIFT;
+    static
+    {
+        int fanfactor = 32;
+        if (System.getProperty("cassandra.btree.fanfactor") != null)
+            fanfactor = Integer.parseInt(System.getProperty("cassandra.btree.fanfactor"));
+        int shift = 1;
+        while (1 << shift < fanfactor)
+            shift += 1;
+        FAN_SHIFT = shift;
+    }
+    // NB we encode Path indexes as Bytes, so this needs to be less than Byte.MAX_VALUE / 2
+    static final int FAN_FACTOR = 1 << FAN_SHIFT;
+
+    // An empty BTree Leaf - which is the same as an empty BTree
+    static final Object[] EMPTY_LEAF = new Object[0];
+
+    // An empty BTree branch - used only for internal purposes in Modifier
+    static final Object[] EMPTY_BRANCH = new Object[1];
+
+    /**
+     * Returns an empty BTree
+     *
+     * @return the shared empty leaf node, which doubles as an empty tree
+     */
+    public static Object[] empty()
+    {
+        return EMPTY_LEAF;
+    }
+
+    public static <V> Object[] build(Collection<V> source, Comparator<V> comparator, boolean sorted, UpdateFunction<V> updateF)
+    {
+        return build(source, source.size(), comparator, sorted, updateF);
+    }
+
+    /**
+     * Creates a BTree containing all of the objects in the provided collection
+     *
+     * @param source     the items to build the tree with
+     * @param comparator the comparator that defines the ordering over the items in the tree
+     * @param sorted     if false, the collection will be copied and sorted to facilitate construction
+     * @param size       the number of items in source
+     * @param updateF    the update function to apply to each item added to the tree
+     * @param <V>        the type of the items
+     * @return           a new BTree containing all of the items in source
+     */
+    public static <V> Object[] build(Iterable<V> source, int size, Comparator<V> comparator, boolean sorted, UpdateFunction<V> updateF)
+    {
+        if (size < FAN_FACTOR)
+        {
+            // pad to even length to match contract that all leaf nodes are even
+            V[] values = (V[]) new Object[size + (size & 1)];
+            {
+                int i = 0;
+                for (V v : source)
+                    values[i++] = v;
+            }
+
+            // sort in place, since we have just copied the source into an array anyway
+            if (!sorted)
+                Arrays.sort(values, 0, size, comparator);
+
+            // if updateF is specified
+            if (updateF != null)
+            {
+                for (int i = 0 ; i < size ; i++)
+                    values[i] = updateF.apply(values[i]);
+                updateF.allocated(ObjectSizes.sizeOfArray(values));
+            }
+            return values;
+        }
+
+        if (!sorted)
+            source = sorted(source, comparator, size);
+
+        Queue<Builder> queue = modifier.get();
+        Builder builder = queue.poll();
+        if (builder == null)
+            builder = new Builder();
+        Object[] btree = builder.build(source, updateF, size);
+        queue.add(builder);
+        return btree;
+    }
+
+    /**
+     * Returns a new BTree containing the contents of the provided tree plus the provided items, replacing any equal items already present
+     *
+     * @param btree              the tree to update
+     * @param comparator         the comparator that defines the ordering over the items in the tree
+     * @param updateWith         the items to either insert / update
+     * @param updateWithIsSorted if false, updateWith will be copied and sorted to facilitate construction
+     * @param <V>               the type of the items
+     * @return                  a new tree containing the result of the update; the input tree is unchanged
+     */
+    public static <V> Object[] update(Object[] btree, Comparator<V> comparator, Collection<V> updateWith, boolean updateWithIsSorted)
+    {
+        return update(btree, comparator, updateWith, updateWithIsSorted, NoOp.<V>instance());
+    }
+
+    public static <V> Object[] update(Object[] btree,
+                                      Comparator<V> comparator,
+                                      Collection<V> updateWith,
+                                      boolean updateWithIsSorted,
+                                      UpdateFunction<V> updateF)
+    {
+        return update(btree, comparator, updateWith, updateWith.size(), updateWithIsSorted, updateF);
+    }
+
+    /**
+     * Returns a new BTree containing the contents of the provided tree plus the provided items, replacing any equal items already present
+     *
+     * @param btree              the tree to update
+     * @param comparator         the comparator that defines the ordering over the items in the tree
+     * @param updateWith         the items to either insert / update
+     * @param updateWithIsSorted if false, updateWith will be copied and sorted to facilitate construction
+     * @param updateF            the update function to apply to any pairs we are swapping, and maybe abort early
+     * @param updateWithLength  the number of items in updateWith
+     * @param <V>               the type of the items
+     * @return                  a new tree containing the result of the update; the input tree is unchanged
+     */
+    public static <V> Object[] update(Object[] btree,
+                                      Comparator<V> comparator,
+                                      Iterable<V> updateWith,
+                                      int updateWithLength,
+                                      boolean updateWithIsSorted,
+                                      UpdateFunction<V> updateF)
+    {
+        if (btree.length == 0)
+            return build(updateWith, updateWithLength, comparator, updateWithIsSorted, updateF);
+
+        if (!updateWithIsSorted)
+            updateWith = sorted(updateWith, comparator, updateWithLength);
+
+        Queue<Builder> queue = modifier.get();
+        Builder builder = queue.poll();
+        if (builder == null)
+            builder = new Builder();
+        btree = builder.update(btree, comparator, updateWith, updateF);
+        queue.add(builder);
+        return btree;
+    }
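+
+    /*
+     * Illustrative sketch (not part of this patch): because update() copies only the nodes on the
+     * modified paths, the input tree remains a valid snapshot that can still be read or sliced.
+     * "cmp" stands in for any suitable Comparator<Integer>:
+     *
+     *   Object[] v1 = BTree.build(Arrays.asList(1, 2, 3), cmp, true, UpdateFunction.NoOp.<Integer>instance());
+     *   Object[] v2 = BTree.update(v1, cmp, Arrays.asList(4, 5), true);
+     *   // v1 still contains exactly {1, 2, 3}; v2 contains {1, 2, 3, 4, 5}; unmodified nodes are shared
+     */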
+
+    /**
+     * Returns an Iterator over the entire tree
+     *
+     * @param btree    the tree to iterate over
+     * @param forwards if false, the iterator will start at the end and move backwards
+     * @param <V>      the type of the items
+     * @return         a Cursor over every item in the tree
+     */
+    public static <V> Cursor<V, V> slice(Object[] btree, boolean forwards)
+    {
+        Cursor<V, V> r = new Cursor<>();
+        r.reset(btree, forwards);
+        return r;
+    }
+
+    /**
+     * Returns an Iterator over a sub-range of the tree
+     *
+     * @param btree      the tree to iterate over
+     * @param comparator the comparator that defines the ordering over the items in the tree
+     * @param start      the first item to include, inclusive
+     * @param end        the upper bound of the range, exclusive
+     * @param forwards   if false, the iterator will start at the end of the range and move backwards
+     * @param <V>        the type of the items
+     * @return           a Cursor over the items in the requested range
+     */
+    public static <K, V extends K> Cursor<K, V> slice(Object[] btree, Comparator<K> comparator, K start, K end, boolean forwards)
+    {
+        Cursor<K, V> r = new Cursor<>();
+        r.reset(btree, comparator, start, end, forwards);
+        return r;
+    }
+
+    /**
+     * Returns an Iterator over a sub-range of the tree
+     *
+     * @param btree      the tree to iterate over
+     * @param comparator the comparator that defines the ordering over the items in the tree
+     * @param start          the lower bound of the range
+     * @param startInclusive whether start itself is included, if present in the tree
+     * @param end            the upper bound of the range
+     * @param endInclusive   whether end itself is included, if present in the tree
+     * @param forwards       if false, the iterator will start at the end of the range and move backwards
+     * @param <V>            the type of the items
+     * @return               a Cursor over the items in the requested range
+     */
+    public static <K, V extends K> Cursor<K, V> slice(Object[] btree, Comparator<K> comparator, K start, boolean startInclusive, K end, boolean endInclusive, boolean forwards)
+    {
+        Cursor<K, V> r = new Cursor<>();
+        r.reset(btree, comparator, start, startInclusive, end, endInclusive, forwards);
+        return r;
+    }
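+
+    /*
+     * Illustrative sketch (not part of this patch): iterating a sub-range in ascending order, where
+     * "cmp" is any suitable Comparator<Integer> and consume() a hypothetical callback:
+     *
+     *   Cursor<Integer, Integer> c = BTree.slice(btree, cmp, 10, 20, true);  // [10, 20), ascending
+     *   while (c.hasNext())
+     *       consume(c.next());
+     */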
+
+    public static <V> V find(Object[] node, Comparator<V> comparator, V find)
+    {
+        while (true)
+        {
+            int keyEnd = getKeyEnd(node);
+            int i = BTree.find(comparator, find, node, 0, keyEnd);
+            if (i >= 0)
+            {
+                return (V) node[i];
+            }
+            else if (!isLeaf(node))
+            {
+                i = -i - 1;
+                node = (Object[]) node[keyEnd + i];
+            }
+            else
+            {
+                return null;
+            }
+        }
+    }
+
+
+    // UTILITY METHODS
+
+    // same basic semantics as Arrays.binarySearch, but delegates to compare() method to avoid
+    // wrapping generic Comparator with support for Special +/- infinity sentinels
+    static <V> int find(Comparator<V> comparator, Object key, Object[] a, final int fromIndex, final int toIndex)
+    {
+        int low = fromIndex;
+        int high = toIndex - 1;
+
+        while (low <= high)
+        {
+            int mid = (low + high) / 2;
+            int cmp = comparator.compare((V) key, (V) a[mid]);
+
+            if (cmp > 0)
+                low = mid + 1;
+            else if (cmp < 0)
+                high = mid - 1;
+            else
+                return mid; // key found
+        }
+        return -(low + 1);  // key not found.
+    }
+
+    // get the upper bound we should search in for keys in the node
+    static int getKeyEnd(Object[] node)
+    {
+        if (isLeaf(node))
+            return getLeafKeyEnd(node);
+        else
+            return getBranchKeyEnd(node);
+    }
+
+    // get the last index that is non-null in the leaf node
+    static int getLeafKeyEnd(Object[] node)
+    {
+        int len = node.length;
+        if (len == 0)
+            return 0;
+        else if (node[len - 1] == null)
+            return len - 1;
+        else
+            return len;
+    }
+
+    // return the boundary position between keys/children for the branch node
+    static int getBranchKeyEnd(Object[] node)
+    {
+        return node.length / 2;
+    }
+
+    // returns true if the provided node is a leaf, false if it is a branch
+    static boolean isLeaf(Object[] node)
+    {
+        return (node.length & 1) == 0;
+    }
+
+    public static boolean isEmpty(Object[] tree)
+    {
+        return tree.length == 0;
+    }
+
+    public static int depth(Object[] tree)
+    {
+        int depth = 1;
+        while (!isLeaf(tree))
+        {
+            depth++;
+            tree = (Object[]) tree[getKeyEnd(tree)];
+        }
+        return depth;
+    }
+
+    // Special class for making certain operations easier, so we can define a +/- Inf
+    static interface Special extends Comparable<Object> { }
+    static final Special POSITIVE_INFINITY = new Special()
+    {
+        public int compareTo(Object o)
+        {
+            return o == this ? 0 : 1;
+        }
+    };
+    static final Special NEGATIVE_INFINITY = new Special()
+    {
+        public int compareTo(Object o)
+        {
+            return o == this ? 0 : -1;
+        }
+    };
+
+    private static final ThreadLocal<Queue<Builder>> modifier = new ThreadLocal<Queue<Builder>>()
+    {
+        @Override
+        protected Queue<Builder> initialValue()
+        {
+            return new ArrayDeque<>();
+        }
+    };
+
+    // return a sorted collection
+    private static <V> Collection<V> sorted(Iterable<V> source, Comparator<V> comparator, int size)
+    {
+        V[] vs = (V[]) new Object[size];
+        int i = 0;
+        for (V v : source)
+            vs[i++] = v;
+        Arrays.sort(vs, comparator);
+        return Arrays.asList(vs);
+    }
+
+    /** simple static wrapper to calls to cmp.compare() which checks if either a or b are Special (i.e. represent an infinity) */
+    // TODO : cheaper to check for POSITIVE/NEGATIVE infinity in callers, rather than here
+    static <V> int compare(Comparator<V> cmp, Object a, Object b)
+    {
+        if (a instanceof Special)
+            return ((Special) a).compareTo(b);
+        if (b instanceof Special)
+            return -((Special) b).compareTo(a);
+        return cmp.compare((V) a, (V) b);
+    }
+
+    public static boolean isWellFormed(Object[] btree, Comparator<? extends Object> cmp)
+    {
+        return isWellFormed(cmp, btree, true, NEGATIVE_INFINITY, POSITIVE_INFINITY);
+    }
+
+    private static boolean isWellFormed(Comparator<?> cmp, Object[] node, boolean isRoot, Object min, Object max)
+    {
+        if (cmp != null && !isNodeWellFormed(cmp, node, min, max))
+            return false;
+
+        if (isLeaf(node))
+        {
+            if (isRoot)
+                return node.length <= FAN_FACTOR;
+            return node.length >= FAN_FACTOR / 2 && node.length <= FAN_FACTOR;
+        }
+
+        int type = 0;
+        int childOffset = getBranchKeyEnd(node);
+        // compare each child node with the branch element at the head of this node it corresponds with
+        for (int i = childOffset; i < node.length; i++)
+        {
+            Object[] child = (Object[]) node[i];
+            Object localmax = i < node.length - 1 ? node[i - childOffset] : max;
+            if (!isWellFormed(cmp, child, false, min, localmax))
+                return false;
+            type |= isLeaf(child) ? 1 : 2;
+            min = localmax;
+        }
+        return type < 3; // either all leaves or all branches but not a mix
+    }
+
+    private static boolean isNodeWellFormed(Comparator<?> cmp, Object[] node, Object min, Object max)
+    {
+        Object previous = min;
+        int end = getKeyEnd(node);
+        for (int i = 0; i < end; i++)
+        {
+            Object current = node[i];
+            if (compare(cmp, previous, current) >= 0)
+                return false;
+            previous = current;
+        }
+        return compare(cmp, previous, max) < 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeSet.java b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java
new file mode 100644
index 0000000..d80b32e
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.NavigableSet;
+import java.util.SortedSet;
+
+public class BTreeSet<V> implements NavigableSet<V>
+{
+    protected final Comparator<V> comparator;
+    protected final Object[] tree;
+
+    public BTreeSet(Object[] tree, Comparator<V> comparator)
+    {
+        this.tree = tree;
+        this.comparator = comparator;
+    }
+
+    public BTreeSet<V> update(Collection<V> updateWith, boolean isSorted)
+    {
+        return new BTreeSet<>(BTree.update(tree, comparator, updateWith, isSorted, UpdateFunction.NoOp.<V>instance()), comparator);
+    }
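+
+    /*
+     * Illustrative sketch (not part of this patch): exposing a tree through the standard NavigableSet
+     * API, where "items" is a hypothetical sorted Collection<Integer> and "cmp" a matching Comparator:
+     *
+     *   Object[] tree = BTree.build(items, cmp, true, UpdateFunction.NoOp.<Integer>instance());
+     *   BTreeSet<Integer> set = new BTreeSet<>(tree, cmp);
+     *   for (Integer i : set.headSet(100, false))   // iterates the values below 100
+     *       ...
+     */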
+
+    @Override
+    public Comparator<? super V> comparator()
+    {
+        return comparator;
+    }
+
+    protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+    {
+        return BTree.slice(tree, forwards);
+    }
+
+    @Override
+    public int size()
+    {
+        return slice(true, false).count();
+    }
+
+    @Override
+    public boolean isEmpty()
+    {
+        return !slice(true, false).hasNext();
+    }
+
+    @Override
+    public Iterator<V> iterator()
+    {
+        return slice(true, true);
+    }
+
+    @Override
+    public Iterator<V> descendingIterator()
+    {
+        return slice(false, true);
+    }
+
+    @Override
+    public Object[] toArray()
+    {
+        return toArray(new Object[0]);
+    }
+
+    @Override
+    public <T> T[] toArray(T[] a)
+    {
+        int size = size();
+        if (a.length < size)
+            a = Arrays.copyOf(a, size);
+        int i = 0;
+        for (V v : this)
+            a[i++] = (T) v;
+        return a;
+    }
+
+    @Override
+    public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+    {
+        return new BTreeRange<>(tree, comparator, fromElement, fromInclusive, toElement, toInclusive);
+    }
+
+    @Override
+    public NavigableSet<V> headSet(V toElement, boolean inclusive)
+    {
+        return new BTreeRange<>(tree, comparator, null, true, toElement, inclusive);
+    }
+
+    @Override
+    public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+    {
+        return new BTreeRange<>(tree, comparator, fromElement, inclusive, null, true);
+    }
+
+    @Override
+    public SortedSet<V> subSet(V fromElement, V toElement)
+    {
+        return subSet(fromElement, true, toElement, false);
+    }
+
+    @Override
+    public SortedSet<V> headSet(V toElement)
+    {
+        return headSet(toElement, false);
+    }
+
+    @Override
+    public SortedSet<V> tailSet(V fromElement)
+    {
+        return tailSet(fromElement, true);
+    }
+
+    @Override
+    public V first()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V last()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean addAll(Collection<? extends V> c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean retainAll(Collection<?> c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean removeAll(Collection<?> c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void clear()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V pollFirst()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V pollLast()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean add(V v)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean remove(Object o)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V lower(V v)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V floor(V v)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V ceiling(V v)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V higher(V v)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean contains(Object o)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean containsAll(Collection<?> c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public NavigableSet<V> descendingSet()
+    {
+        return new BTreeRange<>(this.tree, this.comparator).descendingSet();
+    }
+
+    public static class BTreeRange<V> extends BTreeSet<V> implements NavigableSet<V>
+    {
+
+        protected final V lowerBound, upperBound;
+        protected final boolean inclusiveLowerBound, inclusiveUpperBound;
+
+        BTreeRange(Object[] tree, Comparator<V> comparator)
+        {
+            this(tree, comparator, null, true, null, true);
+        }
+
+        BTreeRange(BTreeRange<V> from)
+        {
+            this(from.tree, from.comparator, from.lowerBound, from.inclusiveLowerBound, from.upperBound, from.inclusiveUpperBound);
+        }
+
+        BTreeRange(Object[] tree, Comparator<V> comparator, V lowerBound, boolean inclusiveLowerBound, V upperBound, boolean inclusiveUpperBound)
+        {
+            super(tree, comparator);
+            this.lowerBound = lowerBound;
+            this.upperBound = upperBound;
+            this.inclusiveLowerBound = inclusiveLowerBound;
+            this.inclusiveUpperBound = inclusiveUpperBound;
+        }
+
+        // narrowing range constructor - makes this the intersection of the two ranges, which must be over the same tree
+        BTreeRange(BTreeRange<V> a, BTreeRange<V> b)
+        {
+            super(a.tree, a.comparator);
+            assert a.tree == b.tree;
+            final BTreeRange<V> lb, ub;
+
+            if (a.lowerBound == null)
+            {
+                lb = b;
+            }
+            else if (b.lowerBound == null)
+            {
+                lb = a;
+            }
+            else
+            {
+                int c = comparator.compare(a.lowerBound, b.lowerBound);
+                if (c < 0)
+                    lb = b;
+                else if (c > 0)
+                    lb = a;
+                else if (!a.inclusiveLowerBound)
+                    lb = a;
+                else
+                    lb = b;
+            }
+
+            if (a.upperBound == null)
+            {
+                ub = b;
+            }
+            else if (b.upperBound == null)
+            {
+                ub = a;
+            }
+            else
+            {
+                int c = comparator.compare(b.upperBound, a.upperBound);
+                if (c < 0)
+                    ub = b;
+                else if (c > 0)
+                    ub = a;
+                else if (!a.inclusiveUpperBound)
+                    ub = a;
+                else
+                    ub = b;
+            }
+
+            lowerBound = lb.lowerBound;
+            inclusiveLowerBound = lb.inclusiveLowerBound;
+            upperBound = ub.upperBound;
+            inclusiveUpperBound = ub.inclusiveUpperBound;
+        }
+
+        @Override
+        protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+        {
+            return BTree.slice(tree, comparator, lowerBound, inclusiveLowerBound, upperBound, inclusiveUpperBound, forwards);
+        }
+
+        @Override
+        public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+        {
+            return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, fromElement, fromInclusive, toElement, toInclusive));
+        }
+
+        @Override
+        public NavigableSet<V> headSet(V toElement, boolean inclusive)
+        {
+            return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, lowerBound, true, toElement, inclusive));
+        }
+
+        @Override
+        public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+        {
+            return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, fromElement, inclusive, null, true));
+        }
+
+        @Override
+        public NavigableSet<V> descendingSet()
+        {
+            return new BTreeDescRange<>(this);
+        }
+    }
+
+    public static class BTreeDescRange<V> extends BTreeRange<V>
+    {
+        BTreeDescRange(BTreeRange<V> from)
+        {
+            super(from.tree, from.comparator, from.lowerBound, from.inclusiveLowerBound, from.upperBound, from.inclusiveUpperBound);
+        }
+
+        @Override
+        protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+        {
+            return super.slice(permitInversion ? !forwards : forwards, false);
+        }
+
+        @Override
+        public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+        {
+            return super.subSet(toElement, toInclusive, fromElement, fromInclusive).descendingSet();
+        }
+
+        @Override
+        public NavigableSet<V> headSet(V toElement, boolean inclusive)
+        {
+            return super.tailSet(toElement, inclusive).descendingSet();
+        }
+
+        @Override
+        public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+        {
+            return super.headSet(fromElement, inclusive).descendingSet();
+        }
+
+        @Override
+        public NavigableSet<V> descendingSet()
+        {
+            return new BTreeRange<>(this);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/btree/Builder.java b/src/java/org/apache/cassandra/utils/btree/Builder.java
new file mode 100644
index 0000000..f6677d4
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/Builder.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Collection;
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.EMPTY_LEAF;
+import static org.apache.cassandra.utils.btree.BTree.FAN_SHIFT;
+import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
+
+/**
+ * A class for constructing a new BTree, either from an existing one and some set of modifications
+ * or a new tree from a sorted collection of items.
+ * <p/>
+ * This is a fairly heavy-weight object, so a ThreadLocal instance is created for making modifications to a tree
+ */
+final class Builder
+{
+    private final NodeBuilder rootBuilder = new NodeBuilder();
+
+    /**
+     * At the highest level, we adhere to the classic b-tree insertion algorithm:
+     *
+     * 1. Add to the appropriate leaf
+     * 2. Split the leaf if necessary, add the median to the parent
+     * 3. Split the parent if necessary, etc.
+     *
+     * There is one important difference: we don't actually modify the original tree, but copy each node that we
+     * modify.  Note that every node on the path to the key being inserted or updated will be modified; this
+     * implies that at a minimum, the root node will be modified for every update, so every root is a "snapshot"
+     * of a tree that can be iterated or sliced without fear of concurrent modifications.
+     *
+     * The NodeBuilder class handles the details of buffering the copied contents of the original tree and
+     * adding in our changes.  Since NodeBuilder maintains parent/child references, it also handles parent-splitting
+     * (easy enough, since any node affected by the split will already be copied into a NodeBuilder).
+     *
+     * One other difference from the simple algorithm is that we perform modifications in bulk;
+     * we assume @param source has been sorted, e.g. by BTree.update, so the update of each key resumes where
+     * the previous left off.
+     */
+    public <V> Object[] update(Object[] btree, Comparator<V> comparator, Iterable<V> source, UpdateFunction<V> updateF)
+    {
+        assert updateF != null;
+
+        NodeBuilder current = rootBuilder;
+        current.reset(btree, POSITIVE_INFINITY, updateF, comparator);
+
+        for (V key : source)
+        {
+            while (true)
+            {
+                if (updateF.abortEarly())
+                {
+                    rootBuilder.clear();
+                    return null;
+                }
+                NodeBuilder next = current.update(key);
+                if (next == null)
+                    break;
+                // we were in a subtree from a previous key that didn't contain this new key;
+                // retry against the correct subtree
+                current = next;
+            }
+        }
+
+        // finish copying any remaining keys from the original btree
+        while (true)
+        {
+            NodeBuilder next = current.finish();
+            if (next == null)
+                break;
+            current = next;
+        }
+
+        // updating with POSITIVE_INFINITY means that current should be back to the root
+        assert current.isRoot();
+
+        Object[] r = current.toNode();
+        current.clear();
+        return r;
+    }
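+
+    /*
+     * Illustrative sketch (not part of this patch): because the keys arrive sorted, the builder never
+     * restarts from the root.  Updating a tree holding 1..100 with {5, 6, 90}: after 5 is placed, the
+     * search for 6 resumes in the same NodeBuilder; only when 90 falls outside the range owned by the
+     * current node does update() hand control back so the walk can ascend and then descend into the
+     * correct subtree.
+     */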
+
+    public <V> Object[] build(Iterable<V> source, UpdateFunction<V> updateF, int size)
+    {
+        assert updateF != null;
+
+        NodeBuilder current = rootBuilder;
+        // we descend only to avoid wasting memory; in update() we will often descend into existing trees
+        // so here we want to descend also, so we don't have lg max(N) depth in both directions
+        while ((size >>= FAN_SHIFT) > 0)
+            current = current.ensureChild();
+
+        current.reset(EMPTY_LEAF, POSITIVE_INFINITY, updateF, null);
+        for (V key : source)
+            current.addNewKey(key);
+
+        current = current.ascendToRoot();
+
+        Object[] r = current.toNode();
+        current.clear();
+        return r;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/btree/Cursor.java b/src/java/org/apache/cassandra/utils/btree/Cursor.java
new file mode 100644
index 0000000..02e047a
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/Cursor.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Comparator;
+import java.util.Iterator;
+
+import static org.apache.cassandra.utils.btree.BTree.NEGATIVE_INFINITY;
+import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
+import static org.apache.cassandra.utils.btree.BTree.getLeafKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.isLeaf;
+
+/**
+ * An extension of Path which provides a public interface for iterating over or counting a subrange of the tree
+ *
+ * @param <V>
+ */
+public final class Cursor<K, V extends K> extends Path implements Iterator<V>
+{
+    /*
+     * Conceptually, a Cursor derives two Paths, one for the first object in the slice requested (inclusive),
+     * and one for the last (exclusive).  Then hasNext just checks, have we reached the last yet, and next
+     * calls successor() to get to the next item in the Tree.
+     *
+     * To optimize memory use, we summarize the last Path as just endNode/endIndex, and inherit from Path for
+     * the first one.
+     */
+
+    // the last node covered by the requested range
+    private Object[] endNode;
+    // the index within endNode that signals we're finished -- that is, endNode[endIndex] is NOT part of the Cursor
+    private byte endIndex;
+
+    private boolean forwards;
+
+    /**
+     * Reset this cursor for the provided tree, to iterate over its entire range
+     *
+     * @param btree    the tree to iterate over
+     * @param forwards if false, the cursor will start at the end and move backwards
+     */
+    public void reset(Object[] btree, boolean forwards)
+    {
+        _reset(btree, null, NEGATIVE_INFINITY, false, POSITIVE_INFINITY, false, forwards);
+    }
+
+    /**
+     * Reset this cursor for the provided tree, to iterate between the provided start and end
+     *
+     * @param btree      the tree to iterate over
+     * @param comparator the comparator that defines the ordering over the items in the tree
+     * @param lowerBound the first item to include, inclusive
+     * @param upperBound the upper bound of the iteration, exclusive
+     * @param forwards   if false, the cursor will start at the end and move backwards
+     */
+    public void reset(Object[] btree, Comparator<K> comparator, K lowerBound, K upperBound, boolean forwards)
+    {
+        _reset(btree, comparator, lowerBound, true, upperBound, false, forwards);
+    }
+
+    /**
+     * Reset this cursor for the provided tree, to iterate between the provided start and end
+     *
+     * @param btree               the tree to iterate over
+     * @param comparator          the comparator that defines the ordering over the items in the tree
+     * @param lowerBound          the lower bound of the iteration
+     * @param inclusiveLowerBound whether lowerBound itself should be included, if present in the tree
+     * @param upperBound          the upper bound of the iteration
+     * @param inclusiveUpperBound whether upperBound itself should be included, if present in the tree
+     * @param forwards            if false, the cursor will start at the end and move backwards
+     */
+    public void reset(Object[] btree, Comparator<K> comparator, K lowerBound, boolean inclusiveLowerBound, K upperBound, boolean inclusiveUpperBound, boolean forwards)
+    {
+        _reset(btree, comparator, lowerBound, inclusiveLowerBound, upperBound, inclusiveUpperBound, forwards);
+    }
+
+    private void _reset(Object[] btree, Comparator<K> comparator, Object lowerBound, boolean inclusiveLowerBound, Object upperBound, boolean inclusiveUpperBound, boolean forwards)
+    {
+        ensureDepth(btree);
+        if (lowerBound == null)
+            lowerBound = NEGATIVE_INFINITY;
+        if (upperBound == null)
+            upperBound = POSITIVE_INFINITY;
+
+        this.forwards = forwards;
+
+        Path findLast = new Path(this.path.length);
+        if (forwards)
+        {
+            findLast.find(btree, comparator, upperBound, inclusiveUpperBound ? Op.HIGHER : Op.CEIL, true);
+            find(btree, comparator, lowerBound, inclusiveLowerBound ? Op.CEIL : Op.HIGHER, true);
+        }
+        else
+        {
+            findLast.find(btree, comparator, lowerBound, inclusiveLowerBound ? Op.LOWER : Op.FLOOR, false);
+            find(btree, comparator, upperBound, inclusiveUpperBound ? Op.FLOOR : Op.LOWER, false);
+        }
+        int c = this.compareTo(findLast, forwards);
+        if (forwards ? c > 0 : c < 0)
+        {
+            endNode = currentNode();
+            endIndex = currentIndex();
+        }
+        else
+        {
+            endNode = findLast.currentNode();
+            endIndex = findLast.currentIndex();
+        }
+    }
+
+    public boolean hasNext()
+    {
+        return path[depth] != endNode || indexes[depth] != endIndex;
+    }
+
+    public V next()
+    {
+        Object r = currentKey();
+        if (forwards)
+            successor();
+        else
+            predecessor();
+        return (V) r;
+    }
+
+    public int count()
+    {
+        if (!forwards)
+            throw new IllegalStateException("Count can only be run on forward cursors");
+        int count = 0;
+        int next;
+        while ((next = consumeNextLeaf()) >= 0)
+            count += next;
+        return count;
+    }
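+
+    /*
+     * Illustrative sketch (not part of this patch): count() sizes a forward range without materialising
+     * it; for example BTreeSet.size() is implemented as slice(true, false).count(), i.e.
+     *
+     *   Cursor<Integer, Integer> c = BTree.slice(tree, true);   // full ascending range
+     *   int n = c.count();
+     */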
+
+    /**
+     * @return the number of objects consumed by moving out of the next (possibly current) leaf
+     */
+    private int consumeNextLeaf()
+    {
+        Object[] node = currentNode();
+        int r = 0;
+
+        if (!isLeaf(node))
+        {
+            // if we're not in a leaf, then calling successor once will take us to a leaf, since the next
+            // key will be in the leftmost subtree of whichever branch is next.  For instance, if we
+            // are in the root node of the tree depicted by http://cis.stvincent.edu/html/tutorials/swd/btree/btree1.gif,
+            // successor() will take us to the leaf containing N and O.
+            int i = currentIndex();
+            if (node == endNode && i == endIndex)
+                return -1;
+            r = 1;
+            successor();
+            node = currentNode();
+        }
+
+        if (node == endNode)
+        {
+            // only count up to endIndex, and don't call successor()
+            if (currentIndex() == endIndex)
+                return r > 0 ? r : -1;
+            r += endIndex - currentIndex();
+            setIndex(endIndex);
+            return r;
+        }
+
+        // count the remaining objects in this leaf
+        int keyEnd = getLeafKeyEnd(node);
+        r += keyEnd - currentIndex();
+        setIndex(keyEnd);
+        successor();
+        return r;
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java b/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java
new file mode 100644
index 0000000..9d57182
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import org.apache.cassandra.utils.ObjectSizes;
+
+import java.util.Arrays;
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.EMPTY_BRANCH;
+import static org.apache.cassandra.utils.btree.BTree.FAN_FACTOR;
+import static org.apache.cassandra.utils.btree.BTree.compare;
+import static org.apache.cassandra.utils.btree.BTree.find;
+import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.isLeaf;
+
+/**
+ * Represents a level / stack item of in progress modifications to a BTree.
+ */
+final class NodeBuilder
+{
+    private static final int MAX_KEYS = 1 + (FAN_FACTOR * 2);
+
+    // parent stack
+    private NodeBuilder parent, child;
+
+    // buffer for building new nodes
+    private Object[] buildKeys = new Object[MAX_KEYS];  // buffers keys for branches and leaves
+    private Object[] buildChildren = new Object[1 + MAX_KEYS]; // buffers children for branches only
+    private int buildKeyPosition;
+    private int buildChildPosition;
+    // we null out the contents of buildKeys/buildChildren when clear()ing them for re-use; this is where
+    // we track how much we actually have to null out
+    private int maxBuildKeyPosition;
+
+    // current node of the btree we're modifying/copying from
+    private Object[] copyFrom;
+    // the index of the first key in copyFrom that has not yet been copied into the build arrays
+    private int copyFromKeyPosition;
+    // the index of the first child node in copyFrom that has not yet been copied into the build arrays
+    private int copyFromChildPosition;
+
+    private UpdateFunction updateFunction;
+    private Comparator comparator;
+
+    // upper bound of range owned by this level; lets us know if we need to ascend back up the tree
+    // for the next key we update when bsearch gives an insertion point past the end of the values
+    // in the current node
+    private Object upperBound;
+
+    // ensure we aren't referencing any garbage
+    void clear()
+    {
+        NodeBuilder current = this;
+        while (current != null && current.upperBound != null)
+        {
+            current.clearSelf();
+            current = current.child;
+        }
+        current = parent;
+        while (current != null && current.upperBound != null)
+        {
+            current.clearSelf();
+            current = current.parent;
+        }
+    }
+
+    void clearSelf()
+    {
+        reset(null, null, null, null);
+        Arrays.fill(buildKeys, 0, maxBuildKeyPosition, null);
+        Arrays.fill(buildChildren, 0, maxBuildKeyPosition + 1, null);
+        maxBuildKeyPosition = 0;
+    }
+
+    // reset counters/setup to copy from provided node
+    void reset(Object[] copyFrom, Object upperBound, UpdateFunction updateFunction, Comparator comparator)
+    {
+        this.copyFrom = copyFrom;
+        this.upperBound = upperBound;
+        this.updateFunction = updateFunction;
+        this.comparator = comparator;
+        maxBuildKeyPosition = Math.max(maxBuildKeyPosition, buildKeyPosition);
+        buildKeyPosition = 0;
+        buildChildPosition = 0;
+        copyFromKeyPosition = 0;
+        copyFromChildPosition = 0;
+    }
+
+    NodeBuilder finish()
+    {
+        assert copyFrom != null;
+        int copyFromKeyEnd = getKeyEnd(copyFrom);
+
+        if (buildKeyPosition + buildChildPosition > 0)
+        {
+            // only want to copy if we've already changed something, otherwise we'll return the original
+            copyKeys(copyFromKeyEnd);
+            if (!isLeaf(copyFrom))
+                copyChildren(copyFromKeyEnd + 1);
+        }
+        return isRoot() ? null : ascend();
+    }
+
+    /**
+     * Inserts or replaces the provided key, copying all not-yet-visited keys prior to it into our buffer.
+     *
+     * @param key key we are inserting/replacing
+     * @return the NodeBuilder to retry the update against (a child if we own the range being updated,
+     * a parent if we do not -- we got here from an earlier key -- and we need to ascend back up),
+     * or null if we finished the update in this node.
+     */
+    NodeBuilder update(Object key)
+    {
+        assert copyFrom != null;
+        int copyFromKeyEnd = getKeyEnd(copyFrom);
+
+        int i = copyFromKeyPosition;
+        boolean found; // exact key match?
+        boolean owns = true; // true iff this node (or a child) should contain the key
+        if (i == copyFromKeyEnd)
+        {
+            found = false;
+        }
+        else
+        {
+            // this optimisation is for the common scenario of updating an existing row with the same columns/keys,
+            // and simply avoids performing a binary search until we've checked the next key to be copied;
+            // possibly we should disable this check if we determine that it fails more than a handful of times
+            // during any given builder use, to get the best of both worlds
+            int c = -comparator.compare(key, copyFrom[i]);
+            if (c >= 0)
+            {
+                found = c == 0;
+            }
+            else
+            {
+                i = find(comparator, key, copyFrom, i + 1, copyFromKeyEnd);
+                found = i >= 0;
+                if (!found)
+                    i = -i - 1;
+            }
+        }
+
+        if (found)
+        {
+            Object prev = copyFrom[i];
+            Object next = updateFunction.apply(prev, key);
+            // we aren't actually replacing anything, so leave our state intact and continue
+            if (prev == next)
+                return null;
+            key = next;
+        }
+        else if (i == copyFromKeyEnd && compare(comparator, key, upperBound) >= 0)
+            owns = false;
+
+        if (isLeaf(copyFrom))
+        {
+
+            if (owns)
+            {
+                // copy keys from the original node up to (but excluding) the found index
+                copyKeys(i);
+
+                if (found)
+                {
+                    // if found, we've applied updateFunction already
+                    replaceNextKey(key);
+                }
+                else
+                {
+                    // if not found, we need to apply updateFunction still
+                    key = updateFunction.apply(key);
+                    addNewKey(key); // handles splitting parent if necessary via ensureRoom
+                }
+
+                // done, so return null
+                return null;
+            }
+            else
+            {
+                // we don't want to copy anything if we're ascending and haven't copied anything previously,
+                // as in this case we can return the original node. Leaving buildKeyPosition as 0 indicates
+                // to buildFromRange that it should return the original instead of building a new node
+                if (buildKeyPosition > 0)
+                    copyKeys(i);
+            }
+
+            // if we don't own it, all we need to do is ensure we've copied everything in this node
+            // (which we have done, since not owning means pos >= keyEnd), ascend, and let Modifier.update
+            // retry against the parent node.  The ascend() call after the else branch takes care of that.
+        }
+        else
+        {
+            // branch
+            if (found)
+            {
+                copyKeys(i);
+                replaceNextKey(key);
+                copyChildren(i + 1);
+                return null;
+            }
+            else if (owns)
+            {
+                copyKeys(i);
+                copyChildren(i);
+
+                // belongs to the range owned by this node, but not equal to any key in the node
+                // so descend into the owning child
+                Object newUpperBound = i < copyFromKeyEnd ? copyFrom[i] : upperBound;
+                Object[] descendInto = (Object[]) copyFrom[copyFromKeyEnd + i];
+                ensureChild().reset(descendInto, newUpperBound, updateFunction, comparator);
+                return child;
+            }
+            else if (buildKeyPosition > 0 || buildChildPosition > 0)
+            {
+                // ensure we've copied all keys and children, but only if we've already copied something.
+                // otherwise we want to return the original node
+                copyKeys(copyFromKeyEnd);
+                copyChildren(copyFromKeyEnd + 1); // a branch always has exactly one more child than it has keys
+            }
+        }
+
+        return ascend();
+    }
+
+
+    // UTILITY METHODS FOR IMPLEMENTATION OF UPDATE/BUILD/DELETE
+
+    boolean isRoot()
+    {
+        // if parent == null, or parent.upperBound == null, then we have not initialised a parent builder,
+        // so we are the top level builder holding modifications; if we have more than FAN_FACTOR items, though,
+        // we are not a valid root so we would need to spill-up to create a new root
+        return (parent == null || parent.upperBound == null) && buildKeyPosition <= FAN_FACTOR;
+    }
+
+    // ascend to the root node, splitting into proper node sizes as we go; useful for building
+    // where we work only on the newest child node, which may construct many spill-over parents as it goes
+    NodeBuilder ascendToRoot()
+    {
+        NodeBuilder current = this;
+        while (!current.isRoot())
+            current = current.ascend();
+        return current;
+    }
+
+    // builds a new root BTree node - must be called on root of operation
+    Object[] toNode()
+    {
+        assert buildKeyPosition <= FAN_FACTOR && (buildKeyPosition > 0 || copyFrom.length > 0) : buildKeyPosition;
+        return buildFromRange(0, buildKeyPosition, isLeaf(copyFrom), false);
+    }
+
+    // finish up this level and pass any constructed children up to our parent, ensuring a parent exists
+    private NodeBuilder ascend()
+    {
+        ensureParent();
+        boolean isLeaf = isLeaf(copyFrom);
+        if (buildKeyPosition > FAN_FACTOR)
+        {
+            // split current node and move the midpoint into parent, with the two halves as children
+            int mid = buildKeyPosition / 2;
+            parent.addExtraChild(buildFromRange(0, mid, isLeaf, true), buildKeys[mid]);
+            parent.finishChild(buildFromRange(mid + 1, buildKeyPosition - (mid + 1), isLeaf, false));
+        }
+        else
+        {
+            parent.finishChild(buildFromRange(0, buildKeyPosition, isLeaf, false));
+        }
+        return parent;
+    }
+
+    // copy keys from copyFrom to the builder, up to the provided index in copyFrom (exclusive)
+    private void copyKeys(int upToKeyPosition)
+    {
+        if (copyFromKeyPosition >= upToKeyPosition)
+            return;
+
+        int len = upToKeyPosition - copyFromKeyPosition;
+        assert len <= FAN_FACTOR : upToKeyPosition + "," + copyFromKeyPosition;
+
+        ensureRoom(buildKeyPosition + len);
+        if (len > 0)
+        {
+            System.arraycopy(copyFrom, copyFromKeyPosition, buildKeys, buildKeyPosition, len);
+            copyFromKeyPosition = upToKeyPosition;
+            buildKeyPosition += len;
+        }
+    }
+
+    // skips the next key in copyFrom, and puts the provided key in the builder instead
+    private void replaceNextKey(Object with)
+    {
+        // (differs from addNewKey in that the caller has already applied updateFunction to the replacement key)
+        ensureRoom(buildKeyPosition + 1);
+        buildKeys[buildKeyPosition++] = with;
+
+        copyFromKeyPosition++;
+    }
+
+    // puts the provided key in the builder, with no impact on treatment of data from copyFrom
+    void addNewKey(Object key)
+    {
+        ensureRoom(buildKeyPosition + 1);
+        buildKeys[buildKeyPosition++] = updateFunction.apply(key);
+    }
+
+    // copies children from copyFrom to the builder, up to the provided index in copyFrom (exclusive)
+    private void copyChildren(int upToChildPosition)
+    {
+        // (ensureRoom isn't called here, as we should always be at/behind key additions)
+        if (copyFromChildPosition >= upToChildPosition)
+            return;
+        int len = upToChildPosition - copyFromChildPosition;
+        if (len > 0)
+        {
+            System.arraycopy(copyFrom, getKeyEnd(copyFrom) + copyFromChildPosition, buildChildren, buildChildPosition, len);
+            copyFromChildPosition = upToChildPosition;
+            buildChildPosition += len;
+        }
+    }
+
+    // adds a new and unexpected child to the builder - called by children that overflow
+    private void addExtraChild(Object[] child, Object upperBound)
+    {
+        ensureRoom(buildKeyPosition + 1);
+        buildKeys[buildKeyPosition++] = upperBound;
+        buildChildren[buildChildPosition++] = child;
+    }
+
+    // adds a replacement expected child to the builder - called by children prior to ascending
+    private void finishChild(Object[] child)
+    {
+        buildChildren[buildChildPosition++] = child;
+        copyFromChildPosition++;
+    }
+
+    // checks if we can add the requested keys+children to the builder, and if not we spill over into our parent
+    private void ensureRoom(int nextBuildKeyPosition)
+    {
+        if (nextBuildKeyPosition < MAX_KEYS)
+            return;
+
+        // flush even number of items so we don't waste leaf space repeatedly
+        Object[] flushUp = buildFromRange(0, FAN_FACTOR, isLeaf(copyFrom), true);
+        ensureParent().addExtraChild(flushUp, buildKeys[FAN_FACTOR]);
+        int size = FAN_FACTOR + 1;
+        assert size <= buildKeyPosition : buildKeyPosition + "," + nextBuildKeyPosition;
+        System.arraycopy(buildKeys, size, buildKeys, 0, buildKeyPosition - size);
+        buildKeyPosition -= size;
+        maxBuildKeyPosition = buildKeys.length;
+        if (buildChildPosition > 0)
+        {
+            System.arraycopy(buildChildren, size, buildChildren, 0, buildChildPosition - size);
+            buildChildPosition -= size;
+        }
+    }
+
+    // builds and returns a node from the buffered objects in the given range
+    private Object[] buildFromRange(int offset, int keyLength, boolean isLeaf, boolean isExtra)
+    {
+        // if keyLength is 0, we didn't copy anything from the original, which means we didn't
+        // modify any of the range owned by it, so can simply return it as is
+        if (keyLength == 0)
+            return copyFrom;
+
+        Object[] a;
+        if (isLeaf)
+        {
+            a = new Object[keyLength + (keyLength & 1)];
+            System.arraycopy(buildKeys, offset, a, 0, keyLength);
+        }
+        else
+        {
+            a = new Object[1 + (keyLength * 2)];
+            System.arraycopy(buildKeys, offset, a, 0, keyLength);
+            System.arraycopy(buildChildren, offset, a, keyLength, keyLength + 1);
+        }
+        if (isExtra)
+            updateFunction.allocated(ObjectSizes.sizeOfArray(a));
+        else if (a.length != copyFrom.length)
+            updateFunction.allocated(ObjectSizes.sizeOfArray(a) -
+                                     (copyFrom.length == 0 ? 0 : ObjectSizes.sizeOfArray(copyFrom)));
+        return a;
+    }
+
+    // checks if there is an initialised parent, and if not creates/initialises one and returns it.
+    // differs from ensureChild in that we initialise here rather than in the caller, since parents should
+    // in general already be initialised, and only aren't when we are overflowing the original root node
+    private NodeBuilder ensureParent()
+    {
+        if (parent == null)
+        {
+            parent = new NodeBuilder();
+            parent.child = this;
+        }
+        if (parent.upperBound == null)
+            parent.reset(EMPTY_BRANCH, upperBound, updateFunction, comparator);
+        return parent;
+    }
+
+    // ensures a child level exists and returns it
+    NodeBuilder ensureChild()
+    {
+        if (child == null)
+        {
+            child = new NodeBuilder();
+            child.parent = this;
+        }
+        return child;
+    }
+}
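For orientation, the following is a standalone sketch of the midpoint split that ascend() performs when a builder has buffered more than FAN_FACTOR keys: the left and right halves become children and the middle key is handed to the parent via addExtraChild(). The class and names below are illustrative only and are not part of this patch.

import java.util.Arrays;

final class SplitSketch
{
    static final int FAN_FACTOR = 4; // assumption: the real value comes from BTree.FAN_FACTOR

    // splits an over-full, sorted key buffer into { left child, separator key, right child };
    // the separator is what the parent receives, mirroring NodeBuilder.ascend()
    static Object[][] split(Object[] buffer, int size)
    {
        assert size > FAN_FACTOR;
        int mid = size / 2;
        Object[] left = Arrays.copyOfRange(buffer, 0, mid);
        Object[] separator = { buffer[mid] };
        Object[] right = Arrays.copyOfRange(buffer, mid + 1, size);
        return new Object[][]{ left, separator, right };
    }

    public static void main(String[] args)
    {
        Object[] buffer = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        // prints [[1, 2, 3, 4], [5], [6, 7, 8, 9]]
        System.out.println(Arrays.deepToString(split(buffer, buffer.length)));
    }
}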
diff --git a/src/java/org/apache/cassandra/utils/btree/Path.java b/src/java/org/apache/cassandra/utils/btree/Path.java
new file mode 100644
index 0000000..51207ba
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/Path.java
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.NEGATIVE_INFINITY;
+import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
+import static org.apache.cassandra.utils.btree.BTree.getBranchKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.getLeafKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.isLeaf;
+
+/**
+ * An internal class for searching and iterating through a tree.  As it traverses the tree,
+ * it adds the nodes visited to a stack.  This allows us to backtrack from a child node
+ * to its parent.
+ *
+ * As we navigate the tree, we destructively modify this stack.
+ *
+ * Path is only intended to be used via Cursor.
+ */
+class Path
+{
+    // operations corresponding to the ones in NavigableSet
+    static enum Op
+    {
+        CEIL,   // the least element greater than or equal to the given element
+        FLOOR,  // the greatest element less than or equal to the given element
+        HIGHER, // the least element strictly greater than the given element
+        LOWER   // the greatest element strictly less than the given element
+    }
+
+    // the path to the searched-for key
+    Object[][] path;
+    // the index within the node of our path at a given depth
+    byte[] indexes;
+    // current depth.  nothing in path[i] for i > depth is valid.
+    byte depth;
+
+    Path() { }
+    Path(int depth)
+    {
+        this.path = new Object[depth][];
+        this.indexes = new byte[depth];
+    }
+
+    void ensureDepth(Object[] btree)
+    {
+        int depth = BTree.depth(btree);
+        if (path == null || path.length < depth)
+        {
+            path = new Object[depth][];
+            indexes = new byte[depth];
+        }
+    }
+
+    void moveEnd(Object[] node, boolean forwards)
+    {
+        push(node, getKeyEnd(node));
+        if (!forwards)
+            predecessor();
+    }
+
+    void moveStart(Object[] node, boolean forwards)
+    {
+        push(node, -1);
+        if (forwards)
+            successor();
+    }
+
+    /**
+     * Find the provided key in the tree rooted at node, and store the path from the root to it
+     *
+     * @param node       the tree to search in
+     * @param comparator the comparator defining the order on the tree
+     * @param target     the key to search for
+     * @param mode       the type of search to perform
+     * @param forwards   if the path should be set up for forward or backward iteration
+     * @param <V>
+     */
+    <V> void find(Object[] node, Comparator<V> comparator, Object target, Op mode, boolean forwards)
+    {
+        // TODO : should not require parameter 'forwards' - consider modifying index to represent both
+        // child and key position, as opposed to just key position (which necessitates a different value depending
+        // on which direction you're moving in). Prerequisite for making Path public and using it to implement
+        // general search.
+
+        depth = -1;
+        if (target instanceof BTree.Special)
+        {
+            if (target == POSITIVE_INFINITY)
+                moveEnd(node, forwards);
+            else if (target == NEGATIVE_INFINITY)
+                moveStart(node, forwards);
+            else
+                throw new AssertionError();
+            return;
+        }
+
+        while (true)
+        {
+            int keyEnd = getKeyEnd(node);
+
+            // search for the target in the current node
+            int i = BTree.find(comparator, target, node, 0, keyEnd);
+            if (i >= 0)
+            {
+                // exact match. transform exclusive bounds into the correct index by moving back or forwards one
+                push(node, i);
+                switch (mode)
+                {
+                    case HIGHER:
+                        successor();
+                        break;
+                    case LOWER:
+                        predecessor();
+                }
+                return;
+            }
+            i = -i - 1;
+
+            // traverse into the appropriate child
+            if (!isLeaf(node))
+            {
+                push(node, forwards ? i - 1 : i);
+                node = (Object[]) node[keyEnd + i];
+                continue;
+            }
+
+            // bottom of the tree and still not found.  pick the right index to satisfy Op
+            switch (mode)
+            {
+                case FLOOR:
+                case LOWER:
+                    i--;
+            }
+
+            if (i < 0)
+            {
+                push(node, 0);
+                predecessor();
+            }
+            else if (i >= keyEnd)
+            {
+                push(node, keyEnd - 1);
+                successor();
+            }
+            else
+            {
+                push(node, i);
+            }
+
+            return;
+        }
+    }
+
+    private boolean isRoot()
+    {
+        return depth == 0;
+    }
+
+    private void pop()
+    {
+        depth--;
+    }
+
+    Object[] currentNode()
+    {
+        return path[depth];
+    }
+
+    byte currentIndex()
+    {
+        return indexes[depth];
+    }
+
+    private void push(Object[] node, int index)
+    {
+        path[++depth] = node;
+        indexes[depth] = (byte) index;
+    }
+
+    void setIndex(int index)
+    {
+        indexes[depth] = (byte) index;
+    }
+
+    // move to the next key in the tree
+    void successor()
+    {
+        Object[] node = currentNode();
+        int i = currentIndex();
+
+        if (!isLeaf(node))
+        {
+            // if we're on a key in a branch, we MUST have a descendant either side of us,
+            // so we always go down the left-most child until we hit a leaf
+            node = (Object[]) node[getBranchKeyEnd(node) + i + 1];
+            while (!isLeaf(node))
+            {
+                push(node, -1);
+                node = (Object[]) node[getBranchKeyEnd(node)];
+            }
+            push(node, 0);
+            return;
+        }
+
+        // if we haven't reached the end of this leaf, just increment our index and return
+        i += 1;
+        if (i < getLeafKeyEnd(node))
+        {
+            // moved to the next key in the same leaf
+            setIndex(i);
+            return;
+        }
+
+        // we've reached the end of this leaf,
+        // so go up until we reach something we've not finished visiting
+        while (!isRoot())
+        {
+            pop();
+            i = currentIndex() + 1;
+            node = currentNode();
+            if (i < getKeyEnd(node))
+            {
+                setIndex(i);
+                return;
+            }
+        }
+
+        // we've visited the last key in the root node, so we're done
+        setIndex(getKeyEnd(node));
+    }
+
+    // move to the previous key in the tree
+    void predecessor()
+    {
+        Object[] node = currentNode();
+        int i = currentIndex();
+
+        if (!isLeaf(node))
+        {
+            // if we're on a key in a branch, we MUST have a descendant either side of us
+            // so we always go down the right-most child until we hit a leaf
+            node = (Object[]) node[getBranchKeyEnd(node) + i];
+            while (!isLeaf(node))
+            {
+                i = getBranchKeyEnd(node);
+                push(node, i);
+                node = (Object[]) node[i * 2];
+            }
+            push(node, getLeafKeyEnd(node) - 1);
+            return;
+        }
+
+        // if we haven't reached the beginning of this leaf, just decrement our index and return
+        i -= 1;
+        if (i >= 0)
+        {
+            setIndex(i);
+            return;
+        }
+
+        // we've reached the beginning of this leaf,
+        // so go up until we reach something we've not finished visiting
+        while (!isRoot())
+        {
+            pop();
+            i = currentIndex() - 1;
+            if (i >= 0)
+            {
+                setIndex(i);
+                return;
+            }
+        }
+
+        // we've visited the first key in the root node, so we're done
+        setIndex(-1);
+    }
+
+    Object currentKey()
+    {
+        return currentNode()[currentIndex()];
+    }
+
+    int compareTo(Path that, boolean forwards)
+    {
+        int d = Math.min(this.depth, that.depth);
+        for (int i = 0; i <= d; i++)
+        {
+            int c = this.indexes[i] - that.indexes[i];
+            if (c != 0)
+                return c;
+        }
+        // identical indices up to the common depth, so whichever path is shallower is on a later item when iterating
+        // forwards and an earlier item when iterating backwards: the node at the maximum common depth must be a branch
+        // if the depths differ, and a branch we have descended into lags the child index it is in when iterating
+        // forwards, i.e. while in child 0 it records an index of -1 going forwards, or 0 going backwards
+        d = this.depth - that.depth;
+        return forwards ? d : -d;
+    }
+}
+
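The Op modes mirror the NavigableSet semantics listed in the enum. As a rough aid, the standalone sketch below shows the same ceiling/floor/higher/lower selection over a plain sorted array via binary search; it is illustrative only and is not the Path API.

import java.util.Arrays;

final class OpSketch
{
    enum Op { CEIL, FLOOR, HIGHER, LOWER }

    // returns the index of the element satisfying op relative to target, or -1 if none exists
    static int find(int[] sorted, int target, Op op)
    {
        int i = Arrays.binarySearch(sorted, target);
        boolean exact = i >= 0;
        int insertion = exact ? i : -i - 1; // index of the first element greater than target, when not exact
        int result;
        switch (op)
        {
            case CEIL:   result = exact ? i : insertion;        break;
            case HIGHER: result = exact ? i + 1 : insertion;    break;
            case FLOOR:  result = exact ? i : insertion - 1;    break;
            case LOWER:  result = (exact ? i : insertion) - 1;  break;
            default: throw new AssertionError();
        }
        return result >= 0 && result < sorted.length ? result : -1;
    }

    public static void main(String[] args)
    {
        int[] keys = { 10, 20, 30 };
        System.out.println(find(keys, 20, Op.HIGHER)); // 2, i.e. element 30
        System.out.println(find(keys, 25, Op.FLOOR));  // 1, i.e. element 20
    }
}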
diff --git a/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java b/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java
new file mode 100644
index 0000000..9f45031
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import com.google.common.base.Function;
+/**
+ * An interface defining a function to be applied to both the object we are replacing in a BTree and
+ * the object that is intended to replace it, returning the object to actually replace it.
+ *
+ * @param <V>
+ */
+public interface UpdateFunction<V> extends Function<V, V>
+{
+    /**
+     * @param replacing the value in the original tree we have matched
+     * @param update the value in the updating collection that matched
+     * @return the value to insert into the new tree
+     */
+    V apply(V replacing, V update);
+
+    /**
+     * @return true if we should fail the update
+     */
+    boolean abortEarly();
+
+    /**
+     * @param heapSize extra heap space allocated (over previous tree)
+     */
+    void allocated(long heapSize);
+
+    public static final class NoOp<V> implements UpdateFunction<V>
+    {
+
+        private static final NoOp INSTANCE = new NoOp();
+        public static <V> NoOp<V> instance()
+        {
+            return INSTANCE;
+        }
+        
+        private NoOp()
+        {
+        }
+
+        public V apply(V replacing, V update)
+        {
+            return update;
+        }
+
+        public V apply(V update)
+        {
+            return update;
+        }
+
+        public boolean abortEarly()
+        {
+            return false;
+        }
+
+        public void allocated(long heapSize)
+        {
+        }
+    }
+
+}
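A minimal sketch of a non-trivial UpdateFunction implementation, resolving a matched key by keeping whichever value carries the higher timestamp and accumulating the extra heap the builder reports. The Versioned type and its fields are hypothetical, purely for illustration; only the interface contract above comes from the patch.

import org.apache.cassandra.utils.btree.UpdateFunction;

final class KeepNewest implements UpdateFunction<KeepNewest.Versioned>
{
    static final class Versioned
    {
        final String key;       // hypothetical payload
        final long timestamp;
        Versioned(String key, long timestamp) { this.key = key; this.timestamp = timestamp; }
    }

    private long extraHeap;

    public Versioned apply(Versioned replacing, Versioned update)
    {
        // keep the existing value if it is at least as new; returning it unchanged lets the builder reuse the node
        return replacing.timestamp >= update.timestamp ? replacing : update;
    }

    public Versioned apply(Versioned insert)
    {
        return insert; // nothing to merge with when the key was not already present
    }

    public boolean abortEarly()
    {
        return false; // never abandon the update in this sketch
    }

    public void allocated(long heapSize)
    {
        extraHeap += heapSize; // accumulate the extra heap reported by the builder
    }
}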
diff --git a/src/java/org/apache/cassandra/utils/concurrent/Locks.java b/src/java/org/apache/cassandra/utils/concurrent/Locks.java
new file mode 100644
index 0000000..1ed5492
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/Locks.java
@@ -0,0 +1,37 @@
+package org.apache.cassandra.utils.concurrent;
+
+import sun.misc.Unsafe;
+
+import java.lang.reflect.Field;
+
+public class Locks
+{
+    static final Unsafe unsafe;
+
+    static
+    {
+        try
+        {
+            Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
+            field.setAccessible(true);
+            unsafe = (sun.misc.Unsafe) field.get(null);
+        }
+        catch (Exception e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    // enters the object's monitor IF UNSAFE IS PRESENT. If it isn't, this is a no-op.
+    public static void monitorEnterUnsafe(Object object)
+    {
+        if (unsafe != null)
+            unsafe.monitorEnter(object);
+    }
+
+    public static void monitorExitUnsafe(Object object)
+    {
+        if (unsafe != null)
+            unsafe.monitorExit(object);
+    }
+}
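A minimal usage sketch for Locks, assuming the caller pairs the enter/exit calls around the guarded work exactly as it would with a synchronized block:

import org.apache.cassandra.utils.concurrent.Locks;

final class LocksUsageSketch
{
    private final Object stateLock = new Object();
    private long counter;

    void incrementGuarded()
    {
        // equivalent to synchronized (stateLock) for this method, though enter/exit
        // may also be split across methods, which the synchronized keyword cannot express
        Locks.monitorEnterUnsafe(stateLock);
        try
        {
            counter++;
        }
        finally
        {
            Locks.monitorExitUnsafe(stateLock);
        }
    }
}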
diff --git a/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java b/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java
new file mode 100644
index 0000000..5cebf44
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.concurrent;
+
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+
+/**
+ * <p>A class for providing synchronization between producers and consumers that do not
+ * communicate directly with each other, but where the consumers need to process their
+ * work in contiguous batches. In particular this is useful for both CommitLog and Memtable
+ * where the producers (writing threads) are modifying a structure that the consumer
+ * (flush executor) only batch syncs, but needs to know what 'position' the work is at
+ * for co-ordination with other processes,
+ *
+ * <p>The typical usage is something like:
+ * <pre>
+     public final class ExampleShared
+     {
+        final OpOrder order = new OpOrder();
+        volatile SharedState state;
+
+        static class SharedState
+        {
+            volatile Barrier barrier;
+
+            // ...
+        }
+
+        public void consume()
+        {
+            SharedState state = this.state;
+            state.setReplacement(new SharedState());
+            state.doSomethingToPrepareForBarrier();
+
+            state.barrier = order.newBarrier();
+            // issue() MUST be called after newBarrier() else barrier.isAfter()
+            // will always return true, and barrier.await() will fail
+            state.barrier.issue();
+
+            // wait for all producer work started prior to the barrier to complete
+            state.barrier.await();
+
+            // change the shared state to its replacement, as the current state will no longer be used by producers
+            this.state = state.getReplacement();
+
+            state.doSomethingWithExclusiveAccess();
+        }
+
+        public void produce()
+        {
+            try (Group opGroup = order.start())
+            {
+                SharedState s = state;
+                while (s.barrier != null && !s.barrier.isAfter(opGroup))
+                    s = s.getReplacement();
+                s.doProduceWork();
+            }
+        }
+    }
+ * </pre>
+ */
+public class OpOrder
+{
+    /**
+     * The value of Group.running that indicates the Group is complete
+     */
+    private static final int FINISHED = -1;
+
+    /**
+     * A linked list starting with the most recent Group, i.e. the one new operations should start against,
+     * with (prev) links to any incomplete Group instances, and (next) links to any potential future Group instances.
+     * Once all operations started against a Group and its ancestors have finished, the next instance
+     * will unlink this one.
+     */
+    private volatile Group current = new Group();
+
+    /**
+     * Start an operation against this OpOrder.
+     * Once the operation is completed Group.close() MUST be called EXACTLY once for this operation.
+     *
+     * @return the Group this operation has been registered against
+     */
+    public Group start()
+    {
+        while (true)
+        {
+            Group current = this.current;
+            if (current.register())
+                return current;
+        }
+    }
+
+    /**
+     * Creates a new barrier. The barrier is only a placeholder until barrier.issue() is called on it,
+     * after which all new operations will start against a new Group that will not be accepted
+     * by barrier.isAfter(), and barrier.await() will return only once all operations started prior to the issue
+     * have completed.
+     *
+     * @return the new Barrier
+     */
+    public Barrier newBarrier()
+    {
+        return new Barrier();
+    }
+
+    public Group getCurrent()
+    {
+        return current;
+    }
+
+    public void awaitNewBarrier()
+    {
+        Barrier barrier = newBarrier();
+        barrier.issue();
+        barrier.await();
+    }
+
+    /**
+     * Represents a group of identically ordered operations, i.e. all operations started in the interval between
+     * two barrier issuances. For each register() call that returns this Group, close() must be called exactly once;
+     * it should be treated like acquiring and releasing a lock.
+     */
+    public static final class Group implements Comparable<Group>, AutoCloseable
+    {
+        /**
+         * In general this class goes through the following stages:
+         * 1) LIVE:      many calls to register() and close()
+         * 2) FINISHING: a call to expire() (after a barrier issue), means calls to register() will now fail,
+         *               and we are now 'in the past' (new operations will be started against a new Group)
+         * 3) FINISHED:  once the last close() is called, this Group is done. We call unlink().
+         * 4) ZOMBIE:    all our operations are finished, but some operations against an earlier Group are still
+         *               running, or tidying up, so unlink() fails to remove us
+         * 5) COMPLETE:  all operations started on or before us are FINISHED (and COMPLETE), so we are unlinked
+         * <p/>
+         * Another parallel state is ISBLOCKING:
+         * <p/>
+         * isBlocking => a barrier that is waiting on us (either directly, or via a future Ordered) is blocking general
+         * progress. This state is entered by calling Barrier.markBlocking(). If the running operations are blocked
+         * on a Signal that is also registered with the isBlockingSignal (probably through isSafeBlockingSignal)
+         * then they will be notified that they are blocking forward progress, and may take action to avoid that.
+         */
+
+        private volatile Group prev, next;
+        private final long id; // monotonically increasing id for compareTo()
+        private volatile int running = 0; // number of operations currently running.  < 0 means we're expired, and the count of tasks still running is -(running + 1)
+        private volatile boolean isBlocking; // indicates running operations are blocking future barriers
+        private final WaitQueue isBlockingSignal = new WaitQueue(); // signal to wait on to indicate isBlocking is true
+        private final WaitQueue waiting = new WaitQueue(); // signal to wait on for completion
+
+        static final AtomicIntegerFieldUpdater<Group> runningUpdater = AtomicIntegerFieldUpdater.newUpdater(Group.class, "running");
+
+        // constructs first instance only
+        private Group()
+        {
+            this.id = 0;
+        }
+
+        private Group(Group prev)
+        {
+            this.id = prev.id + 1;
+            this.prev = prev;
+        }
+
+        // prevents any further operations starting against this Group
+        // if there are no running operations, calls unlink; otherwise, we let the last op to close call it.
+        // this means issue() won't have to block for ops to finish.
+        private void expire()
+        {
+            while (true)
+            {
+                int current = running;
+                if (current < 0)
+                    throw new IllegalStateException();
+                if (runningUpdater.compareAndSet(this, current, -1 - current))
+                {
+                    // if we're already finished (no running ops), unlink ourselves
+                    if (current == 0)
+                        unlink();
+                    return;
+                }
+            }
+        }
+
+        // attempts to start an operation against this Group, and returns true if successful.
+        private boolean register()
+        {
+            while (true)
+            {
+                int current = running;
+                if (current < 0)
+                    return false;
+                if (runningUpdater.compareAndSet(this, current, current + 1))
+                    return true;
+            }
+        }
+
+        /**
+         * To be called exactly once for each register() call this object is returned for, indicating the operation
+         * is complete
+         */
+        public void close()
+        {
+            while (true)
+            {
+                int current = running;
+                if (current < 0)
+                {
+                    if (runningUpdater.compareAndSet(this, current, current + 1))
+                    {
+                        if (current + 1 == FINISHED)
+                        {
+                            // if we're now finished, unlink ourselves
+                            unlink();
+                        }
+                        return;
+                    }
+                }
+                else if (runningUpdater.compareAndSet(this, current, current - 1))
+                {
+                    return;
+                }
+            }
+        }
+
+        /**
+         * Called once we know all operations started against this Group have completed.
+         * However, we do not know whether operations against its ancestors have completed, or
+         * whether its descendants have completed ahead of it, so we attempt to create the longest
+         * chain from the oldest still-linked Group. If we can't reach the oldest through
+         * an unbroken chain of completed Groups, we abort and leave the still-completing
+         * ancestor to tidy up.
+         */
+        private void unlink()
+        {
+            // walk back in time to find the start of the list
+            Group start = this;
+            while (true)
+            {
+                Group prev = start.prev;
+                if (prev == null)
+                    break;
+                // if we haven't finished this Ordered yet abort and let it clean up when it's done
+                if (prev.running != FINISHED)
+                    return;
+                start = prev;
+            }
+
+            // now walk forwards in time, in case we finished up late
+            Group end = this.next;
+            while (end.running == FINISHED)
+                end = end.next;
+
+            // now walk from first to last, unlinking the prev pointer and waking up any blocking threads
+            while (start != end)
+            {
+                Group next = start.next;
+                next.prev = null;
+                start.waiting.signalAll();
+                start = next;
+            }
+        }
+
+        /**
+         * @return true if a barrier we are behind is, or may be, blocking general progress,
+         * so we should try more aggressively to progress
+         */
+        public boolean isBlocking()
+        {
+            return isBlocking;
+        }
+
+        /**
+         * register to be signalled when a barrier waiting on us is, or may be, blocking general progress,
+         * so we should try more aggressively to progress
+         */
+        public WaitQueue.Signal isBlockingSignal()
+        {
+            return isBlockingSignal.register();
+        }
+
+        /**
+         * wrap the provided signal to also be signalled if the operation gets marked blocking
+         */
+        public WaitQueue.Signal isBlockingSignal(WaitQueue.Signal signal)
+        {
+            return WaitQueue.any(signal, isBlockingSignal());
+        }
+
+        public int compareTo(Group that)
+        {
+            // we deliberately use subtraction, as opposed to Long.compareTo() as we care about ordering
+            // not which is the smaller value, so this permits wrapping in the unlikely event we exhaust the long space
+            long c = this.id - that.id;
+            if (c > 0)
+                return 1;
+            else if (c < 0)
+                return -1;
+            else
+                return 0;
+        }
+    }
+
+    /**
+     * This class represents a synchronisation point providing ordering guarantees on operations started
+     * against the enclosing OpOrder.  When issue() is called upon it (may only happen once per Barrier), the
+     * Barrier atomically partitions new operations from those already running (by expiring the current Group),
+     * and activates its isAfter() method
+     * which indicates if an operation was started before or after this partition. It offers methods to
+     * determine, or block until, all prior operations have finished, and a means to indicate to those operations
+     * that they are blocking forward progress. See {@link OpOrder} for idiomatic usage.
+     */
+    public final class Barrier
+    {
+        // this Barrier was issued after all Group operations started against orderOnOrBefore
+        private volatile Group orderOnOrBefore;
+
+        /**
+         * @return true if the given group was started prior to the issuing of the barrier.
+         *
+         * (Until issue is called, always returns true, but if you rely on this behavior you are probably
+         * Doing It Wrong.)
+         */
+        public boolean isAfter(Group group)
+        {
+            if (orderOnOrBefore == null)
+                return true;
+            // we subtract to permit wrapping round the full range of Long - so we only need to ensure
+            // there are never Long.MAX_VALUE * 2 total Group objects in existence at any one time, which will
+            // take care of itself
+            return orderOnOrBefore.id - group.id >= 0;
+        }
+
+        /**
+         * Issues (seals) the barrier, meaning no new operations may be issued against it, and expires the current
+         * Group.  Must be called before await() for isAfter() to be properly synchronised.
+         */
+        public void issue()
+        {
+            if (orderOnOrBefore != null)
+                throw new IllegalStateException("Can only call issue() once on each Barrier");
+
+            final Group current;
+            synchronized (OpOrder.this)
+            {
+                current = OpOrder.this.current;
+                orderOnOrBefore = current;
+                OpOrder.this.current = current.next = new Group(current);
+            }
+            current.expire();
+        }
+
+        /**
+         * Mark all prior operations as blocking, potentially signalling them to more aggressively make progress
+         */
+        public void markBlocking()
+        {
+            Group current = orderOnOrBefore;
+            while (current != null)
+            {
+                current.isBlocking = true;
+                current.isBlockingSignal.signalAll();
+                current = current.prev;
+            }
+        }
+
+        /**
+         * Register to be signalled once allPriorOpsAreFinished() may return true
+         */
+        public WaitQueue.Signal register()
+        {
+            return orderOnOrBefore.waiting.register();
+        }
+
+        /**
+         * @return true if all operations started prior to barrier.issue() have completed
+         */
+        public boolean allPriorOpsAreFinished()
+        {
+            Group current = orderOnOrBefore;
+            if (current == null)
+                throw new IllegalStateException("This barrier needs to have issue() called on it before prior operations can complete");
+            return current.next.prev == null;
+        }
+
+        /**
+         * wait for all operations started prior to issuing the barrier to complete
+         */
+        public void await()
+        {
+            while (!allPriorOpsAreFinished())
+            {
+                WaitQueue.Signal signal = register();
+                if (allPriorOpsAreFinished())
+                {
+                    signal.cancel();
+                    return;
+                }
+                else
+                    signal.awaitUninterruptibly();
+            }
+            assert orderOnOrBefore.running == FINISHED;
+        }
+
+        /**
+         * returns the Group we are waiting on - any Group with .compareTo(getSyncPoint()) <= 0
+         * must complete before await() returns
+         */
+        public Group getSyncPoint()
+        {
+            return orderOnOrBefore;
+        }
+    }
+}
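A minimal sketch of the producer/consumer protocol described in the class javadoc: producers bracket each write in a Group obtained from start() (closed via try-with-resources), while the consumer swaps in a fresh buffer, then issues and awaits a barrier so that every write started against the old buffer has completed before it is read. The buffer-swapping class itself is hypothetical.

import java.util.concurrent.ConcurrentLinkedQueue;
import org.apache.cassandra.utils.concurrent.OpOrder;

final class OpOrderSketch
{
    private final OpOrder writeOrder = new OpOrder();
    private volatile ConcurrentLinkedQueue<String> active = new ConcurrentLinkedQueue<>();

    // producer: bracket the write in an OpOrder.Group so the consumer can wait for it
    void append(String s)
    {
        try (OpOrder.Group op = writeOrder.start())
        {
            active.add(s);
        }
    }

    // consumer: swap in a fresh buffer, then wait until every write started
    // against the old buffer has completed before draining it
    ConcurrentLinkedQueue<String> flush()
    {
        ConcurrentLinkedQueue<String> flushing = active;
        active = new ConcurrentLinkedQueue<>();

        OpOrder.Barrier barrier = writeOrder.newBarrier();
        barrier.issue();  // must be issued before await(), per the javadoc above
        barrier.await();

        return flushing;
    }
}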
diff --git a/src/java/org/apache/cassandra/utils/concurrent/SimpleCondition.java b/src/java/org/apache/cassandra/utils/concurrent/SimpleCondition.java
new file mode 100644
index 0000000..57614e0
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/SimpleCondition.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.concurrent;
+
+import java.util.Date;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+import java.util.concurrent.locks.Condition;
+
+// fulfils the Condition interface without spurious wakeup problems
+// (or lost notify problems either: that is, even if you call await()
+// _after_ signalAll(), it will work as desired.)
+public class SimpleCondition implements Condition
+{
+    private static final AtomicReferenceFieldUpdater<SimpleCondition, WaitQueue> waitingUpdater = AtomicReferenceFieldUpdater.newUpdater(SimpleCondition.class, WaitQueue.class, "waiting");
+
+    private volatile WaitQueue waiting;
+    private volatile boolean signaled = false;
+
+    public void await() throws InterruptedException
+    {
+        if (isSignaled())
+            return;
+        if (waiting == null)
+            waitingUpdater.compareAndSet(this, null, new WaitQueue());
+        WaitQueue.Signal s = waiting.register();
+        if (isSignaled())
+            s.cancel();
+        else
+            s.await();
+        assert isSignaled();
+    }
+
+    public boolean await(long time, TimeUnit unit) throws InterruptedException
+    {
+        if (isSignaled())
+            return true;
+        long start = System.nanoTime();
+        long until = start + unit.toNanos(time);
+        if (waiting == null)
+            waitingUpdater.compareAndSet(this, null, new WaitQueue());
+        WaitQueue.Signal s = waiting.register();
+        if (isSignaled())
+        {
+            s.cancel();
+            return true;
+        }
+        return s.awaitUntil(until) || isSignaled();
+    }
+
+    public void signal()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public boolean isSignaled()
+    {
+        return signaled;
+    }
+
+    public void signalAll()
+    {
+        signaled = true;
+        if (waiting != null)
+            waiting.signalAll();
+    }
+
+    public void awaitUninterruptibly()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public long awaitNanos(long nanosTimeout) throws InterruptedException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public boolean awaitUntil(Date deadline) throws InterruptedException
+    {
+        throw new UnsupportedOperationException();
+    }
+}
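A minimal usage sketch for SimpleCondition: one thread publishes a result and calls signalAll(), another blocks in await(); as the comment above notes, an await() that arrives after signalAll() returns immediately.

import org.apache.cassandra.utils.concurrent.SimpleCondition;

final class SimpleConditionSketch
{
    private final SimpleCondition done = new SimpleCondition();
    private volatile String result;

    String awaitResult() throws InterruptedException
    {
        done.await();          // returns immediately if signalAll() already ran
        return result;
    }

    void complete(String value)
    {
        result = value;        // publish the result before signalling
        done.signalAll();
    }

    public static void main(String[] args) throws Exception
    {
        SimpleConditionSketch sketch = new SimpleConditionSketch();
        Thread waiter = new Thread(() -> {
            try { System.out.println(sketch.awaitResult()); }
            catch (InterruptedException e) { Thread.currentThread().interrupt(); }
        });
        waiter.start();
        sketch.complete("done");
        waiter.join();
    }
}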
diff --git a/src/java/org/apache/cassandra/utils/concurrent/WaitQueue.java b/src/java/org/apache/cassandra/utils/concurrent/WaitQueue.java
new file mode 100644
index 0000000..2322210
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/WaitQueue.java
@@ -0,0 +1,525 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.concurrent;
+
+import com.yammer.metrics.core.TimerContext;
+import org.slf4j.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+import java.util.concurrent.locks.LockSupport;
+
+/**
+ * <p>A relatively easy to use utility for general purpose thread signalling.</p>
+ * <p>Usage on a thread awaiting a state change using a WaitQueue q is:</p>
+ * <pre>
+ * {@code
+ *      while (!conditionMet())
+ *      {
+ *          Signal s = q.register();
+ *          if (!conditionMet())    // or, perhaps more correctly, !conditionChanged()
+ *              s.await();
+ *          else
+ *              s.cancel();
+ *      }
+ * }
+ * </pre>
+ * A signalling thread, AFTER changing the state, then calls q.signal() to wake up one, or q.signalAll()
+ * to wake up all, waiting threads.
+ * <p>To understand intuitively how this class works, the idea is simply that a thread, once it considers itself
+ * incapable of making progress, registers to be awoken once that changes. Since this could have changed between
+ * checking and registering (in which case the thread that made this change would have been unable to signal it),
+ * it checks the condition again, sleeping only if it hasn't changed/still is not met.</p>
+ * <p>This thread synchronisation scheme has some advantages over Condition objects and Object.wait/notify in that no monitor
+ * acquisition is necessary and, in fact, besides the actual waiting on a signal, all operations are non-blocking.
+ * As a result consumers can never block producers or each other (or vice versa) from making progress.
+ * Threads that are signalled are also put into a RUNNABLE state almost simultaneously, so they can all immediately make
+ * progress without having to serially acquire the monitor/lock, reducing scheduler delay incurred.</p>
+ *
+ * <p>A few notes on utilisation:</p>
+ * <p>1. A thread will only exit await() when it has been signalled, but this does not guarantee the condition has not
+ * been altered since it was signalled, and depending on your design it is likely the outer condition will need to be
+ * checked in a loop, though this is not always the case.</p>
+ * <p>2. Each signal is single use, so must be re-registered after each await(). This is true even if it times out.</p>
+ * <p>3. If you choose not to wait on the signal (because the condition has been met before you waited on it)
+ * you must cancel() the signal if the signalling thread uses signal() to awake waiters; otherwise signals will be
+ * lost. If signalAll() is used but infrequent, and register() is frequent, cancel() should still be used to prevent the
+ * queue growing unboundedly. Similarly, if you provide a TimerContext, cancel should be used to ensure it is not erroneously
+ * counted towards wait time.</p>
+ * <p>4. Care must be taken when selecting conditionMet() to ensure we are waiting on the condition that actually
+ * indicates progress is possible. In some complex cases it may be tempting to wait on a condition that is only indicative
+ * of local progress, not progress on the task we are aiming to complete, and a race may leave us waiting for a condition
+ * to be met that we no longer need.</p>
+ * <p>5. This scheme is not fair</p>
+ * <p>6. Only the thread that calls register() may call await()</p>
+ */
+public final class WaitQueue
+{
+
+    private static final int CANCELLED = -1;
+    private static final int SIGNALLED = 1;
+    private static final int NOT_SET = 0;
+
+    private static final AtomicIntegerFieldUpdater signalledUpdater = AtomicIntegerFieldUpdater.newUpdater(RegisteredSignal.class, "state");
+
+    // the waiting signals
+    private final ConcurrentLinkedQueue<RegisteredSignal> queue = new ConcurrentLinkedQueue<>();
+
+    /**
+     * The calling thread MUST be the thread that uses the signal.
+     * @return a new Signal registered with this queue
+     */
+    public Signal register()
+    {
+        RegisteredSignal signal = new RegisteredSignal();
+        queue.add(signal);
+        return signal;
+    }
+
+    /**
+     * The calling thread MUST be the thread that uses the signal.
+     * If the Signal is waited on, context.stop() will be called when the wait times out, the Signal is signalled,
+     * or the waiting thread is interrupted.
+     * @return a new Signal registered with this queue
+     */
+    public Signal register(TimerContext context)
+    {
+        assert context != null;
+        RegisteredSignal signal = new TimedSignal(context);
+        queue.add(signal);
+        return signal;
+    }
+
+    /**
+     * Signal one waiting thread
+     */
+    public boolean signal()
+    {
+        if (!hasWaiters())
+            return false;
+        while (true)
+        {
+            RegisteredSignal s = queue.poll();
+            if (s == null || s.signal() != null)
+                return s != null;
+        }
+    }
+
+    /**
+     * Signal all waiting threads
+     */
+    public void signalAll()
+    {
+        if (!hasWaiters())
+            return;
+
+        // to avoid a race where the condition is not met and the woken thread managed to wait on the queue before
+        // we finish signalling them all, we pick a random thread we have woken up and hold onto it, so that if we encounter
+        // it again we know we're looping. We reselect a random thread periodically, progressively less often.
+        // the "correct" solution to this problem is to use a queue that permits snapshot iteration, but this solution is sufficient
+        int i = 0, s = 5;
+        Thread randomThread = null;
+        Iterator<RegisteredSignal> iter = queue.iterator();
+        while (iter.hasNext())
+        {
+            RegisteredSignal signal = iter.next();
+            Thread signalled = signal.signal();
+
+            if (signalled != null)
+            {
+                if (signalled == randomThread)
+                    break;
+
+                if (++i == s)
+                {
+                    randomThread = signalled;
+                    s <<= 1;
+                }
+            }
+
+            iter.remove();
+        }
+    }
+
+    private void cleanUpCancelled()
+    {
+        // TODO: attempt to remove the cancelled from the beginning only (need atomic cas of head)
+        Iterator<RegisteredSignal> iter = queue.iterator();
+        while (iter.hasNext())
+        {
+            RegisteredSignal s = iter.next();
+            if (s.isCancelled())
+                iter.remove();
+        }
+    }
+
+    public boolean hasWaiters()
+    {
+        return !queue.isEmpty();
+    }
+
+    /**
+     * Return how many threads are waiting
+     * @return the number of threads currently waiting (excluding cancelled signals)
+     */
+    public int getWaiting()
+    {
+        if (!hasWaiters())
+            return 0;
+        Iterator<RegisteredSignal> iter = queue.iterator();
+        int count = 0;
+        while (iter.hasNext())
+        {
+            Signal next = iter.next();
+            if (!next.isCancelled())
+                count++;
+        }
+        return count;
+    }
+
+    /**
+     * A Signal is a one-time-use mechanism by which a thread waits for notification that some condition
+     * state it may be interested in has transitioned (and hence should be checked).
+     * The notification is potentially transient, i.e. the state can change again in the meantime; it only
+     * indicates that the condition should be checked, not what the state is expected to be.
+     *
+     * Signal implementations should never wake up spuriously, they are always woken up by a
+     * signal() or signalAll().
+     *
+     * This abstract definition of Signal does not need to be tied to a WaitQueue.
+     * Whilst RegisteredSignal is the main building block of Signals, this abstract
+     * definition allows us to compose Signals in useful ways. The Signal is 'owned' by the
+     * thread that registered itself with WaitQueue(s) to obtain the underlying RegisteredSignal(s);
+     * only the owning thread should use a Signal.
+     */
+    public static interface Signal
+    {
+
+        /**
+         * @return true if signalled; once true, must be discarded by the owning thread.
+         */
+        public boolean isSignalled();
+
+        /**
+         * @return true if cancelled; once cancelled, must be discarded by the owning thread.
+         */
+        public boolean isCancelled();
+
+        /**
+         * @return isSignalled() || isCancelled(). Once true, the state is fixed and the Signal should be discarded
+         * by the owning thread.
+         */
+        public boolean isSet();
+
+        /**
+         * atomically: cancels the Signal if !isSet(), or returns true if isSignalled()
+         *
+         * @return true if isSignalled()
+         */
+        public boolean checkAndClear();
+
+        /**
+         * Should only be called by the owning thread. Indicates the signal can be retired,
+         * and if signalled propagates the signal to another waiting thread
+         */
+        public abstract void cancel();
+
+        /**
+         * Wait, without throwing InterruptedException, until signalled. On exit isSignalled() must be true.
+         * If the thread is interrupted in the meantime, the interrupted flag will be set.
+         */
+        public void awaitUninterruptibly();
+
+        /**
+         * Wait until signalled, or throw an InterruptedException if interrupted before this happens.
+         * On normal exit isSignalled() must be true; however if InterruptedException is thrown isCancelled()
+         * will be true.
+         * @throws InterruptedException
+         */
+        public void await() throws InterruptedException;
+
+        /**
+         * Wait until signalled, or the provided time is reached, or the thread is interrupted. If signalled,
+         * isSignalled() will be true on exit, and the method will return true; if timed out, the method will return
+         * false and isCancelled() will be true; if interrupted, an InterruptedException will be thrown and isCancelled()
+         * will be true.
+         * @param nanos the System.nanoTime() deadline to wait until
+         * @return true if signalled, false if timed out
+         * @throws InterruptedException
+         */
+        public boolean awaitUntil(long nanos) throws InterruptedException;
+    }
+
+    /**
+     * An abstract signal implementation
+     */
+    public static abstract class AbstractSignal implements Signal
+    {
+        public void awaitUninterruptibly()
+        {
+            boolean interrupted = false;
+            while (!isSignalled())
+            {
+                if (Thread.interrupted())
+                    interrupted = true;
+                LockSupport.park();
+            }
+            if (interrupted)
+                Thread.currentThread().interrupt();
+            checkAndClear();
+        }
+
+        public void await() throws InterruptedException
+        {
+            while (!isSignalled())
+            {
+                checkInterrupted();
+                LockSupport.park();
+            }
+            checkAndClear();
+        }
+
+        public boolean awaitUntil(long until) throws InterruptedException
+        {
+            long now;
+            while (until > (now = System.nanoTime()) && !isSignalled())
+            {
+                checkInterrupted();
+                long delta = until - now;
+                LockSupport.parkNanos(delta);
+            }
+            return checkAndClear();
+        }
+
+        private void checkInterrupted() throws InterruptedException
+        {
+            if (Thread.interrupted())
+            {
+                cancel();
+                throw new InterruptedException();
+            }
+        }
+    }
+
+    /**
+     * A signal registered with this WaitQueue
+     */
+    private class RegisteredSignal extends AbstractSignal
+    {
+        private volatile Thread thread = Thread.currentThread();
+        volatile int state;
+
+        public boolean isSignalled()
+        {
+            return state == SIGNALLED;
+        }
+
+        public boolean isCancelled()
+        {
+            return state == CANCELLED;
+        }
+
+        public boolean isSet()
+        {
+            return state != NOT_SET;
+        }
+
+        private Thread signal()
+        {
+            if (!isSet() && signalledUpdater.compareAndSet(this, NOT_SET, SIGNALLED))
+            {
+                Thread thread = this.thread;
+                LockSupport.unpark(thread);
+                this.thread = null;
+                return thread;
+            }
+            return null;
+        }
+
+        public boolean checkAndClear()
+        {
+            if (!isSet() && signalledUpdater.compareAndSet(this, NOT_SET, CANCELLED))
+            {
+                thread = null;
+                cleanUpCancelled();
+                return false;
+            }
+            // must now be signalled assuming correct API usage
+            return true;
+        }
+
+        /**
+         * Should only be called by the registered thread. Indicates the signal can be retired,
+         * and if signalled propagates the signal to another waiting thread
+         */
+        public void cancel()
+        {
+            if (isCancelled())
+                return;
+            if (!signalledUpdater.compareAndSet(this, NOT_SET, CANCELLED))
+            {
+                // must already be signalled - switch to cancelled and propagate the signal to another waiter
+                state = CANCELLED;
+                // propagate the signal
+                WaitQueue.this.signal();
+            }
+            thread = null;
+            cleanUpCancelled();
+        }
+    }
+
+    /**
+     * A RegisteredSignal that stores a TimerContext, and stops the timer when either cancelled or
+     * finished waiting. i.e. if the timer is started when the signal is registered it tracks the
+     * time in between registering and invalidating the signal.
+     */
+    private final class TimedSignal extends RegisteredSignal
+    {
+        private final TimerContext context;
+
+        private TimedSignal(TimerContext context)
+        {
+            this.context = context;
+        }
+
+        @Override
+        public boolean checkAndClear()
+        {
+            context.stop();
+            return super.checkAndClear();
+        }
+
+        @Override
+        public void cancel()
+        {
+            if (!isCancelled())
+            {
+                context.stop();
+                super.cancel();
+            }
+        }
+    }
+
+    /**
+     * An abstract signal wrapping multiple delegate signals
+     */
+    private abstract static class MultiSignal extends AbstractSignal
+    {
+        final Signal[] signals;
+        protected MultiSignal(Signal[] signals)
+        {
+            this.signals = signals;
+        }
+
+        public boolean isCancelled()
+        {
+            for (Signal signal : signals)
+                if (!signal.isCancelled())
+                    return false;
+            return true;
+        }
+
+        public boolean checkAndClear()
+        {
+            for (Signal signal : signals)
+                signal.checkAndClear();
+            return isSignalled();
+        }
+
+        public void cancel()
+        {
+            for (Signal signal : signals)
+                signal.cancel();
+        }
+    }
+
+    /**
+     * A Signal that wraps multiple Signals and returns when any single one of them would have returned
+     */
+    private static class AnySignal extends MultiSignal
+    {
+        protected AnySignal(Signal ... signals)
+        {
+            super(signals);
+        }
+
+        public boolean isSignalled()
+        {
+            for (Signal signal : signals)
+                if (signal.isSignalled())
+                    return true;
+            return false;
+        }
+
+        public boolean isSet()
+        {
+            for (Signal signal : signals)
+                if (signal.isSet())
+                    return true;
+            return false;
+        }
+    }
+
+    /**
+     * A Signal that wraps multiple Signals and returns only when all of them would have returned
+     */
+    private static class AllSignal extends MultiSignal
+    {
+        protected AllSignal(Signal ... signals)
+        {
+            super(signals);
+        }
+
+        public boolean isSignalled()
+        {
+            for (Signal signal : signals)
+                if (!signal.isSignalled())
+                    return false;
+            return true;
+        }
+
+        public boolean isSet()
+        {
+            for (Signal signal : signals)
+                if (!signal.isSet())
+                    return false;
+            return true;
+        }
+    }
+
+    /**
+     * @param signals the Signals to combine
+     * @return a signal that returns only when any of the provided signals would have returned
+     */
+    public static Signal any(Signal ... signals)
+    {
+        return new AnySignal(signals);
+    }
+
+    /**
+     * @param signals the Signals to combine
+     * @return a signal that returns only when all provided signals would have returned
+     */
+    public static Signal all(Signal ... signals)
+    {
+        return new AllSignal(signals);
+    }
+}
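For context, a minimal sketch of the register / re-check / await idiom this WaitQueue is designed for. The ReadyLatch class and its ready flag are hypothetical illustrations, not part of this patch:

    import org.apache.cassandra.utils.concurrent.WaitQueue;

    class ReadyLatch
    {
        private final WaitQueue queue = new WaitQueue();
        private volatile boolean ready = false;

        // Waiter: re-check the condition after registering, since a signal may race ahead of registration.
        void awaitReady() throws InterruptedException
        {
            while (!ready)
            {
                WaitQueue.Signal signal = queue.register();
                if (ready)
                    signal.cancel();   // condition already satisfied; retire the signal (propagating it if signalled)
                else
                    signal.await();    // park until signal()/signalAll(), or throw if interrupted
            }
        }

        // Producer: change the condition first, then wake every registered waiter.
        void markReady()
        {
            ready = true;
            queue.signalAll();
        }
    }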
diff --git a/src/java/org/apache/cassandra/utils/Allocator.java b/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java
similarity index 84%
rename from src/java/org/apache/cassandra/utils/Allocator.java
rename to src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java
index 7134353..0735d6e 100644
--- a/src/java/org/apache/cassandra/utils/Allocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java
@@ -15,11 +15,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.memory;
 
 import java.nio.ByteBuffer;
 
-public abstract class Allocator
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class AbstractAllocator
 {
     /**
      * Allocate a slice of the given length.
@@ -27,6 +29,8 @@
     public ByteBuffer clone(ByteBuffer buffer)
     {
         assert buffer != null;
+        if (buffer.remaining() == 0)
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
         ByteBuffer cloned = allocate(buffer.remaining());
 
         cloned.mark();
@@ -36,6 +40,4 @@
     }
 
     public abstract ByteBuffer allocate(int size);
-
-    public abstract long getMinimumSize();
 }
diff --git a/src/java/org/apache/cassandra/utils/memory/ContextAllocator.java b/src/java/org/apache/cassandra/utils/memory/ContextAllocator.java
new file mode 100644
index 0000000..62c8f9c
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/ContextAllocator.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Wraps calls to a MemtableBufferAllocator with the provided OpOrder.Group, so that the simple
+ * AbstractAllocator API can be used while allocations are still performed under that operation
+ */
+public final class ContextAllocator extends AbstractAllocator
+{
+    private final OpOrder.Group opGroup;
+    private final MemtableBufferAllocator allocator;
+
+    public ContextAllocator(OpOrder.Group opGroup, MemtableBufferAllocator allocator)
+    {
+        this.opGroup = opGroup;
+        this.allocator = allocator;
+    }
+
+    @Override
+    public ByteBuffer clone(ByteBuffer buffer)
+    {
+        assert buffer != null;
+        if (buffer.remaining() == 0)
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        ByteBuffer cloned = allocate(buffer.remaining());
+
+        cloned.mark();
+        cloned.put(buffer.duplicate());
+        cloned.reset();
+        return cloned;
+    }
+
+    public ByteBuffer allocate(int size)
+    {
+        return allocator.allocate(size, opGroup);
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/HeapAllocator.java b/src/java/org/apache/cassandra/utils/memory/HeapAllocator.java
similarity index 86%
rename from src/java/org/apache/cassandra/utils/HeapAllocator.java
rename to src/java/org/apache/cassandra/utils/memory/HeapAllocator.java
index 3d19b98..41877f5 100644
--- a/src/java/org/apache/cassandra/utils/HeapAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/HeapAllocator.java
@@ -15,11 +15,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.memory;
 
 import java.nio.ByteBuffer;
 
-public final class HeapAllocator extends Allocator
+public final class HeapAllocator extends AbstractAllocator
 {
     public static final HeapAllocator instance = new HeapAllocator();
 
@@ -27,15 +27,10 @@
      * Normally you should use HeapAllocator.instance, since there is no per-Allocator state.
      * This is exposed so that the reflection done by Memtable works when SlabAllocator is disabled.
      */
-    public HeapAllocator() {}
+    private HeapAllocator() {}
 
     public ByteBuffer allocate(int size)
     {
         return ByteBuffer.allocate(size);
     }
-
-    public long getMinimumSize()
-    {
-        return 0;
-    }
 }
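Since HeapAllocator is now a singleton, the inherited AbstractAllocator.clone is the usual way to take a defensive on-heap copy of a buffer. A minimal sketch (the CloneSketch wrapper is hypothetical):

    import java.nio.ByteBuffer;

    import org.apache.cassandra.utils.memory.HeapAllocator;

    class CloneSketch
    {
        static ByteBuffer copyOf(ByteBuffer source)
        {
            // clone() allocates source.remaining() bytes on the JVM heap and copies the readable bytes
            // through a duplicate, leaving the source buffer's position/limit untouched; empty buffers
            // short-circuit to the shared EMPTY_BYTE_BUFFER introduced above.
            return HeapAllocator.instance.clone(source);
        }
    }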
diff --git a/src/java/org/apache/cassandra/utils/memory/HeapPool.java b/src/java/org/apache/cassandra/utils/memory/HeapPool.java
new file mode 100644
index 0000000..a04947c
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/HeapPool.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+public class HeapPool extends MemtablePool
+{
+    public HeapPool(long maxOnHeapMemory, float cleanupThreshold, Runnable cleaner)
+    {
+        super(maxOnHeapMemory, 0, cleanupThreshold, cleaner);
+    }
+
+    public boolean needToCopyOnHeap()
+    {
+        return false;
+    }
+
+    public MemtableAllocator newAllocator()
+    {
+        return new Allocator(this);
+    }
+
+    public class Allocator extends MemtableBufferAllocator
+    {
+        Allocator(HeapPool pool)
+        {
+            super(pool.onHeap.newAllocator(), pool.offHeap.newAllocator());
+        }
+
+        public ByteBuffer allocate(int size, OpOrder.Group opGroup)
+        {
+            super.onHeap().allocate(size, opGroup);
+            return ByteBuffer.allocate(size);
+        }
+
+        public DataReclaimer reclaimer()
+        {
+            return new Reclaimer();
+        }
+
+        private class Reclaimer implements DataReclaimer
+        {
+            List<Cell> delayed;
+
+            public Reclaimer reclaim(Cell cell)
+            {
+                if (delayed == null)
+                    delayed = new ArrayList<>();
+                delayed.add(cell);
+                return this;
+            }
+
+            public Reclaimer reclaimImmediately(Cell cell)
+            {
+                onHeap().release(cell.name().dataSize() + cell.value().remaining());
+                return this;
+            }
+
+            public Reclaimer reclaimImmediately(DecoratedKey key)
+            {
+                onHeap().release(key.getKey().remaining());
+                return this;
+            }
+
+            public void cancel()
+            {
+                if (delayed != null)
+                    delayed.clear();
+            }
+
+            public void commit()
+            {
+                if (delayed != null)
+                    for (Cell cell : delayed)
+                        reclaimImmediately(cell);
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java
new file mode 100644
index 0000000..129a60b
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java
@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.lang.reflect.Field;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import com.google.common.primitives.*;
+import sun.misc.Unsafe;
+
+public abstract class MemoryUtil
+{
+    private static final long UNSAFE_COPY_THRESHOLD = 1024 * 1024L; // copied from java.nio.Bits
+
+    private static final Unsafe unsafe;
+    private static final Class<?> DIRECT_BYTE_BUFFER_CLASS;
+    private static final long DIRECT_BYTE_BUFFER_ADDRESS_OFFSET;
+    private static final long DIRECT_BYTE_BUFFER_CAPACITY_OFFSET;
+    private static final long DIRECT_BYTE_BUFFER_LIMIT_OFFSET;
+    private static final long BYTE_ARRAY_BASE_OFFSET;
+
+    private static final boolean BIG_ENDIAN = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
+    private static final boolean UNALIGNED;
+    public static final boolean INVERTED_ORDER;
+
+    static
+    {
+        String arch = System.getProperty("os.arch");
+        UNALIGNED = arch.equals("i386") || arch.equals("x86")
+                || arch.equals("amd64") || arch.equals("x86_64");
+        INVERTED_ORDER = UNALIGNED && !BIG_ENDIAN;
+        try
+        {
+            Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
+            field.setAccessible(true);
+            unsafe = (sun.misc.Unsafe) field.get(null);
+            Class<?> clazz = ByteBuffer.allocateDirect(0).getClass();
+            DIRECT_BYTE_BUFFER_ADDRESS_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("address"));
+            DIRECT_BYTE_BUFFER_CAPACITY_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("capacity"));
+            DIRECT_BYTE_BUFFER_LIMIT_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("limit"));
+            DIRECT_BYTE_BUFFER_CLASS = clazz;
+            BYTE_ARRAY_BASE_OFFSET = unsafe.arrayBaseOffset(byte[].class);
+        }
+        catch (Exception e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    public static void setByte(long address, byte b)
+    {
+        unsafe.putByte(address, b);
+    }
+
+    public static void setShort(long address, short s)
+    {
+        unsafe.putShort(address, s);
+    }
+
+    public static void setInt(long address, int l)
+    {
+        if (UNALIGNED)
+            unsafe.putInt(address, l);
+        else
+            putIntByByte(address, l);
+    }
+
+    public static void setLong(long address, long l)
+    {
+        if (UNALIGNED)
+            unsafe.putLong(address, l);
+        else
+            putLongByByte(address, l);
+    }
+
+    public static byte getByte(long address)
+    {
+        return unsafe.getByte(address);
+    }
+
+    public static int getShort(long address)
+    {
+        return UNALIGNED ? unsafe.getShort(address) : getShortByByte(address);
+    }
+
+    public static int getInt(long address)
+    {
+        return UNALIGNED ? unsafe.getInt(address) : getIntByByte(address);
+    }
+
+    public static long getLong(long address)
+    {
+        return UNALIGNED ? unsafe.getLong(address) : getLongByByte(address);
+    }
+
+    public static ByteBuffer getByteBuffer(long address, int length)
+    {
+        ByteBuffer instance;
+        try
+        {
+            instance = (ByteBuffer) unsafe.allocateInstance(DIRECT_BYTE_BUFFER_CLASS);
+        }
+        catch (InstantiationException e)
+        {
+            throw new AssertionError(e);
+        }
+
+        unsafe.putLong(instance, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET, address);
+        unsafe.putInt(instance, DIRECT_BYTE_BUFFER_CAPACITY_OFFSET, length);
+        unsafe.putInt(instance, DIRECT_BYTE_BUFFER_LIMIT_OFFSET, length);
+        instance.order(ByteOrder.nativeOrder());
+        return instance;
+    }
+
+    public static long getLongByByte(long address)
+    {
+        if (BIG_ENDIAN)
+        {
+            return  (((long) unsafe.getByte(address    )       ) << 56) |
+                    (((long) unsafe.getByte(address + 1) & 0xff) << 48) |
+                    (((long) unsafe.getByte(address + 2) & 0xff) << 40) |
+                    (((long) unsafe.getByte(address + 3) & 0xff) << 32) |
+                    (((long) unsafe.getByte(address + 4) & 0xff) << 24) |
+                    (((long) unsafe.getByte(address + 5) & 0xff) << 16) |
+                    (((long) unsafe.getByte(address + 6) & 0xff) <<  8) |
+                    (((long) unsafe.getByte(address + 7) & 0xff)      );
+        }
+        else
+        {
+            return  (((long) unsafe.getByte(address + 7)       ) << 56) |
+                    (((long) unsafe.getByte(address + 6) & 0xff) << 48) |
+                    (((long) unsafe.getByte(address + 5) & 0xff) << 40) |
+                    (((long) unsafe.getByte(address + 4) & 0xff) << 32) |
+                    (((long) unsafe.getByte(address + 3) & 0xff) << 24) |
+                    (((long) unsafe.getByte(address + 2) & 0xff) << 16) |
+                    (((long) unsafe.getByte(address + 1) & 0xff) <<  8) |
+                    (((long) unsafe.getByte(address    ) & 0xff)      );
+        }
+    }
+
+    public static int getIntByByte(long address)
+    {
+        if (BIG_ENDIAN)
+        {
+            return  (((int) unsafe.getByte(address    )       ) << 24) |
+                    (((int) unsafe.getByte(address + 1) & 0xff) << 16) |
+                    (((int) unsafe.getByte(address + 2) & 0xff) << 8 ) |
+                    (((int) unsafe.getByte(address + 3) & 0xff)      );
+        }
+        else
+        {
+            return  (((int) unsafe.getByte(address + 3)       ) << 24) |
+                    (((int) unsafe.getByte(address + 2) & 0xff) << 16) |
+                    (((int) unsafe.getByte(address + 1) & 0xff) <<  8) |
+                    (((int) unsafe.getByte(address    ) & 0xff)      );
+        }
+    }
+
+
+    public static int getShortByByte(long address)
+    {
+        if (BIG_ENDIAN)
+        {
+            return  (((int) unsafe.getByte(address    )       ) << 8) |
+                    (((int) unsafe.getByte(address + 1) & 0xff)     );
+        }
+        else
+        {
+            return  (((int) unsafe.getByte(address + 1)       ) <<  8) |
+                    (((int) unsafe.getByte(address    ) & 0xff)      );
+        }
+    }
+
+    public static void putLongByByte(long address, long value)
+    {
+        if (BIG_ENDIAN)
+        {
+            unsafe.putByte(address, (byte) (value >> 56));
+            unsafe.putByte(address + 1, (byte) (value >> 48));
+            unsafe.putByte(address + 2, (byte) (value >> 40));
+            unsafe.putByte(address + 3, (byte) (value >> 32));
+            unsafe.putByte(address + 4, (byte) (value >> 24));
+            unsafe.putByte(address + 5, (byte) (value >> 16));
+            unsafe.putByte(address + 6, (byte) (value >> 8));
+            unsafe.putByte(address + 7, (byte) (value));
+        }
+        else
+        {
+            unsafe.putByte(address + 7, (byte) (value >> 56));
+            unsafe.putByte(address + 6, (byte) (value >> 48));
+            unsafe.putByte(address + 5, (byte) (value >> 40));
+            unsafe.putByte(address + 4, (byte) (value >> 32));
+            unsafe.putByte(address + 3, (byte) (value >> 24));
+            unsafe.putByte(address + 2, (byte) (value >> 16));
+            unsafe.putByte(address + 1, (byte) (value >> 8));
+            unsafe.putByte(address, (byte) (value));
+        }
+    }
+
+    public static void putIntByByte(long address, int value)
+    {
+        if (BIG_ENDIAN)
+        {
+            unsafe.putByte(address, (byte) (value >> 24));
+            unsafe.putByte(address + 1, (byte) (value >> 16));
+            unsafe.putByte(address + 2, (byte) (value >> 8));
+            unsafe.putByte(address + 3, (byte) (value));
+        }
+        else
+        {
+            unsafe.putByte(address + 3, (byte) (value >> 24));
+            unsafe.putByte(address + 2, (byte) (value >> 16));
+            unsafe.putByte(address + 1, (byte) (value >> 8));
+            unsafe.putByte(address, (byte) (value));
+        }
+    }
+
+    public static void setBytes(long address, ByteBuffer buffer)
+    {
+        int start = buffer.position();
+        int count = buffer.limit() - start;
+        if (count == 0)
+            return;
+
+        if (buffer.isDirect())
+            setBytes(unsafe.getLong(buffer, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET) + start, address, count);
+        else
+            setBytes(address, buffer.array(), start, count);
+    }
+
+    /**
+     * Transfers count bytes from buffer to Memory
+     *
+     * @param address start offset in the memory
+     * @param buffer the data buffer
+     * @param bufferOffset start offset of the buffer
+     * @param count number of bytes to transfer
+     */
+    public static void setBytes(long address, byte[] buffer, int bufferOffset, int count)
+    {
+        assert buffer != null;
+        assert !(bufferOffset < 0 || count < 0 || bufferOffset + count > buffer.length);
+        setBytes(buffer, bufferOffset, address, count);
+    }
+
+    public static void setBytes(long src, long trg, long count)
+    {
+        while (count > 0)
+        {
+            long size = (count > UNSAFE_COPY_THRESHOLD) ? UNSAFE_COPY_THRESHOLD : count;
+            unsafe.copyMemory(src, trg, size);
+            count -= size;
+            src += size;
+            trg += size;
+        }
+    }
+
+    public static void setBytes(byte[] src, int offset, long trg, long count)
+    {
+        while (count > 0)
+        {
+            long size = (count > UNSAFE_COPY_THRESHOLD) ? UNSAFE_COPY_THRESHOLD : count;
+            unsafe.copyMemory(src, BYTE_ARRAY_BASE_OFFSET + offset, null, trg, size);
+            count -= size;
+            offset += size;
+            trg += size;
+        }
+    }
+
+    /**
+     * Transfers count bytes from Memory starting at memoryOffset to buffer starting at bufferOffset
+     *
+     * @param address start offset in the memory
+     * @param buffer the data buffer
+     * @param bufferOffset start offset of the buffer
+     * @param count number of bytes to transfer
+     */
+    public static void getBytes(long address, byte[] buffer, int bufferOffset, int count)
+    {
+        if (buffer == null)
+            throw new NullPointerException();
+        else if (bufferOffset < 0 || count < 0 || count > buffer.length - bufferOffset)
+            throw new IndexOutOfBoundsException();
+        else if (count == 0)
+            return;
+
+        unsafe.copyMemory(null, address, buffer, BYTE_ARRAY_BASE_OFFSET + bufferOffset, count);
+    }
+}
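A minimal round-trip sketch of the MemoryUtil accessors. MemoryUtil itself never allocates, so the reflective Unsafe allocation below is purely an illustrative assumption:

    import java.lang.reflect.Field;

    import org.apache.cassandra.utils.memory.MemoryUtil;
    import sun.misc.Unsafe;

    class MemoryUtilSketch
    {
        public static void main(String[] args) throws Exception
        {
            // Obtain Unsafe the same way MemoryUtil does, only to allocate a scratch block for the demo.
            Field field = Unsafe.class.getDeclaredField("theUnsafe");
            field.setAccessible(true);
            Unsafe unsafe = (Unsafe) field.get(null);

            long address = unsafe.allocateMemory(16);
            try
            {
                MemoryUtil.setLong(address, 0xCAFEBABEL);   // aligned fast path, or byte-by-byte on strict platforms
                MemoryUtil.setInt(address + 8, 42);
                assert MemoryUtil.getLong(address) == 0xCAFEBABEL;
                assert MemoryUtil.getInt(address + 8) == 42;

                byte[] copy = new byte[16];
                MemoryUtil.getBytes(address, copy, 0, 16);  // native memory -> byte[]
                MemoryUtil.setBytes(address, copy, 0, 16);  // byte[] -> native memory
            }
            finally
            {
                unsafe.freeMemory(address);
            }
        }
    }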
diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java b/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java
new file mode 100644
index 0000000..e814b4d
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+
+public abstract class MemtableAllocator
+{
+    private final SubAllocator onHeap;
+    private final SubAllocator offHeap;
+    volatile LifeCycle state = LifeCycle.LIVE;
+
+    static enum LifeCycle
+    {
+        LIVE, DISCARDING, DISCARDED;
+        LifeCycle transition(LifeCycle targetState)
+        {
+            switch (targetState)
+            {
+                case DISCARDING:
+                    assert this == LifeCycle.LIVE;
+                    return LifeCycle.DISCARDING;
+
+                case DISCARDED:
+                    assert this == LifeCycle.DISCARDING;
+                    return LifeCycle.DISCARDED;
+
+                default:
+                    throw new IllegalStateException();
+            }
+        }
+    }
+
+    MemtableAllocator(SubAllocator onHeap, SubAllocator offHeap)
+    {
+        this.onHeap = onHeap;
+        this.offHeap = offHeap;
+    }
+
+    public abstract Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp);
+    public abstract CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp);
+    public abstract DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp);
+    public abstract ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp);
+    public abstract DecoratedKey clone(DecoratedKey key, OpOrder.Group opGroup);
+    public abstract DataReclaimer reclaimer();
+
+    public SubAllocator onHeap()
+    {
+        return onHeap;
+    }
+
+    public SubAllocator offHeap()
+    {
+        return offHeap;
+    }
+
+    /**
+     * Mark this allocator reclaiming; this will permit any outstanding allocations to temporarily
+     * overshoot the maximum memory limit so that flushing can begin immediately
+     */
+    public void setDiscarding()
+    {
+        state = state.transition(LifeCycle.DISCARDING);
+        // mark the memory owned by this allocator as reclaiming
+        onHeap.markAllReclaiming();
+        offHeap.markAllReclaiming();
+    }
+
+    /**
+     * Indicate the memory and resources owned by this allocator are no longer referenced,
+     * and can be reclaimed/reused.
+     */
+    public void setDiscarded()
+    {
+        state = state.transition(LifeCycle.DISCARDED);
+        // release any memory owned by this allocator; automatically signals waiters
+        onHeap.releaseAll();
+        offHeap.releaseAll();
+    }
+
+    public boolean isLive()
+    {
+        return state == LifeCycle.LIVE;
+    }
+
+    public static interface DataReclaimer
+    {
+        public DataReclaimer reclaim(Cell cell);
+        public DataReclaimer reclaimImmediately(Cell cell);
+        public DataReclaimer reclaimImmediately(DecoratedKey key);
+        public void cancel();
+        public void commit();
+    }
+
+    public static final DataReclaimer NO_OP = new DataReclaimer()
+    {
+        public DataReclaimer reclaim(Cell cell)
+        {
+            return this;
+        }
+
+        public DataReclaimer reclaimImmediately(Cell cell)
+        {
+            return this;
+        }
+
+        public DataReclaimer reclaimImmediately(DecoratedKey key)
+        {
+            return this;
+        }
+
+        @Override
+        public void cancel()
+        {}
+
+        @Override
+        public void commit()
+        {}
+    };
+
+    /** Tracks memory owned by an individual allocator from a parent pool, and the portion of it being reclaimed */
+    public static final class SubAllocator
+    {
+        // the tracker we are owning memory from
+        private final MemtablePool.SubPool parent;
+
+        // the amount of memory/resource owned by this object
+        private volatile long owns;
+        // the amount of memory we are reporting to collect; this may be inaccurate, but is close
+        // and is used only to ensure that once we have reclaimed we mark the tracker with the same amount
+        private volatile long reclaiming;
+
+        SubAllocator(MemtablePool.SubPool parent)
+        {
+            this.parent = parent;
+        }
+
+        // should only be called once we know we will never allocate to the object again.
+        // currently no corroboration/enforcement of this is performed.
+        void releaseAll()
+        {
+            parent.adjustAcquired(-ownsUpdater.getAndSet(this, 0), false);
+            parent.adjustReclaiming(-reclaimingUpdater.getAndSet(this, 0));
+        }
+
+        // allocate memory in the tracker, and mark ourselves as owning it
+        public void allocate(long size, OpOrder.Group opGroup)
+        {
+            while (true)
+            {
+                if (parent.tryAllocate(size))
+                {
+                    acquired(size);
+                    return;
+                }
+                WaitQueue.Signal signal = opGroup.isBlockingSignal(parent.hasRoom().register());
+                boolean allocated = parent.tryAllocate(size);
+                if (allocated || opGroup.isBlocking())
+                {
+                    signal.cancel();
+                    if (allocated) // if we allocated, take ownership
+                        acquired(size);
+                    else // otherwise we're blocking so we're permitted to overshoot our constraints, to just allocate without blocking
+                        allocated(size);
+                    return;
+                }
+                else
+                    signal.awaitUninterruptibly();
+            }
+        }
+
+        // retroactively mark an amount allocated and acquired in the tracker, and owned by us
+        void allocated(long size)
+        {
+            parent.adjustAcquired(size, true);
+            ownsUpdater.addAndGet(this, size);
+        }
+
+        // retroactively mark an amount acquired in the tracker, and owned by us
+        void acquired(long size)
+        {
+            parent.adjustAcquired(size, false);
+            ownsUpdater.addAndGet(this, size);
+        }
+
+        void release(long size)
+        {
+            parent.adjustAcquired(-size, false);
+            ownsUpdater.addAndGet(this, -size);
+        }
+
+        // mark everything we currently own as reclaiming, both here and in our parent
+        void markAllReclaiming()
+        {
+            while (true)
+            {
+                long cur = owns;
+                long prev = reclaiming;
+                if (reclaimingUpdater.compareAndSet(this, prev, cur))
+                {
+                    parent.adjustReclaiming(cur - prev);
+                    return;
+                }
+            }
+        }
+
+        public long owns()
+        {
+            return owns;
+        }
+
+        public float ownershipRatio()
+        {
+            float r = owns / (float) parent.limit;
+            if (Float.isNaN(r))
+                return 0;
+            return r;
+        }
+
+        private static final AtomicLongFieldUpdater<SubAllocator> ownsUpdater = AtomicLongFieldUpdater.newUpdater(SubAllocator.class, "owns");
+        private static final AtomicLongFieldUpdater<SubAllocator> reclaimingUpdater = AtomicLongFieldUpdater.newUpdater(SubAllocator.class, "reclaiming");
+    }
+
+}
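A minimal sketch of the intended allocator lifecycle (LIVE -> DISCARDING -> DISCARDED), assuming a hypothetical HeapPool with a 1 MiB on-heap limit and no cleaner thread:

    import org.apache.cassandra.utils.memory.HeapPool;
    import org.apache.cassandra.utils.memory.MemtableAllocator;

    class LifecycleSketch
    {
        static void flushSequence()
        {
            HeapPool pool = new HeapPool(1 << 20, 0.75f, null);
            MemtableAllocator allocator = pool.newAllocator();

            // ... writes clone cells and keys through the allocator while it is LIVE ...

            allocator.setDiscarding();   // LIVE -> DISCARDING: owned memory is marked reclaiming so flushing can begin
            assert !allocator.isLive();

            // ... once the flush completes and nothing references the memtable data any more ...

            allocator.setDiscarded();    // DISCARDING -> DISCARDED: owned memory is released and waiters are signalled
        }
    }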
diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java b/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java
new file mode 100644
index 0000000..7034d76
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.CounterCell;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletedCell;
+import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+public abstract class MemtableBufferAllocator extends MemtableAllocator
+{
+
+    protected MemtableBufferAllocator(SubAllocator onHeap, SubAllocator offHeap)
+    {
+        super(onHeap, offHeap);
+    }
+
+    public Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return cell.localCopy(cfm, allocator(writeOp));
+    }
+
+    public CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return cell.localCopy(cfm, allocator(writeOp));
+    }
+
+    public DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return cell.localCopy(cfm, allocator(writeOp));
+    }
+
+    public ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return cell.localCopy(cfm, allocator(writeOp));
+    }
+
+    public DecoratedKey clone(DecoratedKey key, OpOrder.Group writeOp)
+    {
+        return new BufferDecoratedKey(key.getToken(), allocator(writeOp).clone(key.getKey()));
+    }
+
+    public abstract ByteBuffer allocate(int size, OpOrder.Group opGroup);
+
+    protected AbstractAllocator allocator(OpOrder.Group writeOp)
+    {
+        return new ContextAllocator(writeOp, this);
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java b/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java
new file mode 100644
index 0000000..5a90463
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+
+/**
+ * A thread that reclaims memory from a MemtablePool on demand.  The actual reclaiming work is delegated to the
+ * cleaner Runnable, e.g., FlushLargestColumnFamily
+ */
+class MemtableCleanerThread<P extends MemtablePool> extends Thread
+{
+    /** The pool we're cleaning */
+    final P pool;
+
+    /** should ensure that at least some memory has been marked reclaiming after completion */
+    final Runnable cleaner;
+
+    /** signalled whenever needsCleaning() may return true */
+    final WaitQueue wait = new WaitQueue();
+
+    MemtableCleanerThread(P pool, Runnable cleaner)
+    {
+        super(pool.getClass().getSimpleName() + "Cleaner");
+        this.pool = pool;
+        this.cleaner = cleaner;
+        setDaemon(true);
+    }
+
+    boolean needsCleaning()
+    {
+        return pool.onHeap.needsCleaning() || pool.offHeap.needsCleaning();
+    }
+
+    // should ONLY be called when we really think it already needs cleaning
+    void trigger()
+    {
+        wait.signal();
+    }
+
+    @Override
+    public void run()
+    {
+        while (true)
+        {
+            while (!needsCleaning())
+            {
+                final WaitQueue.Signal signal = wait.register();
+                if (!needsCleaning())
+                    signal.awaitUninterruptibly();
+                else
+                    signal.cancel();
+            }
+
+            cleaner.run();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/memory/MemtablePool.java b/src/java/org/apache/cassandra/utils/memory/MemtablePool.java
new file mode 100644
index 0000000..1d219bb
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/MemtablePool.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+
+
+/**
+ * Represents an amount of memory used for a given purpose, that can be allocated to specific tasks through
+ * child MemtableAllocator objects.
+ */
+public abstract class MemtablePool
+{
+    final MemtableCleanerThread<?> cleaner;
+
+    // the total memory used by this pool
+    public final SubPool onHeap;
+    public final SubPool offHeap;
+
+    final WaitQueue hasRoom = new WaitQueue();
+
+    MemtablePool(long maxOnHeapMemory, long maxOffHeapMemory, float cleanThreshold, Runnable cleaner)
+    {
+        this.onHeap = getSubPool(maxOnHeapMemory, cleanThreshold);
+        this.offHeap = getSubPool(maxOffHeapMemory, cleanThreshold);
+        this.cleaner = getCleaner(cleaner);
+        if (this.cleaner != null)
+            this.cleaner.start();
+    }
+
+    SubPool getSubPool(long limit, float cleanThreshold)
+    {
+        return new SubPool(limit, cleanThreshold);
+    }
+
+    MemtableCleanerThread<?> getCleaner(Runnable cleaner)
+    {
+        return cleaner == null ? null : new MemtableCleanerThread<>(this, cleaner);
+    }
+
+    public abstract boolean needToCopyOnHeap();
+    public abstract MemtableAllocator newAllocator();
+
+    /**
+     * Note the difference between acquire() and allocate(); allocate() makes more resources available to all owners,
+     * and acquire() makes shared resources unavailable but still recorded. An Owner must always acquire resources,
+     * but only needs to allocate if there are none already available. This distinction is not always meaningful.
+     */
+    public class SubPool
+    {
+
+        // total memory/resource permitted to allocate
+        public final long limit;
+
+        // ratio of used to spare (both excluding 'reclaiming') at which to trigger a clean
+        public final float cleanThreshold;
+
+        // total bytes allocated and reclaiming
+        volatile long allocated;
+        volatile long reclaiming;
+
+        // a cache of the calculation determining at what allocation threshold we should next clean
+        volatile long nextClean;
+
+        public SubPool(long limit, float cleanThreshold)
+        {
+            this.limit = limit;
+            this.cleanThreshold = cleanThreshold;
+        }
+
+        /** Methods for tracking and triggering a clean **/
+
+        boolean needsCleaning()
+        {
+            // use strictly-greater-than so we don't clean when limit is 0
+            return used() > nextClean && updateNextClean();
+        }
+
+        void maybeClean()
+        {
+            if (needsCleaning() && cleaner != null)
+                cleaner.trigger();
+        }
+
+        private boolean updateNextClean()
+        {
+            while (true)
+            {
+                long current = nextClean;
+                long reclaiming = this.reclaiming;
+                long next =  reclaiming + (long) (this.limit * cleanThreshold);
+                if (current == next || nextCleanUpdater.compareAndSet(this, current, next))
+                    return used() > next;
+            }
+        }
+
+        /** Methods to allocate space **/
+
+        boolean tryAllocate(long size)
+        {
+            while (true)
+            {
+                long cur;
+                if ((cur = allocated) + size > limit)
+                    return false;
+                if (allocatedUpdater.compareAndSet(this, cur, cur + size))
+                    return true;
+            }
+        }
+
+        /**
+         * apply the size adjustment to allocated, bypassing any limits or constraints. If this reduces the
+         * allocated total, we will signal waiters
+         */
+        void adjustAllocated(long size)
+        {
+            if (size == 0)
+                return;
+            while (true)
+            {
+                long cur = allocated;
+                if (allocatedUpdater.compareAndSet(this, cur, cur + size))
+                    return;
+            }
+        }
+
+        // 'acquires' an amount of memory, and maybe also marks it allocated. This method is meant to be overridden
+        // by implementations with a separate concept of acquired/allocated. As this method stands, an acquire
+        // without an allocate is a no-op (acquisition is achieved through allocation), however a release (where size < 0)
+        // is always processed and accounted for in allocated.
+        void adjustAcquired(long size, boolean alsoAllocated)
+        {
+            if (size > 0 || alsoAllocated)
+            {
+                if (alsoAllocated)
+                    adjustAllocated(size);
+                maybeClean();
+            }
+            else if (size < 0)
+            {
+                adjustAllocated(size);
+                hasRoom.signalAll();
+            }
+        }
+
+        // space reclaimed should be released prior to calling this, to avoid triggering unnecessary cleans
+        void adjustReclaiming(long reclaiming)
+        {
+            if (reclaiming == 0)
+                return;
+            reclaimingUpdater.addAndGet(this, reclaiming);
+            if (reclaiming < 0 && updateNextClean() && cleaner != null)
+                cleaner.trigger();
+        }
+
+        public long allocated()
+        {
+            return allocated;
+        }
+
+        public long used()
+        {
+            return allocated;
+        }
+
+        public MemtableAllocator.SubAllocator newAllocator()
+        {
+            return new MemtableAllocator.SubAllocator(this);
+        }
+
+        public WaitQueue hasRoom()
+        {
+            return hasRoom;
+        }
+    }
+
+    private static final AtomicLongFieldUpdater<SubPool> reclaimingUpdater = AtomicLongFieldUpdater.newUpdater(SubPool.class, "reclaiming");
+    private static final AtomicLongFieldUpdater<SubPool> allocatedUpdater = AtomicLongFieldUpdater.newUpdater(SubPool.class, "allocated");
+    private static final AtomicLongFieldUpdater<SubPool> nextCleanUpdater = AtomicLongFieldUpdater.newUpdater(SubPool.class, "nextClean");
+
+}
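A worked example of the SubPool clean-threshold arithmetic, with hypothetical numbers:

    class CleanThresholdSketch
    {
        public static void main(String[] args)
        {
            long limit = 1 << 20;              // 1 MiB sub-pool limit
            float cleanThreshold = 0.75f;
            long used = 800_000;               // bytes currently allocated

            // nextClean = reclaiming + limit * cleanThreshold, as in updateNextClean()
            long reclaiming = 0;
            long nextClean = reclaiming + (long) (limit * cleanThreshold);   // 786_432
            System.out.println(used > nextClean);                            // true: needsCleaning(), cleaner triggered

            // once ~300 KiB is marked reclaiming the threshold rises, so the same usage no longer triggers
            reclaiming = 300 * 1024;
            nextClean = reclaiming + (long) (limit * cleanThreshold);        // 1_093_632
            System.out.println(used > nextClean);                            // false
        }
    }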
diff --git a/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
new file mode 100644
index 0000000..1b5dcf2
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.lang.reflect.Field;
+
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.CounterCell;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletedCell;
+import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.db.NativeCell;
+import org.apache.cassandra.db.NativeCounterCell;
+import org.apache.cassandra.db.NativeDecoratedKey;
+import org.apache.cassandra.db.NativeDeletedCell;
+import org.apache.cassandra.db.NativeExpiringCell;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import sun.misc.Unsafe;
+
+public class NativeAllocator extends MemtableAllocator
+{
+    private static final Logger logger = LoggerFactory.getLogger(NativeAllocator.class);
+
+    private final static int REGION_SIZE = 1024 * 1024;
+    private final static int MAX_CLONED_SIZE = 128 * 1024; // allocations bigger than this don't go in the region
+
+    // globally stash any Regions we allocate but are beaten to using, and use these up before allocating any more
+    private static final ConcurrentLinkedQueue<Region> RACE_ALLOCATED = new ConcurrentLinkedQueue<>();
+
+    private final AtomicReference<Region> currentRegion = new AtomicReference<>();
+    private final AtomicInteger regionCount = new AtomicInteger(0);
+    private final ConcurrentLinkedQueue<Region> regions = new ConcurrentLinkedQueue<>();
+    private AtomicLong unslabbed = new AtomicLong(0);
+
+    protected NativeAllocator(NativePool pool)
+    {
+        super(pool.onHeap.newAllocator(), pool.offHeap.newAllocator());
+    }
+
+    @Override
+    public Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return new NativeCell(this, writeOp, cell);
+    }
+
+    @Override
+    public CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return new NativeCounterCell(this, writeOp, cell);
+    }
+
+    @Override
+    public DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return new NativeDeletedCell(this, writeOp, cell);
+    }
+
+    @Override
+    public ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    {
+        return new NativeExpiringCell(this, writeOp, cell);
+    }
+
+    public DecoratedKey clone(DecoratedKey key, OpOrder.Group writeOp)
+    {
+        return new NativeDecoratedKey(key.getToken(), this, writeOp, key.getKey());
+    }
+
+    @Override
+    public MemtableAllocator.DataReclaimer reclaimer()
+    {
+        return NO_OP;
+    }
+
+    public long allocate(int size, OpOrder.Group opGroup)
+    {
+        assert size >= 0;
+        offHeap().allocate(size, opGroup);
+        // satisfy large allocations directly from the JVM since they don't cause fragmentation
+        // as badly, and would fill up our regions quickly
+        if (size > MAX_CLONED_SIZE)
+        {
+            unslabbed.addAndGet(size);
+            Region region = new Region(unsafe.allocateMemory(size), size);
+            regions.add(region);
+
+            long peer;
+            if ((peer = region.allocate(size)) == -1)
+                throw new AssertionError();
+
+            return peer;
+        }
+
+        while (true)
+        {
+            Region region = getRegion();
+
+            long peer;
+            if ((peer = region.allocate(size)) > 0)
+                return peer;
+
+            // not enough space!
+            currentRegion.compareAndSet(region, null);
+        }
+    }
+
+    public void setDiscarded()
+    {
+        for (Region region : regions)
+            unsafe.freeMemory(region.peer);
+        super.setDiscarded();
+    }
+
+    /**
+     * Get the current region, or, if there is no current region, allocate a new one
+     */
+    private Region getRegion()
+    {
+        while (true)
+        {
+            // Try to get the region
+            Region region = currentRegion.get();
+            if (region != null)
+                return region;
+
+            // No current region, so we want to allocate one. We race
+            // against other allocators to CAS in a Region, and if we fail we stash the region for re-use
+            region = RACE_ALLOCATED.poll();
+            if (region == null)
+                region = new Region(unsafe.allocateMemory(REGION_SIZE), REGION_SIZE);
+            if (currentRegion.compareAndSet(null, region))
+            {
+                regions.add(region);
+                regionCount.incrementAndGet();
+                logger.trace("{} regions now allocated in {}", regionCount, this);
+                return region;
+            }
+
+            // someone else won race - that's fine, we'll try to grab theirs
+            // in the next iteration of the loop.
+            RACE_ALLOCATED.add(region);
+        }
+    }
+
+    /**
+     * A region of memory out of which allocations are sliced.
+     *
+     * This serves two purposes:
+     *  - to provide a step between initialization and allocation, so that racing to CAS a
+     *    new region in is harmless
+     *  - encapsulates the allocation offset
+     */
+    private static class Region
+    {
+        /**
+         * Actual underlying data
+         */
+        private final long peer;
+
+        private final long capacity;
+
+        /**
+         * Offset within the region at which the next allocation will be placed.
+         */
+        private AtomicInteger nextFreeOffset = new AtomicInteger(0);
+
+        /**
+         * Total number of allocations satisfied from this buffer
+         */
+        private AtomicInteger allocCount = new AtomicInteger();
+
+        /**
+         * Create a region over an already-allocated block of off-heap memory.
+         *
+         * @param peer address of the underlying allocation
+         * @param capacity size of the allocation in bytes
+         */
+        private Region(long peer, long capacity)
+        {
+            this.peer = peer;
+            this.capacity = capacity;
+        }
+
+        /**
+         * Try to allocate <code>size</code> bytes from the region.
+         *
+         * @return the address of the allocation, or -1 to indicate not-enough-space
+         */
+        long allocate(int size)
+        {
+            while (true)
+            {
+                int oldOffset = nextFreeOffset.get();
+
+                if (oldOffset + size > capacity) // not enough space left in this region
+                    return -1;
+
+                // Try to atomically claim this region
+                if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size))
+                {
+                    // we got the alloc
+                    allocCount.incrementAndGet();
+                    return peer + oldOffset;
+                }
+                // we raced and lost alloc, try again
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return "Region@" + System.identityHashCode(this) +
+                    " allocs=" + allocCount.get() + "waste=" +
+                    (capacity - nextFreeOffset.get());
+        }
+    }
+
+
+    static final Unsafe unsafe;
+
+    static
+    {
+        try
+        {
+            Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
+            field.setAccessible(true);
+            unsafe = (sun.misc.Unsafe) field.get(null);
+        }
+        catch (Exception e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+}
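
The allocator above combines two ideas that are easy to miss in the diff noise: allocations bigger than MAX_CLONED_SIZE get a dedicated region of their own, and everything else is carved out of shared 1MB regions by bumping an offset with a CAS, while regions that lose the installation race are parked in RACE_ALLOCATED for reuse. The following sketch is a minimal, heap-backed illustration of that bump-the-pointer pattern; ByteBuffer slices stand in for the raw Unsafe addresses, and the class and its names are invented for illustration, not part of the patch.

    import java.nio.ByteBuffer;
    import java.util.concurrent.ConcurrentLinkedQueue;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.concurrent.atomic.AtomicReference;

    // Illustrative only: a heap-backed version of the Region/CAS scheme above.
    public class BumpThePointerSketch
    {
        private static final int REGION_SIZE = 1024 * 1024;

        // regions we allocated but were beaten to installing; reused before allocating more
        private static final ConcurrentLinkedQueue<Region> RACE_ALLOCATED = new ConcurrentLinkedQueue<>();

        private final AtomicReference<Region> currentRegion = new AtomicReference<>();

        public ByteBuffer allocate(int size)
        {
            // (the real allocator gives requests larger than MAX_CLONED_SIZE a dedicated region instead)
            while (true)
            {
                Region region = getRegion();
                ByteBuffer allocated = region.allocate(size);
                if (allocated != null)
                    return allocated;
                // region exhausted: clear it so the next caller installs a fresh one
                currentRegion.compareAndSet(region, null);
            }
        }

        private Region getRegion()
        {
            while (true)
            {
                Region region = currentRegion.get();
                if (region != null)
                    return region;
                region = RACE_ALLOCATED.poll();
                if (region == null)
                    region = new Region(ByteBuffer.allocate(REGION_SIZE));
                if (currentRegion.compareAndSet(null, region))
                    return region;
                // lost the installation race; park the region for a later attempt
                RACE_ALLOCATED.add(region);
            }
        }

        private static final class Region
        {
            private final ByteBuffer data;
            private final AtomicInteger nextFreeOffset = new AtomicInteger(0);

            Region(ByteBuffer data)
            {
                this.data = data;
            }

            ByteBuffer allocate(int size)
            {
                while (true)
                {
                    int oldOffset = nextFreeOffset.get();
                    if (oldOffset + size > data.capacity())
                        return null; // not enough space left in this region
                    if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size))
                    {
                        ByteBuffer slice = data.duplicate();
                        slice.position(oldOffset);
                        slice.limit(oldOffset + size);
                        return slice.slice();
                    }
                    // raced with another allocation; retry against the new offset
                }
            }
        }
    }
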
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/src/java/org/apache/cassandra/utils/memory/NativePool.java
similarity index 66%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to src/java/org/apache/cassandra/utils/memory/NativePool.java
index e42574b..012867a 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/src/java/org/apache/cassandra/utils/memory/NativePool.java
@@ -1,6 +1,4 @@
-package org.apache.cassandra.io.util;
 /*
- * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +6,34 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
  */
+package org.apache.cassandra.utils.memory;
 
-
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class NativePool extends MemtablePool
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public NativePool(long maxOnHeapMemory, long maxOffHeapMemory, float cleanThreshold, Runnable cleaner)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(maxOnHeapMemory, maxOffHeapMemory, cleanThreshold, cleaner);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public boolean needToCopyOnHeap()
     {
-        buffer.put(b, off, len);
+        return true;
+    }
+
+    @Override
+    public NativeAllocator newAllocator()
+    {
+        return new NativeAllocator(this);
     }
 }
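
NativePool itself is intentionally thin: it only answers whether readers must copy values back on-heap and hands each memtable a NativeAllocator wired to the pool's shared sub-allocators (pool.onHeap.newAllocator() and pool.offHeap.newAllocator() in the NativeAllocator constructor above). A rough sketch of that pool-to-allocator factory shape follows; the names are entirely hypothetical and none of the real limit or cleaner machinery is shown.

    import java.util.concurrent.atomic.AtomicLong;

    // Hypothetical names throughout; only the shape is taken from the patch:
    // a pool tracks shared totals and hands each memtable an allocator that
    // charges its allocations back to the pool.
    class SketchPool
    {
        final AtomicLong onHeapOwned = new AtomicLong();
        final AtomicLong offHeapOwned = new AtomicLong();

        SketchAllocator newAllocator()
        {
            return new SketchAllocator(this);
        }
    }

    class SketchAllocator
    {
        private final SketchPool pool;

        SketchAllocator(SketchPool pool)
        {
            this.pool = pool;
        }

        void allocated(long onHeapBytes, long offHeapBytes)
        {
            // every allocation is charged to the shared pool, which is what lets
            // the pool decide when memtables must be flushed to reclaim memory
            pool.onHeapOwned.addAndGet(onHeapBytes);
            pool.offHeapOwned.addAndGet(offHeapBytes);
        }

        void discarded(long onHeapBytes, long offHeapBytes)
        {
            pool.onHeapOwned.addAndGet(-onHeapBytes);
            pool.offHeapOwned.addAndGet(-offHeapBytes);
        }
    }
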
diff --git a/src/java/org/apache/cassandra/utils/SlabAllocator.java b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
similarity index 74%
rename from src/java/org/apache/cassandra/utils/SlabAllocator.java
rename to src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
index 20939fe..19334ee 100644
--- a/src/java/org/apache/cassandra/utils/SlabAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.memory;
 
 import java.nio.ByteBuffer;
 import java.util.concurrent.ConcurrentLinkedQueue;
@@ -26,6 +26,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import sun.nio.ch.DirectBuffer;
+
 /**
  * The SlabAllocator is a bump-the-pointer allocator that allocates
  * large (2MB by default) regions and then doles them out to threads that request
@@ -40,7 +44,7 @@
  * interleaved throughout the heap, and the old generation gets progressively
  * more fragmented until a stop-the-world compacting collection occurs.
  */
-public class SlabAllocator extends Allocator
+public class SlabAllocator extends MemtableBufferAllocator
 {
     private static final Logger logger = LoggerFactory.getLogger(SlabAllocator.class);
 
@@ -50,22 +54,42 @@
     // globally stash any Regions we allocate but are beaten to using, and use these up before allocating any more
     private static final ConcurrentLinkedQueue<Region> RACE_ALLOCATED = new ConcurrentLinkedQueue<>();
 
-    private final AtomicReference<Region> currentRegion = new AtomicReference<Region>();
+    private final AtomicReference<Region> currentRegion = new AtomicReference<>();
     private final AtomicInteger regionCount = new AtomicInteger(0);
-    private AtomicLong unslabbed = new AtomicLong(0);
+
+    // this queue is used to keep references to off-heap allocated regions so that we can free them when we are discarded
+    private final ConcurrentLinkedQueue<Region> offHeapRegions = new ConcurrentLinkedQueue<>();
+    private AtomicLong unslabbedSize = new AtomicLong(0);
+    private final boolean allocateOnHeapOnly;
+
+    SlabAllocator(SubAllocator onHeap, SubAllocator offHeap, boolean allocateOnHeapOnly)
+    {
+        super(onHeap, offHeap);
+        this.allocateOnHeapOnly = allocateOnHeapOnly;
+    }
 
     public ByteBuffer allocate(int size)
     {
+        return allocate(size, null);
+    }
+
+    public ByteBuffer allocate(int size, OpOrder.Group opGroup)
+    {
         assert size >= 0;
         if (size == 0)
             return ByteBufferUtil.EMPTY_BYTE_BUFFER;
 
+        (allocateOnHeapOnly ? onHeap() : offHeap()).allocate(size, opGroup);
         // satisfy large allocations directly from JVM since they don't cause fragmentation
         // as badly, and fill up our regions quickly
         if (size > MAX_CLONED_SIZE)
         {
-            unslabbed.addAndGet(size);
-            return ByteBuffer.allocate(size);
+            unslabbedSize.addAndGet(size);
+            if (allocateOnHeapOnly)
+                return ByteBuffer.allocate(size);
+            Region region = new Region(ByteBuffer.allocateDirect(size));
+            offHeapRegions.add(region);
+            return region.allocate(size);
         }
 
         while (true)
@@ -82,6 +106,18 @@
         }
     }
 
+    public DataReclaimer reclaimer()
+    {
+        return NO_OP;
+    }
+
+    public void setDiscarded()
+    {
+        for (Region region : offHeapRegions)
+            ((DirectBuffer) region.data).cleaner().clean();
+        super.setDiscarded();
+    }
+
     /**
      * Get the current region, or, if there is no current region, allocate a new one
      */
@@ -98,9 +134,11 @@
             // against other allocators to CAS in a Region, and if we fail we stash the region for re-use
             region = RACE_ALLOCATED.poll();
             if (region == null)
-                region = new Region(REGION_SIZE);
+                region = new Region(allocateOnHeapOnly ? ByteBuffer.allocate(REGION_SIZE) : ByteBuffer.allocateDirect(REGION_SIZE));
             if (currentRegion.compareAndSet(null, region))
             {
+                if (!allocateOnHeapOnly)
+                    offHeapRegions.add(region);
                 regionCount.incrementAndGet();
                 logger.trace("{} regions now allocated in {}", regionCount, this);
                 return region;
@@ -112,12 +150,9 @@
         }
     }
 
-    /**
-     * @return a lower bound on how much space has been allocated
-     */
-    public long getMinimumSize()
+    protected AbstractAllocator allocator(OpOrder.Group writeOp)
     {
-        return unslabbed.get() + (regionCount.get() - 1) * (long)REGION_SIZE;
+        return new ContextAllocator(writeOp, this);
     }
 
     /**
@@ -133,28 +168,28 @@
         /**
          * Actual underlying data
          */
-        private final ByteBuffer data;
+        private ByteBuffer data;
 
         /**
          * Offset for the next allocation, or the sentinel value -1
          * which implies that the region is still uninitialized.
          */
-        private final AtomicInteger nextFreeOffset = new AtomicInteger(0);
+        private AtomicInteger nextFreeOffset = new AtomicInteger(0);
 
         /**
          * Total number of allocations satisfied from this buffer
          */
-        private final AtomicInteger allocCount = new AtomicInteger();
+        private AtomicInteger allocCount = new AtomicInteger();
 
         /**
          * Create an uninitialized region. Note that memory is not allocated yet, so
          * this is cheap.
          *
-         * @param size in bytes
+         * @param buffer the buffer backing this region
          */
-        private Region(int size)
+        private Region(ByteBuffer buffer)
         {
-            data = ByteBuffer.allocate(size);
+            data = buffer;
         }
 
         /**
diff --git a/src/java/org/apache/cassandra/utils/memory/SlabPool.java b/src/java/org/apache/cassandra/utils/memory/SlabPool.java
new file mode 100644
index 0000000..c5c44e1
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/SlabPool.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+public class SlabPool extends MemtablePool
+{
+    final boolean allocateOnHeap;
+
+    public SlabPool(long maxOnHeapMemory, long maxOffHeapMemory, float cleanupThreshold, Runnable cleaner)
+    {
+        super(maxOnHeapMemory, maxOffHeapMemory, cleanupThreshold, cleaner);
+        this.allocateOnHeap = maxOffHeapMemory == 0;
+    }
+
+    public MemtableAllocator newAllocator()
+    {
+        return new SlabAllocator(onHeap.newAllocator(), offHeap.newAllocator(), allocateOnHeap);
+    }
+
+    public boolean needToCopyOnHeap()
+    {
+        return !allocateOnHeap;
+    }
+}
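
Taken together, SlabPool and the reworked SlabAllocator make the slab path dual-mode: a pool created with maxOffHeapMemory == 0 stays heap-only, otherwise each new region is allocated with ByteBuffer.allocateDirect and remembered in offHeapRegions so it can be released when the memtable is discarded. Below is a small, self-contained sketch of just that selection logic, using only the standard java.nio API; the class name is illustrative and the real allocator additionally accounts the bytes against the pool.

    import java.nio.ByteBuffer;
    import java.util.concurrent.ConcurrentLinkedQueue;

    class RegionSourceSketch
    {
        private final boolean allocateOnHeapOnly;
        private final ConcurrentLinkedQueue<ByteBuffer> offHeapRegions = new ConcurrentLinkedQueue<>();

        RegionSourceSketch(long maxOffHeapMemory)
        {
            // mirrors SlabPool: no off-heap budget means everything stays on the heap
            this.allocateOnHeapOnly = maxOffHeapMemory == 0;
        }

        ByteBuffer newRegion(int size)
        {
            if (allocateOnHeapOnly)
                return ByteBuffer.allocate(size);
            ByteBuffer region = ByteBuffer.allocateDirect(size);
            offHeapRegions.add(region); // remembered so it can be released on discard
            return region;
        }
    }
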
diff --git a/src/java/org/apache/cassandra/utils/obs/IBitSet.java b/src/java/org/apache/cassandra/utils/obs/IBitSet.java
index c6fbddd..96aac6b 100644
--- a/src/java/org/apache/cassandra/utils/obs/IBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/IBitSet.java
@@ -49,4 +49,6 @@
     public long serializedSize(TypeSizes type);
 
     public void clear();
+
+    public void close();
 }
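
The new close() on IBitSet gives callers a uniform way to release a bitset regardless of where its bits live: OffHeapBitSet frees its native Memory, while OpenBitSet's close() is a deliberate no-op that leaves cleanup to the GC, as the next two file diffs show. A toy illustration of that contract, with invented types:

    // Invented types purely to illustrate the close() contract added above.
    interface BitSetSketch
    {
        void close();
    }

    class OffHeapBitSetSketch implements BitSetSketch
    {
        private java.nio.ByteBuffer bits = java.nio.ByteBuffer.allocateDirect(1024);

        public void close()
        {
            // off-heap storage is released explicitly; the real OffHeapBitSet
            // calls free() on its Memory handle rather than dropping a reference
            bits = null;
        }
    }

    class OnHeapBitSetSketch implements BitSetSketch
    {
        private final long[] words = new long[16];

        public void close()
        {
            // deliberate no-op: the GC reclaims the heap array, as in OpenBitSet
        }
    }
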
diff --git a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
index 29dd848..3007292 100644
--- a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
@@ -21,7 +21,6 @@
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.cassandra.cache.RefCountedMemory;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.util.Memory;
 
@@ -42,7 +41,7 @@
         try
         {
             long byteCount = wordCount * 8L;
-            bytes = RefCountedMemory.allocate(byteCount);
+            bytes = Memory.allocate(byteCount);
         }
         catch (OutOfMemoryError e)
         {
@@ -123,7 +122,7 @@
     public static OffHeapBitSet deserialize(DataInput in) throws IOException
     {
         long byteCount = in.readInt() * 8L;
-        Memory memory = RefCountedMemory.allocate(byteCount);
+        Memory memory = Memory.allocate(byteCount);
         for (long i = 0; i < byteCount;)
         {
             long v = in.readLong();
@@ -139,7 +138,7 @@
         return new OffHeapBitSet(memory);
     }
 
-    public void close() throws IOException
+    public void close()
     {
         bytes.free();
     }
diff --git a/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java b/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java
index b5310fa..1d2f690 100644
--- a/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java
@@ -307,7 +307,7 @@
     long[][] thisArr = this.bits;
     long[][] otherArr = other.bits;
     int thisPageSize = PAGE_SIZE;
-    int otherPageSize = other.PAGE_SIZE;
+    int otherPageSize = OpenBitSet.PAGE_SIZE;
     // testing against zero can be more efficient
     int pos=newLen;
     while(--pos>=0) {
@@ -357,8 +357,8 @@
       a=this;
     }
 
-    int aPageSize = this.PAGE_SIZE;
-    int bPageSize = b.PAGE_SIZE;
+    int aPageSize = OpenBitSet.PAGE_SIZE;
+    int bPageSize = OpenBitSet.PAGE_SIZE;
 
     // check for any set bits out of the range of b
     for (int i=a.wlen-1; i>=b.wlen; i--) {
@@ -387,7 +387,7 @@
     return (int)((h>>32) ^ h) + 0x98761234;
   }
 
-  public void close() throws IOException {
+  public void close() {
     // noop, let GC do the cleanup.
   }
 
diff --git a/src/resources/org/apache/cassandra/cli/CliHelp.yaml b/src/resources/org/apache/cassandra/cli/CliHelp.yaml
index 3d4bd99..cc838f8 100644
--- a/src/resources/org/apache/cassandra/cli/CliHelp.yaml
+++ b/src/resources/org/apache/cassandra/cli/CliHelp.yaml
@@ -250,7 +250,7 @@
           Options have the form {key:value}, see the information on each
           strategy and the examples.
 
-        - durable_writes: When set to false all RowMutations on keyspace will by-pass CommitLog.
+        - durable_writes: When set to false all Mutations on keyspace will by-pass CommitLog.
           Set to true by default.
 
         Examples:
@@ -318,7 +318,7 @@
           Options have the form {key:value}, see the information on each
           strategy and the examples.
 
-        - durable_writes: When set to false all RowMutations on keyspace will by-pass CommitLog.
+        - durable_writes: When set to false all Mutations on keyspace will by-pass CommitLog.
           Set to true by default.
 
         Examples:
@@ -541,12 +541,6 @@
         boundaries are the number of tables Cassandra attempts to merge together at
         once.
 
-        - replicate_on_write: Replicate every counter update from the leader to the
-        follower replicas. Accepts the values true and false.
-
-        - populate_io_cache_on_flush: Populates the page cache on memtable flush
-        and compaction. Accepts the values true and false.
-
         - compression_options: Options related to compression.
           Options have the form {key:value}.
           The main recognized options are:
@@ -794,7 +788,11 @@
             - KEYS_ONLY
             - ROWS_ONLY
             - NONE;
+        - cells_per_row_to_cache: The number of cells per row to cache.
 
+          Defaults to 100. Set to "ALL" if you want the old cache behaviour.
+
+          Will not be used if row caching is not enabled.
         - speculative_retry: Speculative retry is used to speculate a read failure.
 
           Speculative retry will execute an additional read on different nodes when
@@ -834,12 +832,6 @@
         boundaries are the number of tables Cassandra attempts to merge together at
         once.
 
-        - replicate_on_write: Replicate every counter update from the leader to the
-        follower replicas. Accepts the values true and false.
-
-        - populate_io_cache_on_flush: Populates the page cache on memtable flush
-        and compaction. Accepts the values true and false.
-
         - compression_options: Options related to compression.
           Options have the form {key:value}.
           The main recognized options are:
diff --git a/src/resources/org/apache/cassandra/tools/NodeToolHelp.yaml b/src/resources/org/apache/cassandra/tools/NodeToolHelp.yaml
deleted file mode 100644
index fa2b39c..0000000
--- a/src/resources/org/apache/cassandra/tools/NodeToolHelp.yaml
+++ /dev/null
@@ -1,228 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Help file for nodetool commands in Yaml.
-commands:
-  - name: ring
-    help: |
-      Print information about the token ring
-  - name: join
-    help: |
-      Join the ring
-  - name: info [-T/--tokens]
-    help: |
-      Print node information (uptime, load, ...)
-  - name: status
-    help: |
-      Print cluster information (state, load, IDs, ...)
-  - name: cfstats [keyspace].[cfname] ...
-    help: |
-      Print statistics on column families. Use the -i flag to ignore the list of column families and display the remaining cfs.
-  - name: version
-    help: |
-      Print cassandra version
-  - name: tpstats
-    help: |
-      Print usage statistics of thread pools
-  - name: proxyhistograms
-    help: |
-      Print statistic histograms for network operations
-  - name: drain
-    help: |
-      Drain the node (stop accepting writes and flush all column families)
-  - name: decommission
-    help: |
-      Decommission the *node I am connecting to*
-  - name: compactionstats
-    help: |
-      Print statistics on compactions
-  - name: compactionhistory
-    help: |
-      Print history of compaction
-  - name: disablebinary
-    help: |
-      Disable native transport (binary protocol)
-  - name: enablebinary
-    help: |
-      Reenable native transport (binary protocol)
-  - name: statusbinary
-    help: |
-      Status of native transport (binary protocol)
-  - name: disablehandoff
-    help: |
-      Disable the future hints storing on the current node
-  - name: enablehandoff
-    help: |
-      Reenable the future hints storing on the current node
-  - name: truncatehints <host-name>
-    help: |
-        Truncate all hints on the local node, or truncate hints for the endpoint specified.
-  - name: resumehandoff
-    help: |
-      Resume hints delivery process
-  - name: pausehandoff
-    help: |
-      Pause hints delivery process
-  - name: disablegossip
-    help: |
-      Disable gossip (effectively marking the node down)
-  - name: enablegossip
-    help: |
-      Reenable gossip
-  - name: disablethrift
-    help: |
-      Disable thrift server
-  - name: enablethrift
-    help: |
-      Reenable thrift server
-  - name: enablebackup
-    help: |
-      Enable incremental backup
-  - name: disablebackup
-    help: |
-      Disable incremental backup
-  - name: statusthrift
-    help: |
-      Status of thrift server
-  - name: gossipinfo
-    help: |
-      Shows the gossip information for the cluster
-  - name: invalidatekeycache
-    help: |
-      Invalidate the key cache
-  - name: invalidaterowcache
-    help: |
-      Invalidate the row cache
-  - name: resetlocalschema
-    help: |
-      Reset node's local schema and resync
-  - name: netstats [host]
-    help: |
-      Print network information on provided host (connecting node by default)
-  - name: move <new token>
-    help: |
-      Move node on the token ring to a new token. (for negative tokens, use \\ to escape, Example: move \\-123)
-  - name: removenode status|force|<ID>
-    help: |
-      Show status of current node removal, force completion of pending removal or remove provided ID
-  - name: setcompactionthroughput <value_in_mb>
-    help: |
-      Set the MB/s throughput cap for compaction in the system, or 0 to disable throttling.
-  - name: setstreamthroughput  <value_in_mb>
-    help: |
-      Set the Mb/s throughput cap for streaming in the system, or 0 to disable throttling.
-  - name: describecluster
-    help: |
-      Print the name, snitch, partitioner and schema version of a cluster.
-  - name: describering [keyspace]
-    help: |
-      Shows the token ranges info of a given keyspace.
-  - name: rangekeysample
-    help: |
-      Shows the sampled keys held across all keyspaces.
-  - name: rebuild [src-dc-name]
-    help: |
-      Rebuild data by streaming from other nodes (similarly to bootstrap)
-  - name: settraceprobability [value]
-    help: |
-      Sets the probability for tracing any given request to value. 0 disables, 1 enables for all requests, 0 is the default
-  - name: snapshot [keyspaces...] -cf [columnfamilyName] -t [snapshotName]
-    help: |
-      Take a snapshot of the optionally specified column family of the specified keyspaces  using optional name snapshotName
-  - name: clearsnapshot [keyspaces...] -t [snapshotName]
-    help: |
-      Remove snapshots for the specified keyspaces. Either remove all snapshots or remove the snapshots with the given name.
-  - name: flush [keyspace] [cfnames]
-    help: |
-      Flush one or more column families
-  - name: repair [keyspace] [cfnames]
-    help: |
-      Repair one or more column families
-         Use -dc to repair specific datacenters (csv list).
-         Use -et to specify a token at which repair range ends.
-         Use -local to only repair against nodes in the same datacenter.
-         Use -pr to repair only the first range returned by the partitioner.
-         Use -par to carry out a parallel repair.
-         Use -st to specify a token at which the repair range starts.
-  - name: cleanup [keyspace] [cfnames]
-    help: |
-      Run cleanup on one or more column families
-  - name: compact [keyspace] [cfnames]
-    help: |
-      Force a (major) compaction on one or more column families
-  - name: scrub [keyspace] [cfnames] [-s|--skip-corrupted]
-    help: |
-      Scrub (rebuild sstables for) one or more column families.
-         Use -s/--skip-corrupted to skip corrupted rows even when scrubbing
-         tables that use counters.
-  - name: upgradesstables [-a|--include-all-sstables] [keyspace] [cfnames]
-    help: |
-      Rewrite sstables (for the requested column families) that are not on the current version (thus upgrading them to said current version).
-         Use -a to include all sstables, even those already on the current version.
-  - name: setcompactionthreshold <keyspace> <cfname>
-    help: |
-      Set min and max compaction thresholds for a given column family
-  - name: getcompactionthreshold <keyspace> <cfname>
-    help: |
-      Print min and max compaction thresholds for a given column family
-  - name: disableautocompaction [keyspace] [cfnames]
-    help: |
-      Disable autocompaction for the given keyspace and column family
-  - name: enableautocompaction [keyspace] [cfnames]
-    help: |
-      Enable autocompaction
-  - name: getcompactionthroughput
-    help: |
-      Print the MB/s throughput cap for compaction in the system
-  - name: getstreamthroughput
-    help: |
-      Print the Mb/s throughput cap for streaming in the system
-  - name: stop <compaction_type>
-    help: |
-      Supported types are COMPACTION, VALIDATION, CLEANUP, SCRUB, INDEX_BUILD
-  - name: cfhistograms <keyspace> <cfname>
-    help: |
-      Print statistic histograms for a given column family
-  - name: refresh <keyspace> <cf-name>
-    help: |
-      Load newly placed SSTables to the system without restart.
-  - name: rebuild_index <keyspace> <cf-name> <idx1,idx1>
-    help: |
-      a full rebuild of native secondary indexes for a given column family. IndexNameExample: Standard3.IdxName,Standard3.IdxName1
-  - name: setcachecapacity <key-cache-capacity> <row-cache-capacity>
-    help: |
-      Set global key and row cache capacities (in MB units).
-  - name: setcachekeystosave <key-cache-keys-to-save> <row-cache-keys-to-save>
-    help: |
-      Set number of keys saved by each cache for faster post-restart warmup. 0 to disable.
-  - name: getendpoints <keyspace> <cf> <key>
-    help: |
-      Print the end points that owns the key
-  - name: getsstables <keyspace> <cf> <key>
-    help: |
-      Print the sstable filenames that own the key
-  - name: reloadtriggers
-    help: |
-      reload trigger classes
-  - name: setlogginglevel <class> <level>
-    help: |
-      Set the log level threshold for a given class. If both class and level are empty/null, it will reset to the initial configuration
-  - name: getlogginglevels
-    help: |
-      Get the runtime logging levels
-  - name: sethintedhandoffthrottlekb  <throttle-in-kb>
-    help: |
-      Set hinted handoff throttle in kb per second, per delivery thread.
diff --git a/test/cassandra.in.sh b/test/cassandra.in.sh
deleted file mode 100644
index 1033512..0000000
--- a/test/cassandra.in.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cassandra_home=`dirname $0`/..
-
-# The directory where Cassandra's configs live (required)
-CASSANDRA_CONF=$cassandra_home/test/conf
-
-# This can be the path to a jar file, or a directory containing the 
-# compiled classes. NOTE: This isn't needed by the startup script,
-# it's just used here in constructing the classpath.
-cassandra_bin=$cassandra_home/build/classes/main
-cassandra_bin=$cassandra_bin:$cassandra_home/build/classes/thrift
-#cassandra_bin=$cassandra_home/build/cassandra.jar
-
-# The java classpath (required)
-CLASSPATH=$CASSANDRA_CONF:$cassandra_bin
-
-for jar in $cassandra_home/lib/*.jar $cassandra_home/build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-
-# Arguments to pass to the JVM
-JVM_OPTS=" \
-        -ea \
-        -Xdebug \
-        -Xrunjdwp:transport=dt_socket,server=y,address=8898,suspend=n \
-        -Xms128M \
-        -Xmx1G \
-        -Xss256k \
-        -XX:SurvivorRatio=8 \
-        -XX:TargetSurvivorRatio=90 \
-        -XX:+AggressiveOpts \
-        -XX:+UseParNewGC \
-        -XX:+UseConcMarkSweepGC \
-        -XX:CMSInitiatingOccupancyFraction=1 \
-        -XX:+CMSParallelRemarkEnabled \
-        -XX:+HeapDumpOnOutOfMemoryError \
-        -Dcom.sun.management.jmxremote.port=8090 \
-        -Dcom.sun.management.jmxremote.ssl=false \
-        -Dcom.sun.management.jmxremote.authenticate=false \
-        -Dcassandra.ring_delay_ms=1000"
diff --git a/test/conf/log4j-server.properties b/test/conf/cassandra-rackdc.properties.mod
similarity index 68%
rename from test/conf/log4j-server.properties
rename to test/conf/cassandra-rackdc.properties.mod
index 796008f..457ad65 100644
--- a/test/conf/log4j-server.properties
+++ b/test/conf/cassandra-rackdc.properties.mod
@@ -13,13 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-log4j.rootLogger=DEBUG,R
-
-# rolling log file ("system.log
-log4j.appender.R=org.apache.log4j.DailyRollingFileAppender
-log4j.appender.R.DatePattern='.'yyyy-MM-dd-HH
-log4j.appender.R.layout=org.apache.log4j.PatternLayout
-log4j.appender.R.layout.ConversionPattern=%5p [%t] %d{ISO8601} %F (line %L) %m%n
-log4j.appender.R.File=build/test/logs/system.log
+dc=DC2
+rack=RAC2
diff --git a/test/conf/cassandra.yaml b/test/conf/cassandra.yaml
index 3bb29bb..ec988e2 100644
--- a/test/conf/cassandra.yaml
+++ b/test/conf/cassandra.yaml
@@ -3,14 +3,16 @@
 # Consider the effects on 'o.a.c.i.s.LegacySSTableTest' before changing schemas in this file.
 #
 cluster_name: Test Cluster
-in_memory_compaction_limit_in_mb: 1
+memtable_allocation_type: offheap_objects
 commitlog_sync: batch
 commitlog_sync_batch_window_in_ms: 1.0
-commitlog_segment_size_in_mb: 1
+commitlog_segment_size_in_mb: 5
 partitioner: org.apache.cassandra.dht.ByteOrderedPartitioner
 listen_address: 127.0.0.1
 storage_port: 7010
 rpc_port: 9170
+start_native_transport: true
+native_transport_port: 9042
 column_index_size_in_kb: 4
 commitlog_directory: build/test/cassandra/commitlog
 saved_caches_directory: build/test/cassandra/saved_caches
@@ -32,6 +34,5 @@
     truststore: conf/.truststore
     truststore_password: cassandra
 incremental_backups: true
+concurrent_compactors: 4
 compaction_throughput_mb_per_sec: 0
-start_native_transport: true
-native_transport_port: 9052
diff --git a/test/conf/log4j-junit.properties b/test/conf/log4j-junit.properties
deleted file mode 100644
index 3bc0c1a..0000000
--- a/test/conf/log4j-junit.properties
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# for production, you should probably set the root to INFO
-# and the pattern to %c instead of %l.  (%l is slower.)
-
-# output messages into a rolling log file as well as stdout
-log4j.rootLogger=DEBUG,stderr,R
-
-# stderr
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.target=System.err
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-log4j.appender.stderr.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %m%n
-log4j.appender.stderr.threshold=WARN
-
-# rolling log file
-log4j.appender.R=org.apache.log4j.RollingFileAppender
-log4j.appender.file.maxFileSize=20MB
-log4j.appender.file.maxBackupIndex=50
-log4j.appender.R.layout=org.apache.log4j.PatternLayout
-log4j.appender.R.layout.ConversionPattern=%5p [%t] %d{ISO8601} %F (line %L) %m%n
-# Edit the next line to point to your logs directory
-log4j.appender.R.File=build/test/logs/system.log
-
-log4j.logger.org.apache.hadoop=ERROR
-
diff --git a/test/conf/logback-test.xml b/test/conf/logback-test.xml
new file mode 100644
index 0000000..535e4fe
--- /dev/null
+++ b/test/conf/logback-test.xml
@@ -0,0 +1,60 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<configuration>
+  <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+    <file>./build/test/logs/system.log</file>
+    <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
+      <fileNamePattern>./build/test/logs/system.log.%i.zip</fileNamePattern>
+      <minIndex>1</minIndex>
+      <maxIndex>20</maxIndex>
+    </rollingPolicy>
+
+    <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
+      <maxFileSize>20MB</maxFileSize>
+    </triggeringPolicy>
+    <encoder>
+      <pattern>%-5level [%thread] %date{ISO8601} %msg%n</pattern>
+    </encoder>
+  </appender>
+  
+    <appender name="STDERR" target="System.err" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder>
+      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
+    </encoder>
+    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+      <level>WARN</level>
+    </filter>
+  </appender>
+  
+  <appender name="STDOUT" target="System.out" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder>
+      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
+    </encoder>
+    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+      <level>WARN</level>
+    </filter>
+  </appender>
+        
+  <root level="INFO">
+    <appender-ref ref="FILE" />
+    <appender-ref ref="STDERR" />
+    <appender-ref ref="STDOUT" />
+  </root>
+</configuration>
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Data.db b/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Data.db
deleted file mode 100644
index f134aee..0000000
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Data.db
+++ /dev/null
Binary files differ
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Digest.sha1 b/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Digest.sha1
deleted file mode 100644
index 91b9562..0000000
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Digest.sha1
+++ /dev/null
@@ -1 +0,0 @@
-c22f034592ec31b1998083a34c1593538e8f1ea1  Keyspace1-Standard1-ic-0-Data.db
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Index.db b/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Index.db
deleted file mode 100644
index 715913b..0000000
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Index.db
+++ /dev/null
Binary files differ
diff --git a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-CRC.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-CRC.db
new file mode 100644
index 0000000..0b6dab4
--- /dev/null
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-CRC.db
Binary files differ
diff --git a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Data.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Data.db
new file mode 100644
index 0000000..7d9407e
--- /dev/null
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Data.db
Binary files differ
diff --git a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Digest.sha1 b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Digest.sha1
new file mode 100644
index 0000000..963bd9b
--- /dev/null
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Digest.sha1
@@ -0,0 +1 @@
+4a9f1896a599e4b3ff5d19600901de1a0b851bc1  Keyspace1-Standard1-jb-0-Data.db
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Filter.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Filter.db
similarity index 100%
rename from test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Filter.db
rename to test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Filter.db
Binary files differ
diff --git a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Index.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Index.db
new file mode 100644
index 0000000..ee9f5fb
--- /dev/null
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Index.db
Binary files differ
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Statistics.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Statistics.db
similarity index 94%
rename from test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Statistics.db
rename to test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Statistics.db
index e9d5d4b..daec1c3 100644
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Statistics.db
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Statistics.db
Binary files differ
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Summary.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
similarity index 94%
rename from test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Summary.db
rename to test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
index c1bc2e2..83c68ce 100644
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-Summary.db
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
Binary files differ
diff --git a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-TOC.txt b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-TOC.txt
similarity index 90%
rename from test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-TOC.txt
rename to test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-TOC.txt
index 34b61c7..d3aa557 100644
--- a/test/data/legacy-sstables/ic/Keyspace1/Keyspace1-Standard1-ic-0-TOC.txt
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-TOC.txt
@@ -1,7 +1,8 @@
-Data.db
-TOC.txt
 Index.db
+TOC.txt
 Summary.db
 Filter.db
 Statistics.db
+Data.db
+CRC.db
 Digest.sha1
diff --git a/test/data/serialization/2.0/db.Row.bin b/test/data/serialization/2.0/db.Row.bin
deleted file mode 100644
index c699448..0000000
--- a/test/data/serialization/2.0/db.Row.bin
+++ /dev/null
Binary files differ
diff --git a/test/data/serialization/2.0/db.RowMutation.bin b/test/data/serialization/2.0/db.RowMutation.bin
deleted file mode 100644
index 73d93e8..0000000
--- a/test/data/serialization/2.0/db.RowMutation.bin
+++ /dev/null
Binary files differ
diff --git a/test/data/serialization/2.1/db.RangeSliceCommand.bin b/test/data/serialization/2.1/db.RangeSliceCommand.bin
new file mode 100644
index 0000000..f852df0
--- /dev/null
+++ b/test/data/serialization/2.1/db.RangeSliceCommand.bin
Binary files differ
diff --git a/test/data/serialization/2.1/db.SliceByNamesReadCommand.bin b/test/data/serialization/2.1/db.SliceByNamesReadCommand.bin
new file mode 100644
index 0000000..e9c33a2
--- /dev/null
+++ b/test/data/serialization/2.1/db.SliceByNamesReadCommand.bin
Binary files differ
diff --git a/test/data/serialization/2.1/db.SliceFromReadCommand.bin b/test/data/serialization/2.1/db.SliceFromReadCommand.bin
new file mode 100644
index 0000000..1beede3
--- /dev/null
+++ b/test/data/serialization/2.1/db.SliceFromReadCommand.bin
Binary files differ
diff --git a/test/data/serialization/2.1/db.Truncation.bin b/test/data/serialization/2.1/db.Truncation.bin
new file mode 100644
index 0000000..ea67995
--- /dev/null
+++ b/test/data/serialization/2.1/db.Truncation.bin
Binary files differ
diff --git a/test/data/serialization/2.1/db.WriteResponse.bin b/test/data/serialization/2.1/db.WriteResponse.bin
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/data/serialization/2.1/db.WriteResponse.bin
diff --git a/test/data/serialization/2.1/gms.EndpointState.bin b/test/data/serialization/2.1/gms.EndpointState.bin
new file mode 100644
index 0000000..f87fc77
--- /dev/null
+++ b/test/data/serialization/2.1/gms.EndpointState.bin
Binary files differ
diff --git a/test/data/serialization/2.1/gms.Gossip.bin b/test/data/serialization/2.1/gms.Gossip.bin
new file mode 100644
index 0000000..af5ac57
--- /dev/null
+++ b/test/data/serialization/2.1/gms.Gossip.bin
Binary files differ
diff --git a/test/data/serialization/2.1/service.SyncComplete.bin b/test/data/serialization/2.1/service.SyncComplete.bin
new file mode 100644
index 0000000..533abe2
--- /dev/null
+++ b/test/data/serialization/2.1/service.SyncComplete.bin
Binary files differ
diff --git a/test/data/serialization/2.1/service.SyncRequest.bin b/test/data/serialization/2.1/service.SyncRequest.bin
new file mode 100644
index 0000000..2bb8bf9
--- /dev/null
+++ b/test/data/serialization/2.1/service.SyncRequest.bin
Binary files differ
diff --git a/test/data/serialization/2.1/service.ValidationComplete.bin b/test/data/serialization/2.1/service.ValidationComplete.bin
new file mode 100644
index 0000000..6eff48f
--- /dev/null
+++ b/test/data/serialization/2.1/service.ValidationComplete.bin
Binary files differ
diff --git a/test/data/serialization/2.1/service.ValidationRequest.bin b/test/data/serialization/2.1/service.ValidationRequest.bin
new file mode 100644
index 0000000..e774d05
--- /dev/null
+++ b/test/data/serialization/2.1/service.ValidationRequest.bin
Binary files differ
diff --git a/test/data/serialization/2.1/utils.BloomFilter.bin b/test/data/serialization/2.1/utils.BloomFilter.bin
new file mode 100644
index 0000000..357042a
--- /dev/null
+++ b/test/data/serialization/2.1/utils.BloomFilter.bin
Binary files differ
diff --git a/test/data/serialization/2.1/utils.EstimatedHistogram.bin b/test/data/serialization/2.1/utils.EstimatedHistogram.bin
new file mode 100644
index 0000000..bedd39b
--- /dev/null
+++ b/test/data/serialization/2.1/utils.EstimatedHistogram.bin
Binary files differ
diff --git a/test/long/org/apache/cassandra/concurrent/LongOpOrderTest.java b/test/long/org/apache/cassandra/concurrent/LongOpOrderTest.java
new file mode 100644
index 0000000..d7105df
--- /dev/null
+++ b/test/long/org/apache/cassandra/concurrent/LongOpOrderTest.java
@@ -0,0 +1,240 @@
+package org.apache.cassandra.concurrent;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.cliffc.high_scale_lib.NonBlockingHashMap;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+import static org.junit.Assert.assertTrue;
+
+// TODO: we don't currently test SAFE functionality at all!
+// TODO: should also test markBlocking and SyncOrdered
+public class LongOpOrderTest
+{
+
+    private static final Logger logger = LoggerFactory.getLogger(LongOpOrderTest.class);
+
+    static final int CONSUMERS = 4;
+    static final int PRODUCERS = 32;
+
+    static final long RUNTIME = TimeUnit.MINUTES.toMillis(5);
+    static final long REPORT_INTERVAL = TimeUnit.MINUTES.toMillis(1);
+
+    static final Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler()
+    {
+        @Override
+        public void uncaughtException(Thread t, Throwable e)
+        {
+            System.err.println(t.getName() + ": " + e.getMessage());
+            e.printStackTrace();
+        }
+    };
+
+    final OpOrder order = new OpOrder();
+    final AtomicInteger errors = new AtomicInteger();
+
+    class TestOrdering implements Runnable
+    {
+
+        final int[] waitNanos = new int[1 << 16];
+        volatile State state = new State();
+        final ScheduledExecutorService sched;
+
+        TestOrdering(ExecutorService exec, ScheduledExecutorService sched)
+        {
+            this.sched = sched;
+            final ThreadLocalRandom rnd = ThreadLocalRandom.current();
+            for (int i = 0 ; i < waitNanos.length ; i++)
+                waitNanos[i] = rnd.nextInt(5000);
+            for (int i = 0 ; i < PRODUCERS / CONSUMERS ; i++)
+                exec.execute(new Producer());
+            exec.execute(this);
+        }
+
+        @Override
+        public void run()
+        {
+            final long until = System.currentTimeMillis() + RUNTIME;
+            long lastReport = System.currentTimeMillis();
+            long count = 0;
+            long opCount = 0;
+            while (true)
+            {
+                long now = System.currentTimeMillis();
+                if (now > until)
+                    break;
+                if (now > lastReport + REPORT_INTERVAL)
+                {
+                    lastReport = now;
+                    logger.info(String.format("%s: Executed %d barriers with %d operations. %.0f%% complete.",
+                            Thread.currentThread().getName(), count, opCount, 100 * (1 - ((until - now) / (double) RUNTIME))));
+                }
+                try
+                {
+                    Thread.sleep(0, waitNanos[((int) (count & (waitNanos.length - 1)))]);
+                } catch (InterruptedException e)
+                {
+                    e.printStackTrace();
+                }
+
+                final State s = state;
+                s.barrier = order.newBarrier();
+                s.replacement = new State();
+                s.barrier.issue();
+                s.barrier.await();
+                s.check();
+                opCount += s.totalCount();
+                state = s.replacement;
+                sched.schedule(new Runnable()
+                {
+                    @Override
+                    public void run()
+                    {
+                        s.check();
+                    }
+                }, 1, TimeUnit.SECONDS);
+                count++;
+            }
+        }
+
+        class State
+        {
+
+            volatile OpOrder.Barrier barrier;
+            volatile State replacement;
+            final NonBlockingHashMap<OpOrder.Group, AtomicInteger> count = new NonBlockingHashMap<>();
+            int checkCount = -1;
+
+            boolean accept(OpOrder.Group opGroup)
+            {
+                if (barrier != null && !barrier.isAfter(opGroup))
+                    return false;
+                AtomicInteger c;
+                if (null == (c = count.get(opGroup)))
+                {
+                    count.putIfAbsent(opGroup, new AtomicInteger());
+                    c = count.get(opGroup);
+                }
+                c.incrementAndGet();
+                return true;
+            }
+
+            int totalCount()
+            {
+                int c = 0;
+                for (AtomicInteger v : count.values())
+                    c += v.intValue();
+                return c;
+            }
+
+            void check()
+            {
+                boolean delete;
+                if (checkCount >= 0)
+                {
+                    if (checkCount != totalCount())
+                    {
+                        errors.incrementAndGet();
+                        logger.error("Received size changed after barrier finished: {} vs {}", checkCount, totalCount());
+                    }
+                    delete = true;
+                }
+                else
+                {
+                    checkCount = totalCount();
+                    delete = false;
+                }
+                for (Map.Entry<OpOrder.Group, AtomicInteger> e : count.entrySet())
+                {
+                    if (e.getKey().compareTo(barrier.getSyncPoint()) > 0)
+                    {
+                        errors.incrementAndGet();
+                        logger.error("Received an operation that was created after the barrier was issued.");
+                    }
+                    if (TestOrdering.this.count.get(e.getKey()).intValue() != e.getValue().intValue())
+                    {
+                        errors.incrementAndGet();
+                        logger.error("Missing registered operations. {} vs {}", TestOrdering.this.count.get(e.getKey()).intValue(), e.getValue().intValue());
+                    }
+                    if (delete)
+                        TestOrdering.this.count.remove(e.getKey());
+                }
+            }
+
+        }
+
+        final NonBlockingHashMap<OpOrder.Group, AtomicInteger> count = new NonBlockingHashMap<>();
+
+        class Producer implements Runnable
+        {
+            public void run()
+            {
+                while (true)
+                {
+                    AtomicInteger c;
+                    try (OpOrder.Group opGroup = order.start())
+                    {
+                        if (null == (c = count.get(opGroup)))
+                        {
+                            count.putIfAbsent(opGroup, new AtomicInteger());
+                            c = count.get(opGroup);
+                        }
+                        c.incrementAndGet();
+                        State s = state;
+                        while (!s.accept(opGroup))
+                            s = s.replacement;
+                    }
+                }
+            }
+        }
+
+    }
+
+    @Test
+    public void testOrdering() throws InterruptedException
+    {
+        errors.set(0);
+        Thread.setDefaultUncaughtExceptionHandler(handler);
+        final ExecutorService exec = Executors.newCachedThreadPool(new NamedThreadFactory("checker"));
+        final ScheduledExecutorService checker = Executors.newScheduledThreadPool(1, new NamedThreadFactory("checker"));
+        for (int i = 0 ; i < CONSUMERS ; i++)
+            new TestOrdering(exec, checker);
+        exec.shutdown();
+        exec.awaitTermination((long) (RUNTIME * 1.1), TimeUnit.MILLISECONDS);
+        assertTrue(exec.isShutdown());
+        assertTrue(errors.get() == 0);
+    }
+
+
+}
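
The stress test above is dense; the OpOrder interaction it hammers is small enough to show in isolation. The sketch below uses only the calls that appear in the test (order.start() in try-with-resources, newBarrier(), issue(), await()), assumes the Cassandra tree is on the classpath, and invents the surrounding thread scaffolding purely for illustration.

    import org.apache.cassandra.utils.concurrent.OpOrder;

    // Minimal illustration of the pattern exercised by LongOpOrderTest: writers
    // bracket their work in a Group, a maintenance thread issues a Barrier and
    // awaits it, after which no operation started before the barrier can still
    // be in flight.
    public class OpOrderSketch
    {
        public static void main(String[] args) throws InterruptedException
        {
            final OpOrder order = new OpOrder();

            Thread writer = new Thread(new Runnable()
            {
                public void run()
                {
                    for (int i = 0; i < 1000; i++)
                    {
                        // every write runs inside a group so barriers can order against it
                        try (OpOrder.Group opGroup = order.start())
                        {
                            // ... apply the write to the current state ...
                        }
                    }
                }
            });
            writer.start();

            // e.g. before swapping a memtable: wait for all in-flight writes
            OpOrder.Barrier barrier = order.newBarrier();
            barrier.issue();
            barrier.await();
            // at this point every group started before issue() has completed

            writer.join();
        }
    }
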
diff --git a/test/long/org/apache/cassandra/concurrent/LongSharedExecutorPoolTest.java b/test/long/org/apache/cassandra/concurrent/LongSharedExecutorPoolTest.java
new file mode 100644
index 0000000..0fd53bb
--- /dev/null
+++ b/test/long/org/apache/cassandra/concurrent/LongSharedExecutorPoolTest.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.concurrent;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.TreeSet;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.locks.LockSupport;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.apache.commons.math3.distribution.WeibullDistribution;
+import org.junit.Test;
+
+public class LongSharedExecutorPoolTest
+{
+
+    private static final class WaitTask implements Runnable
+    {
+        final long nanos;
+
+        private WaitTask(long nanos)
+        {
+            this.nanos = nanos;
+        }
+
+        public void run()
+        {
+            LockSupport.parkNanos(nanos);
+        }
+    }
+
+    private static final class Result implements Comparable<Result>
+    {
+        final Future<?> future;
+        final long forecastedCompletion;
+
+        private Result(Future<?> future, long forecastedCompletion)
+        {
+            this.future = future;
+            this.forecastedCompletion = forecastedCompletion;
+        }
+
+        public int compareTo(Result that)
+        {
+            int c = Long.compare(this.forecastedCompletion, that.forecastedCompletion);
+            if (c != 0)
+                return c;
+            c = Integer.compare(this.hashCode(), that.hashCode());
+            if (c != 0)
+                return c;
+            return Integer.compare(this.future.hashCode(), that.future.hashCode());
+        }
+    }
+
+    private static final class Batch implements Comparable<Batch>
+    {
+        final TreeSet<Result> results;
+        final long timeout;
+        final int executorIndex;
+
+        private Batch(TreeSet<Result> results, long timeout, int executorIndex)
+        {
+            this.results = results;
+            this.timeout = timeout;
+            this.executorIndex = executorIndex;
+        }
+
+        public int compareTo(Batch that)
+        {
+            int c = Long.compare(this.timeout, that.timeout);
+            if (c != 0)
+                return c;
+            c = Integer.compare(this.results.size(), that.results.size());
+            if (c != 0)
+                return c;
+            return Integer.compare(this.hashCode(), that.hashCode());
+        }
+    }
+
+    @Test
+    public void testPromptnessOfExecution() throws InterruptedException, ExecutionException, TimeoutException
+    {
+        testPromptnessOfExecution(TimeUnit.MINUTES.toNanos(2L), 0.5f);
+    }
+
+    private void testPromptnessOfExecution(long intervalNanos, float loadIncrement) throws InterruptedException, ExecutionException, TimeoutException
+    {
+        final int executorCount = 4;
+        int threadCount = 8;
+        int maxQueued = 1024;
+        final WeibullDistribution workTime = new WeibullDistribution(3, 200000);
+        final long minWorkTime = TimeUnit.MICROSECONDS.toNanos(1);
+        final long maxWorkTime = TimeUnit.MILLISECONDS.toNanos(1);
+
+        final int[] threadCounts = new int[executorCount];
+        final WeibullDistribution[] workCount = new WeibullDistribution[executorCount];
+        final ExecutorService[] executors = new ExecutorService[executorCount];
+        for (int i = 0 ; i < executors.length ; i++)
+        {
+            executors[i] = JMXEnabledSharedExecutorPool.SHARED.newExecutor(threadCount, maxQueued, "test" + i, "test" + i);
+            threadCounts[i] = threadCount;
+            workCount[i] = new WeibullDistribution(2, maxQueued);
+            threadCount *= 2;
+            maxQueued *= 2;
+        }
+
+        long runs = 0;
+        long events = 0;
+        final TreeSet<Batch> pending = new TreeSet<>();
+        final BitSet executorsWithWork = new BitSet(executorCount);
+        long until = 0;
+            // basic idea is to go through different levels of load on the executor service; initially it is all small batches
+        // (mostly within max queue size) of very short operations, moving to progressively larger batches
+        // (beyond max queued size), and longer operations
+        for (float multiplier = 0f ; multiplier < 2.01f ; )
+        {
+            if (System.nanoTime() > until)
+            {
+                System.out.println(String.format("Completed %.0fK batches with %.1fM events", runs * 0.001f, events * 0.000001f));
+                events = 0;
+                until = System.nanoTime() + intervalNanos;
+                multiplier += loadIncrement;
+                System.out.println(String.format("Running for %ds with load multiplier %.1f", TimeUnit.NANOSECONDS.toSeconds(intervalNanos), multiplier));
+            }
+
+            // wait a random amount of time so we submit new tasks at various stages of completion of the pending batches
+            long timeout;
+            if (pending.isEmpty()) timeout = 0;
+            else if (Math.random() > 0.98) timeout = Long.MAX_VALUE;
+            else if (pending.size() == executorCount) timeout = pending.first().timeout;
+            else timeout = (long) (Math.random() * pending.last().timeout);
+
+            while (!pending.isEmpty() && timeout > System.nanoTime())
+            {
+                Batch first = pending.first();
+                boolean complete = false;
+                try
+                {
+                    for (Result result : first.results.descendingSet())
+                        result.future.get(timeout - System.nanoTime(), TimeUnit.NANOSECONDS);
+                    complete = true;
+                }
+                catch (TimeoutException e)
+                {
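+                    // expected while the batch is still in flight; fall through to the hard-timeout check below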
+                }
+                if (!complete && System.nanoTime() > first.timeout)
+                {
+                    for (Result result : first.results)
+                        if (!result.future.isDone())
+                            throw new AssertionError();
+                    complete = true;
+                }
+                if (complete)
+                {
+                    pending.pollFirst();
+                    executorsWithWork.clear(first.executorIndex);
+                }
+            }
+
+            // if we've emptied the executors, give all our threads an opportunity to spin down
+            if (timeout == Long.MAX_VALUE)
+                Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS);
+
+            // submit a random batch to the first free executor service
+            int executorIndex = executorsWithWork.nextClearBit(0);
+            if (executorIndex >= executorCount)
+                continue;
+            executorsWithWork.set(executorIndex);
+            ExecutorService executor = executors[executorIndex];
+            TreeSet<Result> results = new TreeSet<>();
+            int count = (int) (workCount[executorIndex].sample() * multiplier);
+            long targetTotalElapsed = 0;
+            long start = System.nanoTime();
+            long baseTime;
+            if (Math.random() > 0.5) baseTime = 2 * (long) (workTime.sample() * multiplier);
+            else baseTime = 0;
+            for (int j = 0 ; j < count ; j++)
+            {
+                long time;
+                if (baseTime == 0) time = (long) (workTime.sample() * multiplier);
+                else time = (long) (baseTime * Math.random());
+                if (time < minWorkTime)
+                    time = minWorkTime;
+                if (time > maxWorkTime)
+                    time = maxWorkTime;
+                targetTotalElapsed += time;
+                Future<?> future = executor.submit(new WaitTask(time));
+                results.add(new Result(future, System.nanoTime() + time));
+            }
+            long end = start + (long) Math.ceil(targetTotalElapsed / (double) threadCounts[executorIndex])
+                       + TimeUnit.MILLISECONDS.toNanos(100L);
+            long now = System.nanoTime();
+            if (runs++ > executorCount && now > end)
+                throw new AssertionError();
+            events += results.size();
+            pending.add(new Batch(results, end, executorIndex));
+//            System.out.println(String.format("Submitted batch to executor %d with %d items and %d permitted millis", executorIndex, count, TimeUnit.NANOSECONDS.toMillis(end - start)));
+        }
+    }
+
+    public static void main(String[] args) throws InterruptedException, ExecutionException, TimeoutException
+    {
+        // do longer test
+        new LongSharedExecutorPoolTest().testPromptnessOfExecution(TimeUnit.MINUTES.toNanos(10L), 0.1f);
+    }
+
+}
diff --git a/test/long/org/apache/cassandra/cql3/CorruptionTest.java b/test/long/org/apache/cassandra/cql3/CorruptionTest.java
new file mode 100644
index 0000000..1a42112
--- /dev/null
+++ b/test/long/org/apache/cassandra/cql3/CorruptionTest.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.*;
+import com.datastax.driver.core.policies.LoggingRetryPolicy;
+import com.datastax.driver.core.policies.Policies;
+import com.datastax.driver.core.utils.Bytes;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+
+public class CorruptionTest extends SchemaLoader
+{
+
+    private static EmbeddedCassandraService cassandra;
+    private static Cluster cluster;
+    private static Session session;
+
+    private static PreparedStatement getStatement;
+    private static PreparedStatement putStatement;
+    private static String KEYSPACE = "cass_test";
+    private static final String TABLE = "put_test";
+    private static final String KEY = "SingleFailingKey";
+    private static String VALUE;
+    private final int THREADPOOL_SIZE = 40;
+
+    @BeforeClass()
+    public static void setup() throws ConfigurationException, IOException
+    {
+        Schema.instance.clear();
+
+        cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        cluster = Cluster.builder().addContactPoint("127.0.0.1")
+                         .withRetryPolicy(new LoggingRetryPolicy(Policies.defaultRetryPolicy()))
+                         .withPort(DatabaseDescriptor.getNativeTransportPort()).build();
+        session = cluster.connect();
+
+        session.execute("CREATE KEYSPACE IF NOT EXISTS " + KEYSPACE +" WITH replication " +
+                        "= {'class':'SimpleStrategy', 'replication_factor':1};");
+        session.execute("USE " + KEYSPACE);
+        session.execute("CREATE TABLE IF NOT EXISTS " + TABLE + " (" +
+                         "key blob," +
+                         "value blob," +
+                         "PRIMARY KEY (key));");
+
+
+        // Prepared statements
+        getStatement = session.prepare("SELECT value FROM " + TABLE + " WHERE key = ?;");
+        getStatement.setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM);
+
+        putStatement = session.prepare("INSERT INTO " + TABLE + " (key, value) VALUES (?, ?);");
+        putStatement.setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM);
+
+
+
+        StringBuilder s = new StringBuilder();
+        char a='a';
+        char z='z';
+        for (int i = 0; i < 500*1024; i++)
+        {
+            char x = (char)((i%((z-a)+1))+a);
+            if (x == 'a')
+            {
+                x = '\n';
+            }
+            s.append(x);
+        }
+        VALUE = s.toString();
+    }
+
+    @Test
+    public void runCorruptionTest()
+    {
+
+        final CountDownLatch failure = new CountDownLatch(1);
+
+
+        ExecutorService executor = Executors.newFixedThreadPool(THREADPOOL_SIZE);
+        for (int i = 0; i < THREADPOOL_SIZE; i++)
+        {
+            executor.execute(new Runnable()
+            {
+                @Override
+                public void run()
+                {
+                    for (int i = 0; i < 100000; i++)
+                    {
+                        put(KEY.getBytes(), VALUE.getBytes());
+                        byte[] res = get(KEY.getBytes());
+                        // since we're flooding the server we might get some timeouts;
+                        // that's not relevant for this test
+                        if (res == null)
+                            continue;
+
+                        if (!Arrays.equals(VALUE.getBytes(), res))
+                        {
+                            /*try
+                            {
+                                dumpKeys(VALUE.getBytes(), res);
+                            }
+                            catch (IOException e)
+                            {
+                                e.printStackTrace();
+                            }*/
+                            failure.countDown();
+                        }
+                    }
+                }
+
+                private void dumpKeys(byte[] putdata, byte[] getdata) throws IOException {
+                    String basename = "bad-data-tid" + Thread.currentThread().getId();
+                    File put = new File(basename+"-put");
+                    File get = new File(basename+"-get");
+                    try(FileWriter pw = new FileWriter(put)) {
+                        pw.write(new String(putdata));
+                    }
+                    try(FileWriter pw = new FileWriter(get)) {
+                        pw.write(new String(getdata));
+                    }
+                }
+            });
+        }
+
+        try
+        {
+            assert !failure.await(2, TimeUnit.MINUTES);
+        }
+        catch (InterruptedException e)
+        {
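+            // ignored: the test only asserts that the failure latch did not trip within the timeout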
+
+        }
+        executor.shutdownNow();
+
+    }
+
+    public static byte[] get(byte[] key)
+    {
+        BoundStatement boundStatement = new BoundStatement(getStatement);
+        boundStatement.setBytes(0, ByteBuffer.wrap(key));
+
+        final com.datastax.driver.core.ResultSet resultSet =  session.execute(boundStatement);
+        final Row row = resultSet.one();
+        if (row != null)
+        {
+            final ByteBuffer byteBuf = row.getBytes("value");
+            return Bytes.getArray(byteBuf);
+        }
+
+        return null;
+    }
+
+    public static void put(byte[] key, byte[] value)
+    {
+        BoundStatement boundStatement = new BoundStatement(putStatement);
+        boundStatement.setBytes(0, ByteBuffer.wrap(key));
+        boundStatement.setBytes(1, ByteBuffer.wrap(value));
+
+        session.execute(boundStatement);
+    }
+}
diff --git a/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java b/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java
new file mode 100644
index 0000000..a0bacea
--- /dev/null
+++ b/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.After;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+
+import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
+
+/**
+ * Long-running test that repeatedly creates, populates and drops keyspaces to verify
+ * that commit log segments are recycled correctly across keyspace drops.
+ */
+public class DropKeyspaceCommitLogRecycleTest
+{
+    protected static final Logger logger = LoggerFactory.getLogger(DropKeyspaceCommitLogRecycleTest.class);
+
+    private static final String KEYSPACE = "cql_test_keyspace";
+    private static final String KEYSPACE2 = "cql_test_keyspace2";
+
+    static
+    {
+        // Once per-JVM is enough
+        SchemaLoader.prepareServer();
+    }
+
+    private void create(boolean both)
+    {
+        executeOnceInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE));
+        executeOnceInternal(String.format("CREATE TABLE %s.test (k1 int, k2 int, v int, PRIMARY KEY (k1, k2))", KEYSPACE));
+        
+        if (both)
+        {
+            executeOnceInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE2));
+            executeOnceInternal(String.format("CREATE TABLE %s.test (k1 int, k2 int, v int, PRIMARY KEY (k1, k2))", KEYSPACE2));
+        }
+    }
+
+    private void insert()
+    {
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (0, 0, 0)", KEYSPACE));
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (1, 1, 1)", KEYSPACE));
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (2, 2, 2)", KEYSPACE));
+
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (0, 0, 0)", KEYSPACE2));
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (1, 1, 1)", KEYSPACE2));
+        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (2, 2, 2)", KEYSPACE2));       
+    }
+
+    private void drop(boolean both)
+    {
+        executeOnceInternal(String.format("DROP KEYSPACE IF EXISTS %s", KEYSPACE));
+        if (both)
+            executeOnceInternal(String.format("DROP KEYSPACE IF EXISTS %s", KEYSPACE2));
+    }
+
+    @Test
+    public void testRecycle()
+    {
+        for (int i = 0 ; i < 1000 ; i++)
+        {
+            create(i == 0);
+            insert();
+            drop(false);
+        }
+    }
+
+    @After
+    public void afterTest() throws Throwable
+    {
+        drop(true);
+    }
+}
diff --git a/test/long/org/apache/cassandra/db/MeteredFlusherTest.java b/test/long/org/apache/cassandra/db/LongFlushMemtableTest.java
similarity index 78%
rename from test/long/org/apache/cassandra/db/MeteredFlusherTest.java
rename to test/long/org/apache/cassandra/db/LongFlushMemtableTest.java
index 5e25744..4bb8fdd 100644
--- a/test/long/org/apache/cassandra/db/MeteredFlusherTest.java
+++ b/test/long/org/apache/cassandra/db/LongFlushMemtableTest.java
@@ -27,34 +27,34 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public class MeteredFlusherTest extends SchemaLoader
+public class LongFlushMemtableTest extends SchemaLoader
 {
     @Test
-    public void testManyMemtables() throws IOException, ConfigurationException
+    public void testFlushMemtables() throws IOException, ConfigurationException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         for (int i = 0; i < 100; i++)
         {
-            CFMetaData metadata = new CFMetaData(keyspace.getName(), "_CF" + i, ColumnFamilyType.Standard, UTF8Type.instance, null);
+            CFMetaData metadata = CFMetaData.denseCFMetaData(keyspace.getName(), "_CF" + i, UTF8Type.instance);
             MigrationManager.announceNewColumnFamily(metadata);
         }
 
-        ByteBuffer name = ByteBufferUtil.bytes("c");
         for (int j = 0; j < 200; j++)
         {
             for (int i = 0; i < 100; i++)
             {
-                RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key" + j));
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "_CF" + i);
+                Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key" + j));
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "_CF" + i);
                 // don't cheat by allocating this outside of the loop; that defeats the purpose of deliberately using lots of memory
                 ByteBuffer value = ByteBuffer.allocate(100000);
-                cf.addColumn(new Column(name, value));
+                cf.addColumn(new BufferCell(Util.cellname("c"), value));
                 rm.add(cf);
                 rm.applyUnsafe();
             }
diff --git a/test/long/org/apache/cassandra/db/LongKeyspaceTest.java b/test/long/org/apache/cassandra/db/LongKeyspaceTest.java
index 1a472d6..7a5b837 100644
--- a/test/long/org/apache/cassandra/db/LongKeyspaceTest.java
+++ b/test/long/org/apache/cassandra/db/LongKeyspaceTest.java
@@ -21,13 +21,10 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.utils.WrappedRunnable;
 import static org.apache.cassandra.Util.column;
 
 import org.apache.cassandra.Util;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 
 public class LongKeyspaceTest extends SchemaLoader
@@ -40,8 +37,8 @@
 
         for (int i = 1; i < 5000; i += 100)
         {
-            RowMutation rm = new RowMutation("Keyspace1", Util.dk("key" + i).key);
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+            Mutation rm = new Mutation("Keyspace1", Util.dk("key" + i).getKey());
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
             for (int j = 0; j < i; j++)
                 cf.addColumn(column("c" + j, "v" + j, 1L));
             rm.add(cf);
@@ -57,10 +54,7 @@
                 {
                     for (int j = 0; j < i; j++)
                     {
-                        cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(Util.dk("key" + i),
-                                                                                "Standard1",
-                                                                                FBUtilities.singleton(ByteBufferUtil.bytes("c" + j), cfStore.getComparator()),
-                                                                                System.currentTimeMillis()));
+                        cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, Util.dk("key" + i), "c" + j));
                         KeyspaceTest.assertColumns(cf, "c" + j);
                     }
                 }
diff --git a/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java b/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java
new file mode 100644
index 0000000..5b334cc
--- /dev/null
+++ b/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java
@@ -0,0 +1,94 @@
+package org.apache.cassandra.db.commitlog;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
+public class ComitLogStress
+{
+
+    public static final String format = "%s,%s,%s,%s,%s,%s";
+
+    public static void main(String[] args) throws Exception {
+        int NUM_THREADS = Runtime.getRuntime().availableProcessors();
+        if (args.length >= 1) {
+            NUM_THREADS = Integer.parseInt(args[0]);
+            System.out.println("Setting num threads to: " + NUM_THREADS);
+        }
+        ExecutorService executor = new JMXEnabledThreadPoolExecutor(NUM_THREADS, NUM_THREADS, 60,
+                TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(10 * NUM_THREADS), new NamedThreadFactory(""), "");
+        ScheduledExecutorService scheduled = Executors.newScheduledThreadPool(1);
+
+        org.apache.cassandra.SchemaLoader.loadSchema();
+        final AtomicLong count = new AtomicLong();
+        final long start = System.currentTimeMillis();
+        System.out.println(String.format(format, "seconds", "max_mb", "allocated_mb", "free_mb", "diffrence", "count"));
+        scheduled.scheduleAtFixedRate(new Runnable() {
+            long lastUpdate = 0;
+
+            public void run() {
+                Runtime runtime = Runtime.getRuntime();
+                long maxMemory = mb(runtime.maxMemory());
+                long allocatedMemory = mb(runtime.totalMemory());
+                long freeMemory = mb(runtime.freeMemory());
+                long temp = count.get();
+                System.out.println(String.format(format, ((System.currentTimeMillis() - start) / 1000),
+                        maxMemory, allocatedMemory, freeMemory, (temp - lastUpdate), lastUpdate));
+                lastUpdate = temp;
+            }
+        }, 1, 1, TimeUnit.SECONDS);
+
+        while (true) {
+            executor.execute(new CommitlogExecutor());
+            count.incrementAndGet();
+        }
+    }
+
+    private static long mb(long maxMemory) {
+        return maxMemory / (1024 * 1024);
+    }
+
+    static final String keyString = UUIDGen.getTimeUUID().toString();
+    public static class CommitlogExecutor implements Runnable {
+        public void run() {
+            String ks = "Keyspace1";
+            ByteBuffer key = ByteBufferUtil.bytes(keyString);
+            Mutation mutation = new Mutation(ks, key);
+            mutation.add("Standard1", Util.cellname("name"), ByteBufferUtil.bytes("value"),
+                    System.currentTimeMillis());
+            CommitLog.instance.add(mutation);
+        }
+    }
+}
diff --git a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
index 21c6457..94bc09f 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
@@ -81,7 +81,7 @@
             for (int j = 0; j < rowsPerSSTable; j++)
             {
                 String key = String.valueOf(j);
-                Column[] cols = new Column[colsPerRow];
+                Cell[] cols = new Cell[colsPerRow];
                 for (int i = 0; i < colsPerRow; i++)
                 {
                     // last sstable has highest timestamps
@@ -99,7 +99,7 @@
 
         long start = System.nanoTime();
         final int gcBefore = (int) (System.currentTimeMillis() / 1000) - Schema.instance.getCFMetaData(KEYSPACE1, "Standard1").getGcGraceSeconds();
-        new CompactionTask(store, sstables, gcBefore).execute(null);
+        new CompactionTask(store, sstables, gcBefore, false).execute(null);
         System.out.println(String.format("%s: sstables=%d rowsper=%d colsper=%d: %d ms",
                                          this.getClass().getName(),
                                          sstableCount,
@@ -117,7 +117,7 @@
         cfs.clearUnsafe();
 
         final int ROWS_PER_SSTABLE = 10;
-        final int SSTABLES = cfs.metadata.getIndexInterval() * 3 / ROWS_PER_SSTABLE;
+        final int SSTABLES = cfs.metadata.getMinIndexInterval() * 3 / ROWS_PER_SSTABLE;
 
         // disable compaction while flushing
         cfs.disableAutoCompaction();
@@ -127,9 +127,9 @@
         for (int j = 0; j < SSTABLES; j++) {
             for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
                 DecoratedKey key = Util.dk(String.valueOf(i % 2));
-                RowMutation rm = new RowMutation(KEYSPACE1, key.key);
+                Mutation rm = new Mutation(KEYSPACE1, key.getKey());
                 long timestamp = j * ROWS_PER_SSTABLE + i;
-                rm.add("Standard1", ByteBufferUtil.bytes(String.valueOf(i / 2)),
+                rm.add("Standard1", Util.cellname(String.valueOf(i / 2)),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp);
                 maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
index c6b6eb0..b071001 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
@@ -27,13 +27,8 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
 public class LongLeveledCompactionStrategyTest extends SchemaLoader
@@ -59,10 +54,10 @@
         for (int r = 0; r < rows; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
+            Mutation rm = new Mutation(ksname, key.getKey());
             for (int c = 0; c < columns; c++)
             {
-                rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
+                rm.add(cfname, Util.cellname("column" + c), value, 0);
             }
             rm.apply();
             store.forceBlockingFlush();
@@ -77,7 +72,7 @@
         {
             while (true)
             {
-                final AbstractCompactionTask t = lcs.getMaximalTask(Integer.MIN_VALUE);
+                final AbstractCompactionTask t = lcs.getMaximalTask(Integer.MIN_VALUE).iterator().next();
                 if (t == null)
                     break;
                 tasks.add(new Runnable()
@@ -106,7 +101,7 @@
         {
             List<SSTableReader> sstables = manifest.getLevel(level);
             // score check
-            assert (double) SSTable.getTotalBytes(sstables) / manifest.maxBytesForLevel(level) < 1.00;
+            assert (double) SSTableReader.getTotalBytes(sstables) / manifest.maxBytesForLevel(level) < 1.00;
             // overlap check for levels greater than 0
             if (level > 0)
             {
diff --git a/test/long/org/apache/cassandra/utils/LongBTreeTest.java b/test/long/org/apache/cassandra/utils/LongBTreeTest.java
new file mode 100644
index 0000000..76ff2bf
--- /dev/null
+++ b/test/long/org/apache/cassandra/utils/LongBTreeTest.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import javax.annotation.Nullable;
+
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.ListenableFutureTask;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.yammer.metrics.Metrics;
+import com.yammer.metrics.core.Timer;
+import com.yammer.metrics.core.TimerContext;
+import com.yammer.metrics.stats.Snapshot;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSet;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+// TODO : should probably lower fan-factor for tests to make them more intensive
+public class LongBTreeTest
+{
+
+    private static final Timer BTREE_TIMER = Metrics.newTimer(BTree.class, "BTREE", TimeUnit.NANOSECONDS, TimeUnit.NANOSECONDS);
+    private static final Timer TREE_TIMER = Metrics.newTimer(BTree.class, "TREE", TimeUnit.NANOSECONDS, TimeUnit.NANOSECONDS);
+    private static final ExecutorService MODIFY = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(), new NamedThreadFactory("MODIFY"));
+    private static final ExecutorService COMPARE = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(), new NamedThreadFactory("COMPARE"));
+    private static final RandomAbort<Integer> SPORADIC_ABORT = new RandomAbort<>(new Random(), 0.0001f);
+
+    static
+    {
+        System.setProperty("cassandra.btree.fanfactor", "4");
+    }
+
+    @Test
+    public void testOversizedMiddleInsert()
+    {
+        TreeSet<Integer> canon = new TreeSet<>();
+        for (int i = 0 ; i < 10000000 ; i++)
+            canon.add(i);
+        Object[] btree = BTree.build(Arrays.asList(Integer.MIN_VALUE, Integer.MAX_VALUE), ICMP, true, null);
+        btree = BTree.update(btree, ICMP, canon, true);
+        canon.add(Integer.MIN_VALUE);
+        canon.add(Integer.MAX_VALUE);
+        Assert.assertTrue(BTree.isWellFormed(btree, ICMP));
+        testEqual("Oversize", BTree.<Integer>slice(btree, true), canon.iterator());
+    }
+
+    @Test
+    public void testIndividualInsertsSmallOverlappingRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(10000000, 50, 1, 1, true);
+    }
+
+    @Test
+    public void testBatchesSmallOverlappingRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(10000000, 50, 1, 5, true);
+    }
+
+    @Test
+    public void testIndividualInsertsMediumSparseRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(10000000, 500, 10, 1, true);
+    }
+
+    @Test
+    public void testBatchesMediumSparseRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(10000000, 500, 10, 10, true);
+    }
+
+    @Test
+    public void testLargeBatchesLargeRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(100000000, 5000, 3, 100, true);
+    }
+
+    @Test
+    public void testSlicingSmallRandomTrees() throws ExecutionException, InterruptedException
+    {
+        testInsertions(10000, 50, 10, 10, false);
+    }
+
+    private static void testInsertions(int totalCount, int perTestCount, int testKeyRatio, int modificationBatchSize, boolean quickEquality) throws ExecutionException, InterruptedException
+    {
+        int batchesPerTest = perTestCount / modificationBatchSize;
+        int maximumRunLength = 100;
+        int testKeyRange = perTestCount * testKeyRatio;
+        int tests = totalCount / perTestCount;
+        System.out.println(String.format("Performing %d tests of %d operations, with %.2f max size/key-range ratio in batches of ~%d ops",
+                tests, perTestCount, 1 / (float) testKeyRatio, modificationBatchSize));
+
+        // if we're not doing quick-equality, we can spam with garbage for all the checks we perform, so we'll split the work into smaller chunks
+        int chunkSize = quickEquality ? tests : (int) (100000 / Math.pow(perTestCount, 2));
+        for (int chunk = 0 ; chunk < tests ; chunk += chunkSize)
+        {
+            final List<ListenableFutureTask<List<ListenableFuture<?>>>> outer = new ArrayList<>();
+            for (int i = 0 ; i < chunkSize ; i++)
+            {
+                outer.add(doOneTestInsertions(testKeyRange, maximumRunLength, modificationBatchSize, batchesPerTest, quickEquality));
+            }
+
+            final List<ListenableFuture<?>> inner = new ArrayList<>();
+            int complete = 0;
+            int reportInterval = totalCount / 100;
+            int lastReportAt = 0;
+            for (ListenableFutureTask<List<ListenableFuture<?>>> f : outer)
+            {
+                inner.addAll(f.get());
+                complete += perTestCount;
+                if (complete - lastReportAt >= reportInterval)
+                {
+                    System.out.println(String.format("Completed %d of %d operations", (chunk * perTestCount) + complete, totalCount));
+                    lastReportAt = complete;
+                }
+            }
+            Futures.allAsList(inner).get();
+        }
+        Snapshot snap = BTREE_TIMER.getSnapshot();
+        System.out.println(String.format("btree   : %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile()));
+        snap = TREE_TIMER.getSnapshot();
+        System.out.println(String.format("snaptree: %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile()));
+        System.out.println("Done");
+    }
+
+    private static ListenableFutureTask<List<ListenableFuture<?>>> doOneTestInsertions(final int upperBound, final int maxRunLength, final int averageModsPerIteration, final int iterations, final boolean quickEquality)
+    {
+        ListenableFutureTask<List<ListenableFuture<?>>> f = ListenableFutureTask.create(new Callable<List<ListenableFuture<?>>>()
+        {
+            @Override
+            public List<ListenableFuture<?>> call()
+            {
+                final List<ListenableFuture<?>> r = new ArrayList<>();
+                NavigableMap<Integer, Integer> canon = new TreeMap<>();
+                Object[] btree = BTree.empty();
+                final TreeMap<Integer, Integer> buffer = new TreeMap<>();
+                final Random rnd = new Random();
+                for (int i = 0 ; i < iterations ; i++)
+                {
+                    buffer.clear();
+                    int mods = (averageModsPerIteration >> 1) + 1 + rnd.nextInt(averageModsPerIteration);
+                    while (mods > 0)
+                    {
+                        int v = rnd.nextInt(upperBound);
+                        int rc = Math.max(0, Math.min(mods, maxRunLength) - 1);
+                        int c = 1 + (rc <= 0 ? 0 : rnd.nextInt(rc));
+                        for (int j = 0 ; j < c ; j++)
+                        {
+                            buffer.put(v, v);
+                            v++;
+                        }
+                        mods -= c;
+                    }
+                    TimerContext ctxt;
+                    ctxt = TREE_TIMER.time();
+                    canon.putAll(buffer);
+                    ctxt.stop();
+                    ctxt = BTREE_TIMER.time();
+                    Object[] next = null;
+                    while (next == null)
+                        next = BTree.update(btree, ICMP, buffer.keySet(), true, SPORADIC_ABORT);
+                    btree = next;
+                    ctxt.stop();
+
+                    if (!BTree.isWellFormed(btree, ICMP))
+                    {
+                        System.out.println("ERROR: Not well formed");
+                        throw new AssertionError("Not well formed!");
+                    }
+                    if (quickEquality)
+                        testEqual("", BTree.<Integer>slice(btree, true), canon.keySet().iterator());
+                    else
+                        r.addAll(testAllSlices("RND", btree, new TreeSet<>(canon.keySet())));
+                }
+                return r;
+            }
+        });
+        MODIFY.execute(f);
+        return f;
+    }
+
+    @Test
+    public void testSlicingAllSmallTrees() throws ExecutionException, InterruptedException
+    {
+        Object[] cur = BTree.empty();
+        TreeSet<Integer> canon = new TreeSet<>();
+        // we set FAN_FACTOR to 4, so 128 items is four levels deep, three fully populated
+        for (int i = 0 ; i < 128 ; i++)
+        {
+            String id = String.format("[0..%d)", canon.size());
+            System.out.println("Testing " + id);
+            Futures.allAsList(testAllSlices(id, cur, canon)).get();
+            Object[] next = null;
+            while (next == null)
+                next = BTree.update(cur, ICMP, Arrays.asList(i), true, SPORADIC_ABORT);
+            cur = next;
+            canon.add(i);
+        }
+    }
+
+    static final Comparator<Integer> ICMP = new Comparator<Integer>()
+    {
+        @Override
+        public int compare(Integer o1, Integer o2)
+        {
+            return Integer.compare(o1, o2);
+        }
+    };
+
+    private static List<ListenableFuture<?>> testAllSlices(String id, Object[] btree, NavigableSet<Integer> canon)
+    {
+        List<ListenableFuture<?>> waitFor = new ArrayList<>();
+        testAllSlices(id + " ASC", new BTreeSet<>(btree, ICMP), canon, true, waitFor);
+        testAllSlices(id + " DSC", new BTreeSet<>(btree, ICMP).descendingSet(), canon.descendingSet(), false, waitFor);
+        return waitFor;
+    }
+
+    private static void testAllSlices(String id, NavigableSet<Integer> btree, NavigableSet<Integer> canon, boolean ascending, List<ListenableFuture<?>> results)
+    {
+        testOneSlice(id, btree, canon, results);
+        for (Integer lb : range(canon.size(), Integer.MIN_VALUE, ascending))
+        {
+            // test head/tail sets
+            testOneSlice(String.format("%s->[%d..)", id, lb), btree.headSet(lb, true), canon.headSet(lb, true), results);
+            testOneSlice(String.format("%s->(%d..)", id, lb), btree.headSet(lb, false), canon.headSet(lb, false), results);
+            testOneSlice(String.format("%s->(..%d]", id, lb), btree.tailSet(lb, true), canon.tailSet(lb, true), results);
+            testOneSlice(String.format("%s->(..%d]", id, lb), btree.tailSet(lb, false), canon.tailSet(lb, false), results);
+            for (Integer ub : range(canon.size(), lb, ascending))
+            {
+                // test subsets
+                testOneSlice(String.format("%s->[%d..%d]", id, lb, ub), btree.subSet(lb, true, ub, true), canon.subSet(lb, true, ub, true), results);
+                testOneSlice(String.format("%s->(%d..%d]", id, lb, ub), btree.subSet(lb, false, ub, true), canon.subSet(lb, false, ub, true), results);
+                testOneSlice(String.format("%s->[%d..%d)", id, lb, ub), btree.subSet(lb, true, ub, false), canon.subSet(lb, true, ub, false), results);
+                testOneSlice(String.format("%s->(%d..%d)", id, lb, ub), btree.subSet(lb, false, ub, false), canon.subSet(lb, false, ub, false), results);
+            }
+        }
+    }
+
+    private static void testOneSlice(final String id, final NavigableSet<Integer> test, final NavigableSet<Integer> canon, List<ListenableFuture<?>> results)
+    {
+        ListenableFutureTask<?> f = ListenableFutureTask.create(new Runnable()
+        {
+
+            @Override
+            public void run()
+            {
+                test(id + " Count", test.size(), canon.size());
+                testEqual(id, test.iterator(), canon.iterator());
+                testEqual(id + "->DSCI", test.descendingIterator(), canon.descendingIterator());
+                testEqual(id + "->DSCS", test.descendingSet().iterator(), canon.descendingSet().iterator());
+                testEqual(id + "->DSCS->DSCI", test.descendingSet().descendingIterator(), canon.descendingSet().descendingIterator());
+            }
+        }, null);
+        results.add(f);
+        COMPARE.execute(f);
+    }
+
+    private static void test(String id, int test, int expect)
+    {
+        if (test != expect)
+        {
+            System.out.println(String.format("%s: Expected %d, Got %d", id, expect, test));
+        }
+    }
+
+    private static <V> void testEqual(String id, Iterator<V> btree, Iterator<V> canon)
+    {
+        boolean equal = true;
+        while (btree.hasNext() && canon.hasNext())
+        {
+            Object i = btree.next();
+            Object j = canon.next();
+            if (!i.equals(j))
+            {
+                System.out.println(String.format("%s: Expected %d, Got %d", id, j, i));
+                equal = false;
+            }
+        }
+        while (btree.hasNext())
+        {
+            System.out.println(String.format("%s: Expected <Nil>, Got %d", id, btree.next()));
+            equal = false;
+        }
+        while (canon.hasNext())
+        {
+            System.out.println(String.format("%s: Expected %d, Got Nil", id, canon.next()));
+            equal = false;
+        }
+        if (!equal)
+            throw new AssertionError("Not equal");
+    }
+
+    // should only be called on sets that range from 0->N or N->0
+    private static final Iterable<Integer> range(final int size, final int from, final boolean ascending)
+    {
+        return new Iterable<Integer>()
+        {
+            int cur;
+            int delta;
+            int end;
+            {
+                if (ascending)
+                {
+                    end = size + 1;
+                    cur = from == Integer.MIN_VALUE ? -1 : from;
+                    delta = 1;
+                }
+                else
+                {
+                    end = -2;
+                    cur = from == Integer.MIN_VALUE ? size : from;
+                    delta = -1;
+                }
+            }
+            @Override
+            public Iterator<Integer> iterator()
+            {
+                return new Iterator<Integer>()
+                {
+                    @Override
+                    public boolean hasNext()
+                    {
+                        return cur != end;
+                    }
+
+                    @Override
+                    public Integer next()
+                    {
+                        Integer r = cur;
+                        cur += delta;
+                        return r;
+                    }
+
+                    @Override
+                    public void remove()
+                    {
+                        throw new UnsupportedOperationException();
+                    }
+                };
+            }
+        };
+    }
+
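+    /** UpdateFunction that randomly requests an abort; callers above retry until BTree.update succeeds, exercising the abort path. */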
+    private static final class RandomAbort<V> implements UpdateFunction<V>
+    {
+        final Random rnd;
+        final float chance;
+        private RandomAbort(Random rnd, float chance)
+        {
+            this.rnd = rnd;
+            this.chance = chance;
+        }
+
+        public V apply(V replacing, V update)
+        {
+            return update;
+        }
+
+        public boolean abortEarly()
+        {
+            return rnd.nextFloat() < chance;
+        }
+
+        public void allocated(long heapSize)
+        {
+
+        }
+
+        public V apply(V v)
+        {
+            return v;
+        }
+    }
+
+}
diff --git a/test/pig/org/apache/cassandra/pig/PigTestBase.java b/test/pig/org/apache/cassandra/pig/PigTestBase.java
index ed307f4..4b3e422 100644
--- a/test/pig/org/apache/cassandra/pig/PigTestBase.java
+++ b/test/pig/org/apache/cassandra/pig/PigTestBase.java
@@ -68,7 +68,12 @@
     protected static String defaultParameters= "init_address=localhost&rpc_port=9170&partitioner=org.apache.cassandra.dht.ByteOrderedPartitioner";
     protected static String nativeParameters = "&core_conns=2&max_conns=10&min_simult_reqs=3&max_simult_reqs=10&native_timeout=10000000"  +
                                                "&native_read_timeout=10000000&send_buff_size=4096&receive_buff_size=4096&solinger=3" +
-                                               "&tcp_nodelay=true&reuse_address=true&keep_alive=true&native_port=9052";
+                                               "&tcp_nodelay=true&reuse_address=true&keep_alive=true&native_port=9042";
+
+    static
+    {
+        System.setProperty("logback.configurationFile", "logback-test.xml");
+    }
 
     @AfterClass
     public static void oneTimeTearDown() throws Exception {
@@ -88,7 +93,7 @@
 
     protected static Cassandra.Client getClient() throws TTransportException
     {
-        TTransport tr = new TFramedTransport(new TSocket("localhost", DatabaseDescriptor.getRpcPort()));
+        TTransport tr = new TFramedTransport(new TSocket("localhost", 9170));
         TProtocol proto = new TBinaryProtocol(tr);
         Cassandra.Client client = new Cassandra.Client(proto);
         tr.open();
diff --git a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java b/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java
index 15f402e..8903297 100644
--- a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java
+++ b/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java
@@ -601,7 +601,8 @@
         }
     }
 
-    @Test
+    /** This test case fails due to antlr lib conflicts: Cassandra 2.1 uses 3.2, Hive 1.2 uses 3.4 */
+    //@Test
     public void testCassandraStorageCompositeColumnCF() throws IOException, ClassNotFoundException, TException, TimedOutException, NotFoundException, InvalidRequestException, NoSuchFieldException, UnavailableException, IllegalAccessException, InstantiationException, AuthenticationException, AuthorizationException
     {
         //Test CompositeType
diff --git a/test/pig/org/apache/pig/test/MiniCluster.java b/test/pig/org/apache/pig/test/MiniCluster.java
index 3216392..e8f1f6e 100644
--- a/test/pig/org/apache/pig/test/MiniCluster.java
+++ b/test/pig/org/apache/pig/test/MiniCluster.java
@@ -20,6 +20,7 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStream;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
@@ -59,7 +60,10 @@
             m_conf.set("mapred.map.max.attempts", "2");
             m_conf.set("mapred.reduce.max.attempts", "2");
             m_conf.set("pig.jobcontrol.sleep", "100");
-            m_conf.writeXml(new FileOutputStream(conf_file));
+            try (OutputStream os = new FileOutputStream(conf_file))
+            {
+                m_conf.writeXml(os);
+            }
 
             // Set the system properties needed by Pig
             System.setProperty("cluster", m_conf.get("mapred.job.tracker"));
diff --git a/test/resources/CQLTable.json b/test/resources/CQLTable.json
new file mode 100644
index 0000000..af15f70
--- /dev/null
+++ b/test/resources/CQLTable.json
@@ -0,0 +1,10 @@
+[
+{"key": "00000001",
+ "cells": [["","",1408056347831000],
+           ["v1","NY",1408056347831000],
+           ["v2","1980",1408056347831000]]},
+{"key": "00000002",
+ "cells": [["","",1408056347812000],
+           ["v1","CA",1408056347812000],
+           ["v2","2014",1408056347812000]]}
+]
diff --git a/test/resources/CounterCF.json b/test/resources/CounterCF.json
index b6bca73..dcd087f 100644
--- a/test/resources/CounterCF.json
+++ b/test/resources/CounterCF.json
@@ -1,3 +1,3 @@
 [
- {"key": "726f7741", "columns": [["636f6c4141", "000100008c619170467411e00000fe8ebeead9ee0000000000000001000000000000002a", 1294532915068, "c", 0]]}
+ {"key": "726f7741", "cells": [["636f6c4141", "000100008c619170467411e00000fe8ebeead9ee0000000000000001000000000000002a", 1294532915068, "c", 0]]}
 ]
diff --git a/test/resources/SimpleCF.json b/test/resources/SimpleCF.json
index 45f57eb..7ef95f6 100644
--- a/test/resources/SimpleCF.json
+++ b/test/resources/SimpleCF.json
@@ -1,4 +1,4 @@
 [
- {"key": "726f7741", "columns": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]},
- {"key": "726f7742", "columns": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]}
+ {"key": "726f7741", "cells": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]},
+ {"key": "726f7742", "cells": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]}
 ]
diff --git a/test/resources/SimpleCF.oldformat.json b/test/resources/SimpleCF.oldformat.json
deleted file mode 100644
index d920cfb..0000000
--- a/test/resources/SimpleCF.oldformat.json
+++ /dev/null
@@ -1,4 +0,0 @@
-[
- {"key": "726f7741", "columns": [["636f6c4141", "76616c4141", 1294532915068, false], ["636f6c4142", "76616c4142", 1294532915069, false], ["636f6c4143", "76616c4143", 1294532915071, false, 42, 2000000000 ]]},
- {"key": "726f7742", "columns": [["636f6c4241", "76616c4241", 1294532915070, false], ["636f6c4242", "76616c4242", 1294532915073, false]]}
-]
diff --git a/test/resources/SimpleCFWithDeletionInfo.json b/test/resources/SimpleCFWithDeletionInfo.json
index ef673f2..5090699 100644
--- a/test/resources/SimpleCFWithDeletionInfo.json
+++ b/test/resources/SimpleCFWithDeletionInfo.json
@@ -1,4 +1,4 @@
 [
- {"key": "726f7741","metadata":{"deletionInfo":{"markedForDeleteAt":0,"localDeletionTime":0}}, "columns": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]},
- {"key": "726f7742","metadata":{"deletionInfo":{"markedForDeleteAt":0,"localDeletionTime":0}}, "columns": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]}
+ {"key": "726f7741","metadata":{"deletionInfo":{"markedForDeleteAt":0,"localDeletionTime":0}}, "cells": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]},
+ {"key": "726f7742","metadata":{"deletionInfo":{"markedForDeleteAt":0,"localDeletionTime":0}}, "cells": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]}
 ]
diff --git a/test/resources/SuperCF.json b/test/resources/SuperCF.json
deleted file mode 100644
index 5afec7f..0000000
--- a/test/resources/SuperCF.json
+++ /dev/null
@@ -1,4 +0,0 @@
-[
- {"key": "726f7741", "columns": {"737570657241": {"metadata": {"deletionInfo": {"markedForDeleteAt":0,"localDeletionTime":0}}, "subColumns": [["636f6c4141", "76616c75654141", 1294532915069], ["636f6c4142", "76616c75654142", 1294532915069]]}}},
- {"key": "726f7742", "columns": {"737570657242": {"subColumns": [["636f6c4241", "76616c75654241", 1294532915069], ["636f6c4242", "76616c75654242", 1294532915069]]}}}
-]
diff --git a/test/resources/UnsortedCF.json b/test/resources/UnsortedCF.json
index 814f182..64ef24f 100644
--- a/test/resources/UnsortedCF.json
+++ b/test/resources/UnsortedCF.json
@@ -1,4 +1,4 @@
 [
- {"key": "726f7742", "columns": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]},
- {"key": "726f7741", "columns": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]}
+ {"key": "726f7742", "cells": [["636f6c4241", "76616c4241", 1294532915070], ["636f6c4242", "76616c4242", 1294532915073]]},
+ {"key": "726f7741", "cells": [["636f6c4141", "76616c4141", 1294532915068], ["636f6c4142", "76616c4142", 1294532915069], ["636f6c4143", "76616c4143", 1294532915071, "e", 42, 2000000000]]}
 ]
diff --git a/test/resources/UnsortedSuperCF.json b/test/resources/UnsortedSuperCF.json
deleted file mode 100644
index bd07e81..0000000
--- a/test/resources/UnsortedSuperCF.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
- {"key": "303935", "columns": { "5330": {"deletedAt": -9223372036854775808, "subColumns": [["4330", "366338333439636337323630", 1294656637116, false], ["4331", "366338333439636337323630", 1294656637116, false], ["4332", "366338333439636337323630", 1294656637116, false], ["4333", "366338333439636337323630", 1294656637116, false], ["4334", "366338333439636337323630", 1294656637116, false]]}}}  ,
- {"key": "303630", "columns": { "5330": {"deletedAt": -9223372036854775808, "subColumns": [["4330", "643364393434363830326134", 1294656636902, false], ["4331", "643364393434363830326134", 1294656636902, false], ["4332", "643364393434363830326134", 1294656636902, false], ["4333", "643364393434363830326134", 1294656636902, false], ["4334", "643364393434363830326134", 1294656636902, false]]}}}  ,
- {"key": "303638", "columns": { "5330": {"deletedAt": -9223372036854775808, "subColumns": [["4330", "366634393232663435353638", 1294656636885, false], ["4331", "366634393232663435353638", 1294656636885, false], ["4332", "366634393232663435353638", 1294656636885, false], ["4333", "366634393232663435353638", 1294656636885, false], ["4334", "366634393232663435353638", 1294656636885, false]]}}}
-]
diff --git a/test/resources/functions/install_cassandra.sh b/test/resources/functions/install_cassandra.sh
index 59f3e98..278c122 100644
--- a/test/resources/functions/install_cassandra.sh
+++ b/test/resources/functions/install_cassandra.sh
@@ -35,7 +35,7 @@
   mkdir -p /mnt/cassandra/logs
   ln -s /mnt/cassandra/logs $C_LOG_DIR
   mkdir -p $C_CONF_DIR
-  cp $CASSANDRA_HOME/conf/log4j*.properties $C_CONF_DIR
+  cp $CASSANDRA_HOME/conf/logback*.xml $C_CONF_DIR
   if [[ "0.6" == "$C_MAJOR_VERSION" ]] ; then 
     cp $CASSANDRA_HOME/conf/storage-conf.xml $C_CONF_DIR
     sed -i -e "s|CASSANDRA_CONF=\$cassandra_home/conf|CASSANDRA_CONF=$C_CONF_DIR|" $CASSANDRA_HOME/bin/cassandra.in.sh
diff --git a/test/system/__init__.py b/test/system/__init__.py
deleted file mode 100644
index 669bf6d..0000000
--- a/test/system/__init__.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys, time, signal, httplib, errno, uuid
-
-__all__ = ['root', 'thrift_client']
-
-from thrift.transport import TTransport
-from thrift.transport import TSocket
-from thrift.transport import THttpClient
-from thrift.protocol import TBinaryProtocol
-
-# add cassandra directory to sys.path
-L = os.path.abspath(__file__).split(os.path.sep)[:-3]
-root = os.path.sep.join(L)
-_ipath = os.path.join(root, 'interface', 'thrift', 'gen-py')
-sys.path.append(os.path.join(_ipath, 'cassandra'))
-import Cassandra
-
-def get_thrift_client(host='127.0.0.1', port=9170):
-    socket = TSocket.TSocket(host, port)
-    transport = TTransport.TFramedTransport(socket)
-    protocol = TBinaryProtocol.TBinaryProtocol(transport)
-    client = Cassandra.Client(protocol)
-    client.transport = transport
-    return client
-thrift_client = get_thrift_client()
-
-pid_fname = "system_test.pid"
-def pid():
-    return int(open(pid_fname).read())
-
-class BaseTester(object):
-    # leave this True unless you are manually starting a server and then
-    # running only a single test against it; tests assume they start
-    # against an empty db.
-    runserver = True
-    client = None
-    extra_args = []
-
-    def open_client(self):
-        raise NotImplementedError()
-
-    def close_client(self):
-        raise NotImplementedError()
-    
-    def define_schema(self):
-        raise NotImplementedError()
-
-    def setUp(self):
-        if self.runserver:
-            if os.path.exists(pid_fname):
-                pid_path = os.path.join(root, pid_fname)
-                print "Unclean shutdown detected, (%s found)" % pid_path
-                raise Exception('damn it')
-
-            # clean out old stuff
-            import shutil
-            # todo get directories from conf/cassandra.yaml
-            for dirname in ['system', 'data', 'commitlog']:
-                try:
-                    shutil.rmtree(os.path.join(root, 'build', 'test', 'cassandra', dirname))
-                except OSError:
-                    pass
-            # start the server
-            import subprocess as sp
-            os.chdir(root)
-            os.environ['CASSANDRA_INCLUDE'] = 'test/cassandra.in.sh'
-            args = ['bin/cassandra', '-p', pid_fname] + self.extra_args
-            process = sp.Popen(args, stderr=sp.PIPE, stdout=sp.PIPE)
-            time.sleep(0.1)
-
-            # connect to it, with a timeout in case something went wrong
-            start = time.time()
-            while time.time() < start + 10:
-                try:
-                    self.open_client()
-                except:
-                    time.sleep(0.1)
-                else:
-                    break
-            else:
-                print "Couldn't connect to server; aborting regression test"
-                # see if process is still alive
-                process.poll()
-                
-                if process.returncode is None:
-                    os.kill(pid(), signal.SIGKILL) # just in case
-                else:
-                    stdout_value, stderr_value = process.communicate()
-                    print "Stdout: %s" % (stdout_value)
-                    print "Stderr: %s" % (stderr_value)
-                raise Exception('damn it')
-        else:
-            try:
-                self.open_client()
-            except:
-                pass
-        
-        self.define_schema()
-
-    def tearDown(self):
-        def is_alive(pid):
-            try:
-                os.kill(pid, 0)
-                return 1
-            except OSError, err:
-                return err.errno == errno.EPERM
-
-        if self.runserver:
-            spid = pid()
-            max_wait = 1
-            self.close_client()
-            open('/tmp/kill', 'w').write('killing %s\n' % spid)
-            os.kill(spid, signal.SIGTERM)
-            slept = 0
-            while (slept < max_wait):
-                time.sleep(0.5)
-                if not is_alive(spid):
-                    break
-                slept += 0.5
-            # Give time for cassandra to shutdown
-            time.sleep(2)
-            if (slept > max_wait and is_alive(spid)):
-                os.kill(spid, signal.SIGKILL)
-                fpath = os.path.join(root, pid_fname)
-                if os.path.exists(fpath): os.unlink(fpath)
-                raise Exception('Server did not shutdown correctly')
-
-class ThriftTester(BaseTester):
-    client = thrift_client
-
-    def open_client(self):
-        self.client.transport.open()
-
-    def close_client(self):
-        self.client.transport.close()
-        
-    def define_schema(self):
-        keyspace1 = Cassandra.KsDef('Keyspace1', 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor':'1'},
-        cf_defs=[
-            Cassandra.CfDef('Keyspace1', 'Standard1'),
-            Cassandra.CfDef('Keyspace1', 'Standard2'), 
-            Cassandra.CfDef('Keyspace1', 'StandardLong1', comparator_type='LongType'), 
-            Cassandra.CfDef('Keyspace1', 'StandardLong2', comparator_type='LongType'), 
-            Cassandra.CfDef('Keyspace1', 'StandardInteger1', comparator_type='IntegerType'),
-            Cassandra.CfDef('Keyspace1', 'Super1', column_type='Super', subcomparator_type='LongType'),
-            Cassandra.CfDef('Keyspace1', 'Super2', column_type='Super', subcomparator_type='LongType'), 
-            Cassandra.CfDef('Keyspace1', 'Super3', column_type='Super', subcomparator_type='LongType'), 
-            Cassandra.CfDef('Keyspace1', 'Super4', column_type='Super', subcomparator_type='UTF8Type'),
-            Cassandra.CfDef('Keyspace1', 'Super5', column_type='Super', comparator_type='LongType', subcomparator_type='UTF8Type'),
-            Cassandra.CfDef('Keyspace1', 'Counter1', default_validation_class='CounterColumnType'),
-            Cassandra.CfDef('Keyspace1', 'SuperCounter1', column_type='Super', default_validation_class='CounterColumnType'),
-            Cassandra.CfDef('Keyspace1', 'Indexed1', column_metadata=[Cassandra.ColumnDef('birthdate', 'LongType', Cassandra.IndexType.KEYS, 'birthdate_index')]),
-            Cassandra.CfDef('Keyspace1', 'Indexed2', comparator_type='TimeUUIDType', column_metadata=[Cassandra.ColumnDef(uuid.UUID('00000000-0000-1000-0000-000000000000').bytes, 'LongType', Cassandra.IndexType.KEYS)]),
-            Cassandra.CfDef('Keyspace1', 'Indexed3', comparator_type='TimeUUIDType', column_metadata=[Cassandra.ColumnDef(uuid.UUID('00000000-0000-1000-0000-000000000000').bytes, 'UTF8Type', Cassandra.IndexType.KEYS)]),
-
-        ])
-
-        keyspace2 = Cassandra.KsDef('Keyspace2', 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor':'1'},
-        cf_defs=[
-            Cassandra.CfDef('Keyspace2', 'Standard1'),
-            Cassandra.CfDef('Keyspace2', 'Standard3'),
-            Cassandra.CfDef('Keyspace2', 'Super3', column_type='Super', subcomparator_type='BytesType'),
-            Cassandra.CfDef('Keyspace2', 'Super4', column_type='Super', subcomparator_type='TimeUUIDType'),
-        ])
-
-        for ks in [keyspace1, keyspace2]:
-            self.client.system_add_keyspace(ks)
-
-# vim:ai sw=4 ts=4 tw=0 et
diff --git a/test/system/test_thrift_server.py b/test/system/test_thrift_server.py
deleted file mode 100644
index 1e43532..0000000
--- a/test/system/test_thrift_server.py
+++ /dev/null
@@ -1,2079 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# to run a single test, run from trunk/:
-# PYTHONPATH=test nosetests --tests=system.test_thrift_server:TestMutations.test_empty_range
-
-import os, sys, time, struct, uuid, re
-
-from . import root, ThriftTester
-from . import thrift_client as client
-
-from thrift.Thrift import TApplicationException
-from ttypes import *
-
-
-def _i64(n):
-    return struct.pack('>q', n) # big endian = network order
-
-_SIMPLE_COLUMNS = [Column('c1', 'value1', 0),
-                   Column('c2', 'value2', 0)]
-_SUPER_COLUMNS = [SuperColumn(name='sc1', columns=[Column(_i64(4), 'value4', 0)]),
-                  SuperColumn(name='sc2', columns=[Column(_i64(5), 'value5', 0),
-                                                   Column(_i64(6), 'value6', 0)])]
-
-def _assert_column(column_family, key, column, value, ts = 0):
-    try:
-        assert client.get(key, ColumnPath(column_family, column=column), ConsistencyLevel.ONE).column == Column(column, value, ts)
-    except NotFoundException:
-        raise Exception('expected %s:%s:%s:%s, but was not present' % (column_family, key, column, value) )
-
-def _assert_columnpath_exists(key, column_path):
-    try:
-        assert client.get(key, column_path, ConsistencyLevel.ONE)
-    except NotFoundException:
-        raise Exception('expected %s with %s but was not present.' % (key, column_path) )
-
-def _assert_no_columnpath(key, column_path):
-    try:
-        client.get(key, column_path, ConsistencyLevel.ONE)
-        assert False, ('columnpath %s existed in %s when it should not' % (column_path, key))
-    except NotFoundException:
-        assert True, 'column did not exist'
-
-def _insert_simple(block=True):
-   return _insert_multi(['key1'])
-
-def _insert_batch(block):
-   return _insert_multi_batch(['key1'], block)
-
-def _insert_multi(keys):
-    CL = ConsistencyLevel.ONE
-    for key in keys:
-        client.insert(key, ColumnParent('Standard1'), Column('c1', 'value1', 0), CL)
-        client.insert(key, ColumnParent('Standard1'), Column('c2', 'value2', 0), CL)
-
-def _insert_multi_batch(keys, block):
-    cfmap = {'Standard1': [Mutation(ColumnOrSuperColumn(c)) for c in _SIMPLE_COLUMNS],
-             'Standard2': [Mutation(ColumnOrSuperColumn(c)) for c in _SIMPLE_COLUMNS]}
-    for key in keys:
-        client.batch_mutate({key: cfmap}, ConsistencyLevel.ONE)
-
-def _big_slice(key, column_parent):
-    p = SlicePredicate(slice_range=SliceRange('', '', False, 1000))
-    return client.get_slice(key, column_parent, p, ConsistencyLevel.ONE)
-
-def _big_multislice(keys, column_parent):
-    p = SlicePredicate(slice_range=SliceRange('', '', False, 1000))
-    return client.multiget_slice(keys, column_parent, p, ConsistencyLevel.ONE)
-
-def _verify_batch():
-    _verify_simple()
-    L = [result.column
-         for result in _big_slice('key1', ColumnParent('Standard2'))]
-    assert L == _SIMPLE_COLUMNS, L
-
-def _verify_simple():
-    assert client.get('key1', ColumnPath('Standard1', column='c1'), ConsistencyLevel.ONE).column == Column('c1', 'value1', 0)
-    L = [result.column
-         for result in _big_slice('key1', ColumnParent('Standard1'))]
-    assert L == _SIMPLE_COLUMNS, L
-
-def _insert_super(key='key1'):
-    client.insert(key, ColumnParent('Super1', 'sc1'), Column(_i64(4), 'value4', 0), ConsistencyLevel.ONE)
-    client.insert(key, ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 0), ConsistencyLevel.ONE)
-    client.insert(key, ColumnParent('Super1', 'sc2'), Column(_i64(6), 'value6', 0), ConsistencyLevel.ONE)
-    time.sleep(0.1)
-
-def _insert_range():
-    client.insert('key1', ColumnParent('Standard1'), Column('c1', 'value1', 0), ConsistencyLevel.ONE)
-    client.insert('key1', ColumnParent('Standard1'), Column('c2', 'value2', 0), ConsistencyLevel.ONE)
-    client.insert('key1', ColumnParent('Standard1'), Column('c3', 'value3', 0), ConsistencyLevel.ONE)
-    time.sleep(0.1)
-
-def _insert_counter_range():
-    client.add('key1', ColumnParent('Counter1'), CounterColumn('c1', 1), ConsistencyLevel.ONE)
-    client.add('key1', ColumnParent('Counter1'), CounterColumn('c2', 2), ConsistencyLevel.ONE)
-    client.add('key1', ColumnParent('Counter1'), CounterColumn('c3', 3), ConsistencyLevel.ONE)
-    time.sleep(0.1)
-
-def _verify_range():
-    p = SlicePredicate(slice_range=SliceRange('c1', 'c2', False, 1000))
-    result = client.get_slice('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].column.name == 'c1'
-    assert result[1].column.name == 'c2'
-
-    p = SlicePredicate(slice_range=SliceRange('c3', 'c2', True, 1000))
-    result = client.get_slice('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].column.name == 'c3'
-    assert result[1].column.name == 'c2'
-
-    p = SlicePredicate(slice_range=SliceRange('a', 'z', False, 1000))
-    result = client.get_slice('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 3, result
-    
-    p = SlicePredicate(slice_range=SliceRange('a', 'z', False, 2))
-    result = client.get_slice('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2, result
-
-def _verify_counter_range():
-    p = SlicePredicate(slice_range=SliceRange('c1', 'c2', False, 1000))
-    result = client.get_slice('key1', ColumnParent('Counter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].counter_column.name == 'c1'
-    assert result[1].counter_column.name == 'c2'
-
-    p = SlicePredicate(slice_range=SliceRange('c3', 'c2', True, 1000))
-    result = client.get_slice('key1', ColumnParent('Counter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].counter_column.name == 'c3'
-    assert result[1].counter_column.name == 'c2'
-
-    p = SlicePredicate(slice_range=SliceRange('a', 'z', False, 1000))
-    result = client.get_slice('key1', ColumnParent('Counter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 3, result
-
-    p = SlicePredicate(slice_range=SliceRange('a', 'z', False, 2))
-    result = client.get_slice('key1', ColumnParent('Counter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2, result
-
-def _set_keyspace(keyspace):
-    client.set_keyspace(keyspace)
-
-def _insert_super_range():
-    client.insert('key1', ColumnParent('Super1', 'sc1'), Column(_i64(4), 'value4', 0), ConsistencyLevel.ONE)
-    client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 0), ConsistencyLevel.ONE)
-    client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(6), 'value6', 0), ConsistencyLevel.ONE)
-    client.insert('key1', ColumnParent('Super1', 'sc3'), Column(_i64(7), 'value7', 0), ConsistencyLevel.ONE)
-    time.sleep(0.1)
-
-def _insert_counter_super_range():
-    client.add('key1', ColumnParent('SuperCounter1', 'sc1'), CounterColumn(_i64(4), 4), ConsistencyLevel.ONE)
-    client.add('key1', ColumnParent('SuperCounter1', 'sc2'), CounterColumn(_i64(5), 5), ConsistencyLevel.ONE)
-    client.add('key1', ColumnParent('SuperCounter1', 'sc2'), CounterColumn(_i64(6), 6), ConsistencyLevel.ONE)
-    client.add('key1', ColumnParent('SuperCounter1', 'sc3'), CounterColumn(_i64(7), 7), ConsistencyLevel.ONE)
-    time.sleep(0.1)
-
-def _verify_super_range():
-    p = SlicePredicate(slice_range=SliceRange('sc2', 'sc3', False, 2))
-    result = client.get_slice('key1', ColumnParent('Super1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].super_column.name == 'sc2'
-    assert result[1].super_column.name == 'sc3'
-
-    p = SlicePredicate(slice_range=SliceRange('sc3', 'sc2', True, 2))
-    result = client.get_slice('key1', ColumnParent('Super1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].super_column.name == 'sc3'
-    assert result[1].super_column.name == 'sc2'
-
-def _verify_counter_super_range():
-    p = SlicePredicate(slice_range=SliceRange('sc2', 'sc3', False, 2))
-    result = client.get_slice('key1', ColumnParent('SuperCounter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].counter_super_column.name == 'sc2'
-    assert result[1].counter_super_column.name == 'sc3'
-
-    p = SlicePredicate(slice_range=SliceRange('sc3', 'sc2', True, 2))
-    result = client.get_slice('key1', ColumnParent('SuperCounter1'), p, ConsistencyLevel.ONE)
-    assert len(result) == 2
-    assert result[0].counter_super_column.name == 'sc3'
-    assert result[1].counter_super_column.name == 'sc2'
-
-def _verify_super(supercf='Super1', key='key1'):
-    assert client.get(key, ColumnPath(supercf, 'sc1', _i64(4)), ConsistencyLevel.ONE).column == Column(_i64(4), 'value4', 0)
-    slice = [result.super_column
-             for result in _big_slice(key, ColumnParent('Super1'))]
-    assert slice == _SUPER_COLUMNS, slice
-
-def _expect_exception(fn, type_):
-    try:
-        r = fn()
-    except type_, t:
-        return t
-    else:
-        raise Exception('expected %s; got %s' % (type_.__name__, r))
-    
-def _expect_missing(fn):
-    _expect_exception(fn, NotFoundException)
-
-def get_range_slice(client, parent, predicate, start, end, count, cl, row_filter=None):
-    kr = KeyRange(start, end, count=count, row_filter=row_filter)
-    return client.get_range_slices(parent, predicate, kr, cl)
-    
-
-class TestMutations(ThriftTester):
-    def test_insert(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple(False)
-        time.sleep(0.1)
-        _verify_simple()
-
-    def test_empty_slice(self):
-        _set_keyspace('Keyspace1')
-        assert _big_slice('key1', ColumnParent('Standard2')) == []
-        assert _big_slice('key1', ColumnParent('Super1')) == []
-
-    def test_cas(self):
-        _set_keyspace('Keyspace1')
-        def cas(expected, updates):
-            return client.cas('key1', 'Standard1', expected, updates, ConsistencyLevel.SERIAL, ConsistencyLevel.QUORUM)
-
-        cas_result = cas(_SIMPLE_COLUMNS, _SIMPLE_COLUMNS)
-        assert not cas_result.success
-        assert len(cas_result.current_values) == 0, cas_result
-
-        assert cas([], _SIMPLE_COLUMNS).success
-
-        result = [cosc.column for cosc in _big_slice('key1', ColumnParent('Standard1'))]
-        # CAS will use its own timestamp, so we can't just compare result == _SIMPLE_COLUMNS
-
-        cas_result = cas([], _SIMPLE_COLUMNS)
-        assert not cas_result.success
-        # When we CAS for non-existence, current_values is the first live column of the row
-        assert dict((c.name, c.value) for c in cas_result.current_values) == { _SIMPLE_COLUMNS[0].name : _SIMPLE_COLUMNS[0].value }, cas_result
-
-        # CL.SERIAL for reads
-        assert client.get('key1', ColumnPath('Standard1', column='c1'), ConsistencyLevel.SERIAL).column.value == 'value1'
-
-    def test_missing_super(self):
-        _set_keyspace('Keyspace1')
-        _expect_missing(lambda: client.get('key1', ColumnPath('Super1', 'sc1', _i64(1)), ConsistencyLevel.ONE))
-        _insert_super()
-        _expect_missing(lambda: client.get('key1', ColumnPath('Super1', 'sc1', _i64(1)), ConsistencyLevel.ONE))
-
-    def test_count(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        _insert_super()
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 1000))
-        assert client.get_count('key1', ColumnParent('Standard2'), p, ConsistencyLevel.ONE) == 0
-        assert client.get_count('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE) == 2
-        assert client.get_count('key1', ColumnParent('Super1', 'sc2'), p, ConsistencyLevel.ONE) == 2
-        assert client.get_count('key1', ColumnParent('Super1'), p, ConsistencyLevel.ONE) == 2
-
-        # Let's make that a little more interesting
-        client.insert('key1', ColumnParent('Standard1'), Column('c3', 'value3', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('c4', 'value4', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('c5', 'value5', 0), ConsistencyLevel.ONE)
-
-        p = SlicePredicate(slice_range=SliceRange('c2', 'c4', False, 1000)) 
-        assert client.get_count('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE) == 3
-
-    def test_count_paging(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-
-        # Exercise paging
-        column_parent = ColumnParent('Standard1')
-        super_column_parent = ColumnParent('Super1', 'sc3')
-        # Paging for small columns starts at 1024 columns
-        columns_to_insert = [Column('c%d' % (i,), 'value%d' % (i,), 0) for i in xrange(3, 1026)]
-        cfmap = {'Standard1': [Mutation(ColumnOrSuperColumn(c)) for c in columns_to_insert]}
-        client.batch_mutate({'key1' : cfmap }, ConsistencyLevel.ONE)
-
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 2000))
-        assert client.get_count('key1', column_parent, p, ConsistencyLevel.ONE) == 1025
-
-        # Ensure that the count limit isn't clobbered
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 10))
-        assert client.get_count('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE) == 10
-
-    # test get_count() to work correctly with 'count' settings around page size (CASSANDRA-4833)
-    def test_count_around_page_size(self):
-        def slice_predicate(count):
-            return SlicePredicate(slice_range=SliceRange('', '', False, count))
-
-        _set_keyspace('Keyspace1')
-
-        key = 'key1'
-        parent = ColumnParent('Standard1')
-        cl = ConsistencyLevel.ONE
-
-        for i in xrange(0, 3050):
-            client.insert(key, parent, Column(str(i), '', 0), cl)
-
-        # same as page size
-        assert client.get_count(key, parent, slice_predicate(1024), cl) == 1024
-
-        # 1 above page size
-        assert client.get_count(key, parent, slice_predicate(1025), cl) == 1025
-
-        # above number or columns
-        assert client.get_count(key, parent, slice_predicate(4000), cl) == 3050
-
-        # same as number of columns
-        assert client.get_count(key, parent, slice_predicate(3050), cl) == 3050
-
-        # 1 above number of columns
-        assert client.get_count(key, parent, slice_predicate(3051), cl) == 3050
-
-    def test_insert_blocking(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        _verify_simple()
-
-    def test_super_insert(self):
-        _set_keyspace('Keyspace1')
-        _insert_super()
-        _verify_super()
-
-    def test_super_get(self):
-        _set_keyspace('Keyspace1')
-        _insert_super()
-        result = client.get('key1', ColumnPath('Super1', 'sc2'), ConsistencyLevel.ONE).super_column
-        assert result == _SUPER_COLUMNS[1], result
-
-    def test_super_subcolumn_limit(self):
-        _set_keyspace('Keyspace1')
-        _insert_super()
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 1))
-        column_parent = ColumnParent('Super1', 'sc2')
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(_i64(5), 'value5', 0)], slice
-        p = SlicePredicate(slice_range=SliceRange('', '', True, 1))
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(_i64(6), 'value6', 0)], slice
-        
-    def test_long_order(self):
-        _set_keyspace('Keyspace1')
-        def long_xrange(start, stop, step):
-            i = start
-            while i < stop:
-                yield i
-                i += step
-        L = []
-        for i in long_xrange(0, 104294967296, 429496729):
-            name = _i64(i)
-            client.insert('key1', ColumnParent('StandardLong1'), Column(name, 'v', 0), ConsistencyLevel.ONE)
-            L.append(name)
-        slice = [result.column.name for result in _big_slice('key1', ColumnParent('StandardLong1'))]
-        assert slice == L, slice
-        
-    def test_integer_order(self):
-        _set_keyspace('Keyspace1')
-        def long_xrange(start, stop, step):
-            i = start
-            while i >= stop:
-                yield i
-                i -= step
-        L = []
-        for i in long_xrange(104294967296, 0, 429496729):
-            name = _i64(i)
-            client.insert('key1', ColumnParent('StandardInteger1'), Column(name, 'v', 0), ConsistencyLevel.ONE)
-            L.append(name)
-        slice = [result.column.name for result in _big_slice('key1', ColumnParent('StandardInteger1'))]
-        L.sort()
-        assert slice == L, slice
-
-    def test_time_uuid(self):
-        import uuid
-        L = []
-        _set_keyspace('Keyspace2')
-        # 100 isn't enough to fail reliably if the comparator is borked
-        for i in xrange(500):
-            L.append(uuid.uuid1())
-            client.insert('key1', ColumnParent('Super4', 'sc1'), Column(L[-1].bytes, 'value%s' % i, i), ConsistencyLevel.ONE)
-        slice = _big_slice('key1', ColumnParent('Super4', 'sc1'))
-        assert len(slice) == 500, len(slice)
-        for i in xrange(500):
-            u = slice[i].column
-            assert u.value == 'value%s' % i
-            assert u.name == L[i].bytes
-
-        p = SlicePredicate(slice_range=SliceRange('', '', True, 1))
-        column_parent = ColumnParent('Super4', 'sc1')
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(L[-1].bytes, 'value499', 499)], slice
-
-        p = SlicePredicate(slice_range=SliceRange('', L[2].bytes, False, 1000))
-        column_parent = ColumnParent('Super4', 'sc1')
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(L[0].bytes, 'value0', 0),
-                         Column(L[1].bytes, 'value1', 1),
-                         Column(L[2].bytes, 'value2', 2)], slice
-
-        p = SlicePredicate(slice_range=SliceRange(L[2].bytes, '', True, 1000))
-        column_parent = ColumnParent('Super4', 'sc1')
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(L[2].bytes, 'value2', 2),
-                         Column(L[1].bytes, 'value1', 1),
-                         Column(L[0].bytes, 'value0', 0)], slice
-
-        p = SlicePredicate(slice_range=SliceRange(L[2].bytes, '', False, 1))
-        column_parent = ColumnParent('Super4', 'sc1')
-        slice = [result.column
-                 for result in client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE)]
-        assert slice == [Column(L[2].bytes, 'value2', 2)], slice
-        
-    def test_long_remove(self):
-        column_parent = ColumnParent('StandardLong1')
-        sp = SlicePredicate(slice_range=SliceRange('', '', False, 1))
-        _set_keyspace('Keyspace1')
-        for i in xrange(10):
-            parent = ColumnParent('StandardLong1')
-
-            client.insert('key1', parent, Column(_i64(i), 'value1', 10 * i), ConsistencyLevel.ONE)
-            client.remove('key1', ColumnPath('StandardLong1'), 10 * i + 1, ConsistencyLevel.ONE)
-            slice = client.get_slice('key1', column_parent, sp, ConsistencyLevel.ONE)
-            assert slice == [], slice
-            # resurrect
-            client.insert('key1', parent, Column(_i64(i), 'value2', 10 * i + 2), ConsistencyLevel.ONE)
-            slice = [result.column
-                     for result in client.get_slice('key1', column_parent, sp, ConsistencyLevel.ONE)]
-            assert slice == [Column(_i64(i), 'value2', 10 * i + 2)], (slice, i)
-        
-    def test_integer_remove(self):
-        column_parent = ColumnParent('StandardInteger1')
-        sp = SlicePredicate(slice_range=SliceRange('', '', False, 1))
-        _set_keyspace('Keyspace1')
-        for i in xrange(10):
-            parent = ColumnParent('StandardInteger1')
-
-            client.insert('key1', parent, Column(_i64(i), 'value1', 10 * i), ConsistencyLevel.ONE)
-            client.remove('key1', ColumnPath('StandardInteger1'), 10 * i + 1, ConsistencyLevel.ONE)
-            slice = client.get_slice('key1', column_parent, sp, ConsistencyLevel.ONE)
-            assert slice == [], slice
-            # resurrect
-            client.insert('key1', parent, Column(_i64(i), 'value2', 10 * i + 2), ConsistencyLevel.ONE)
-            slice = [result.column
-                     for result in client.get_slice('key1', column_parent, sp, ConsistencyLevel.ONE)]
-            assert slice == [Column(_i64(i), 'value2', 10 * i + 2)], (slice, i)
-
-    def test_batch_insert(self):
-        _set_keyspace('Keyspace1')
-        _insert_batch(False)
-        time.sleep(0.1)
-        _verify_batch()
-
-    def test_batch_insert_blocking(self):
-        _set_keyspace('Keyspace1')
-        _insert_batch(True)
-        _verify_batch()
-        
-    def test_batch_mutate_standard_columns(self):
-        _set_keyspace('Keyspace1')
-        column_families = ['Standard1', 'Standard2']
-        keys = ['key_%d' % i for i in  range(27,32)] 
-        mutations = [Mutation(ColumnOrSuperColumn(c)) for c in _SIMPLE_COLUMNS]
-        mutation_map = dict((column_family, mutations) for column_family in column_families)
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for column_family in column_families:
-            for key in keys:
-               _assert_column(column_family, key, 'c1', 'value1')
-
-    def test_batch_mutate_standard_columns_blocking(self):
-        _set_keyspace('Keyspace1')
-        
-        column_families = ['Standard1', 'Standard2']
-        keys = ['key_%d' % i for i in  range(38,46)]
-         
-        mutations = [Mutation(ColumnOrSuperColumn(c)) for c in _SIMPLE_COLUMNS]
-        mutation_map = dict((column_family, mutations) for column_family in column_families)
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-        
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for column_family in column_families:
-            for key in keys:
-                _assert_column(column_family, key, 'c1', 'value1')
-
-    def test_batch_mutate_remove_standard_columns(self):
-        _set_keyspace('Keyspace1')
-        column_families = ['Standard1', 'Standard2']
-        keys = ['key_%d' % i for i in range(11,21)]
-        _insert_multi(keys)
-
-        mutations = [Mutation(deletion=Deletion(20, predicate=SlicePredicate(column_names=[c.name]))) for c in _SIMPLE_COLUMNS]
-        mutation_map = dict((column_family, mutations) for column_family in column_families)
-
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for column_family in column_families:
-            for c in _SIMPLE_COLUMNS:
-                for key in keys:
-                    _assert_no_columnpath(key, ColumnPath(column_family, column=c.name))
-
-    def test_batch_mutate_remove_standard_row(self):
-        _set_keyspace('Keyspace1')
-        column_families = ['Standard1', 'Standard2']
-        keys = ['key_%d' % i for i in range(11,21)]
-        _insert_multi(keys)
-
-        mutations = [Mutation(deletion=Deletion(20))]
-        mutation_map = dict((column_family, mutations) for column_family in column_families)
-
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for column_family in column_families:
-            for c in _SIMPLE_COLUMNS:
-                for key in keys:
-                    _assert_no_columnpath(key, ColumnPath(column_family, column=c.name))
-
-    def test_batch_mutate_remove_super_columns_with_standard_under(self):
-        _set_keyspace('Keyspace1')
-        column_families = ['Super1', 'Super2']
-        keys = ['key_%d' % i for i in range(11,21)]
-        _insert_super()
-
-        mutations = []
-        for sc in _SUPER_COLUMNS:
-            names = []
-            for c in sc.columns:
-                names.append(c.name)
-            mutations.append(Mutation(deletion=Deletion(20, super_column=c.name, predicate=SlicePredicate(column_names=names))))
-
-        mutation_map = dict((column_family, mutations) for column_family in column_families)
-
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-        for column_family in column_families:
-            for sc in _SUPER_COLUMNS:
-                for c in sc.columns:
-                    for key in keys:
-                        _assert_no_columnpath(key, ColumnPath(column_family, super_column=sc.name, column=c.name))
-
-    def test_batch_mutate_remove_super_columns_with_none_given_underneath(self):
-        _set_keyspace('Keyspace1')
-        
-        keys = ['key_%d' % i for i in range(17,21)]
-
-        for key in keys:
-            _insert_super(key)
-
-        mutations = []
-
-        for sc in _SUPER_COLUMNS:
-            mutations.append(Mutation(deletion=Deletion(20,
-                                                        super_column=sc.name)))
-
-        mutation_map = {'Super1': mutations}
-
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        # Sanity check
-        for sc in _SUPER_COLUMNS:
-            for key in keys:
-                _assert_columnpath_exists(key, ColumnPath('Super1', super_column=sc.name))
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for sc in _SUPER_COLUMNS:
-            for c in sc.columns:
-                for key in keys:
-                    _assert_no_columnpath(key, ColumnPath('Super1', super_column=sc.name))
-    
-    def test_batch_mutate_remove_super_columns_entire_row(self):
-        _set_keyspace('Keyspace1')
-        
-        keys = ['key_%d' % i for i in range(17,21)]
-
-        for key in keys:
-            _insert_super(key)
-
-        mutations = []
-
-        mutations.append(Mutation(deletion=Deletion(20)))
-
-        mutation_map = {'Super1': mutations}
-
-        keyed_mutations = dict((key, mutation_map) for key in keys)
-
-        # Sanity check
-        for sc in _SUPER_COLUMNS:
-            for key in keys:
-                _assert_columnpath_exists(key, ColumnPath('Super1', super_column=sc.name))
-
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for sc in _SUPER_COLUMNS:
-          for key in keys:
-            _assert_no_columnpath(key, ColumnPath('Super1', super_column=sc.name))
-
-    def test_batch_mutate_remove_slice_standard(self):
-        _set_keyspace('Keyspace1')
-
-        columns = [Column('c1', 'value1', 0),
-                   Column('c2', 'value2', 0),
-                   Column('c3', 'value3', 0),
-                   Column('c4', 'value4', 0),
-                   Column('c5', 'value5', 0)]
-
-        for column in columns:
-            client.insert('key', ColumnParent('Standard1'), column, ConsistencyLevel.ONE)
-
-        d = Deletion(1, predicate=SlicePredicate(slice_range=SliceRange(start='c2', finish='c4')))
-        client.batch_mutate({'key': {'Standard1' : [Mutation(deletion=d)]}}, ConsistencyLevel.ONE)
-
-        _assert_columnpath_exists('key', ColumnPath('Standard1', column='c1'))
-        _assert_no_columnpath('key', ColumnPath('Standard1', column='c2'))
-        _assert_no_columnpath('key', ColumnPath('Standard1', column='c3'))
-        _assert_no_columnpath('key', ColumnPath('Standard1', column='c4'))
-        _assert_columnpath_exists('key', ColumnPath('Standard1', column='c5'))
-
-    def test_batch_mutate_remove_slice_of_entire_supercolumns(self):
-        _set_keyspace('Keyspace1')
-
-        columns = [SuperColumn(name='sc1', columns=[Column(_i64(1), 'value1', 0)]),
-                   SuperColumn(name='sc2',
-                               columns=[Column(_i64(2), 'value2', 0), Column(_i64(3), 'value3', 0)]),
-                   SuperColumn(name='sc3', columns=[Column(_i64(4), 'value4', 0)]),
-                   SuperColumn(name='sc4',
-                               columns=[Column(_i64(5), 'value5', 0), Column(_i64(6), 'value6', 0)]),
-                   SuperColumn(name='sc5', columns=[Column(_i64(7), 'value7', 0)])]
-
-        for column in columns:
-            for subcolumn in column.columns:
-                client.insert('key', ColumnParent('Super1', column.name), subcolumn, ConsistencyLevel.ONE)
-
-        d = Deletion(1, predicate=SlicePredicate(slice_range=SliceRange(start='sc2', finish='sc4')))
-        client.batch_mutate({'key': {'Super1' : [Mutation(deletion=d)]}}, ConsistencyLevel.ONE)
-
-        _assert_columnpath_exists('key', ColumnPath('Super1', super_column='sc1', column=_i64(1)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc2', column=_i64(2)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc2', column=_i64(3)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc3', column=_i64(4)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc4', column=_i64(5)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc4', column=_i64(6)))
-        _assert_columnpath_exists('key', ColumnPath('Super1', super_column='sc5', column=_i64(7)))
-
-    def test_batch_mutate_remove_slice_part_of_supercolumns(self):
-        _set_keyspace('Keyspace1')
-
-        columns = [Column(_i64(1), 'value1', 0),
-                   Column(_i64(2), 'value2', 0),
-                   Column(_i64(3), 'value3', 0),
-                   Column(_i64(4), 'value4', 0),
-                   Column(_i64(5), 'value5', 0)]
-
-        for column in columns:
-            client.insert('key', ColumnParent('Super1', 'sc1'), column, ConsistencyLevel.ONE)
-
-        r = SliceRange(start=_i64(2), finish=_i64(4))
-        d = Deletion(1, super_column='sc1', predicate=SlicePredicate(slice_range=r))
-        client.batch_mutate({'key': {'Super1' : [Mutation(deletion=d)]}}, ConsistencyLevel.ONE)
-
-        _assert_columnpath_exists('key', ColumnPath('Super1', super_column='sc1', column=_i64(1)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc1', column=_i64(2)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc1', column=_i64(3)))
-        _assert_no_columnpath('key', ColumnPath('Super1', super_column='sc1', column=_i64(4)))
-        _assert_columnpath_exists('key', ColumnPath('Super1', super_column='sc1', column=_i64(5)))
-
-    def test_batch_mutate_insertions_and_deletions(self):
-        _set_keyspace('Keyspace1')
-        
-        first_insert = SuperColumn("sc1",
-                                   columns=[Column(_i64(20), 'value20', 3),
-                                            Column(_i64(21), 'value21', 3)])
-        second_insert = SuperColumn("sc1",
-                                    columns=[Column(_i64(20), 'value20', 3),
-                                             Column(_i64(21), 'value21', 3)])
-        first_deletion = {'super_column': "sc1",
-                          'predicate': SlicePredicate(column_names=[_i64(22), _i64(23)])}
-        second_deletion = {'super_column': "sc2",
-                           'predicate': SlicePredicate(column_names=[_i64(22), _i64(23)])}
-
-        keys = ['key_30', 'key_31']
-        for key in keys:
-            sc = SuperColumn('sc1',[Column(_i64(22), 'value22', 0),
-                                    Column(_i64(23), 'value23', 0)])
-            cfmap = {'Super1': [Mutation(ColumnOrSuperColumn(super_column=sc))]}
-            client.batch_mutate({key: cfmap}, ConsistencyLevel.ONE)
-
-            sc2 = SuperColumn('sc2', [Column(_i64(22), 'value22', 0),
-                                      Column(_i64(23), 'value23', 0)])
-            cfmap2 = {'Super2': [Mutation(ColumnOrSuperColumn(super_column=sc2))]}
-            client.batch_mutate({key: cfmap2}, ConsistencyLevel.ONE)
-
-        cfmap3 = {
-            'Super1' : [Mutation(ColumnOrSuperColumn(super_column=first_insert)),
-                        Mutation(deletion=Deletion(3, **first_deletion))],
-        
-            'Super2' : [Mutation(deletion=Deletion(2, **second_deletion)),
-                        Mutation(ColumnOrSuperColumn(super_column=second_insert))]
-            }
-
-        keyed_mutations = dict((key, cfmap3) for key in keys)
-        client.batch_mutate(keyed_mutations, ConsistencyLevel.ONE)
-
-        for key in keys:
-            for c in [_i64(22), _i64(23)]:
-                _assert_no_columnpath(key, ColumnPath('Super1', super_column='sc1', column=c))
-                _assert_no_columnpath(key, ColumnPath('Super2', super_column='sc2', column=c))
-
-            for c in [_i64(20), _i64(21)]:
-                _assert_columnpath_exists(key, ColumnPath('Super1', super_column='sc1', column=c))
-                _assert_columnpath_exists(key, ColumnPath('Super2', super_column='sc1', column=c))
-
-    def test_bad_system_calls(self):
-        def duplicate_index_names():
-            _set_keyspace('Keyspace1')
-            cd1 = ColumnDef('foo', 'BytesType', IndexType.KEYS, 'i')
-            cd2 = ColumnDef('bar', 'BytesType', IndexType.KEYS, 'i')
-            cf = CfDef('Keyspace1', 'BadCF', column_metadata=[cd1, cd2])
-            client.system_add_column_family(cf)
-        _expect_exception(duplicate_index_names, InvalidRequestException)
-
-    def test_bad_batch_calls(self):
-        # mutate_does_not_accept_cosc_and_deletion_in_same_mutation
-        def too_full():
-            _set_keyspace('Keyspace1')
-            col = ColumnOrSuperColumn(column=Column("foo", 'bar', 0))
-            dele = Deletion(2, predicate=SlicePredicate(column_names=['baz']))
-            client.batch_mutate({'key_34': {'Standard1': [Mutation(col, dele)]}},
-                                 ConsistencyLevel.ONE)
-        _expect_exception(too_full, InvalidRequestException)
-
-        # test_batch_mutate_does_not_accept_cosc_on_undefined_cf:
-        def bad_cf():
-            _set_keyspace('Keyspace1')
-            col = ColumnOrSuperColumn(column=Column("foo", 'bar', 0))
-            client.batch_mutate({'key_36': {'Undefined': [Mutation(col)]}},
-                                 ConsistencyLevel.ONE)
-        _expect_exception(bad_cf, InvalidRequestException)
-
-        # test_batch_mutate_does_not_accept_deletion_on_undefined_cf
-        def bad_cf():
-            _set_keyspace('Keyspace1')
-            d = Deletion(2, predicate=SlicePredicate(column_names=['baz']))
-            client.batch_mutate({'key_37': {'Undefined':[Mutation(deletion=d)]}},
-                                 ConsistencyLevel.ONE)
-        _expect_exception(bad_cf, InvalidRequestException)
-
-        # a column value that does not match the declared validator
-        def send_string_instead_of_long():
-            _set_keyspace('Keyspace1')
-            col = ColumnOrSuperColumn(column=Column('birthdate', 'bar', 0))
-            client.batch_mutate({'key_38': {'Indexed1': [Mutation(col)]}},
-                                 ConsistencyLevel.ONE)
-        _expect_exception(send_string_instead_of_long, InvalidRequestException)
-
-    def test_column_name_lengths(self):
-        _set_keyspace('Keyspace1')
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Standard1'), Column('', 'value', 0), ConsistencyLevel.ONE), InvalidRequestException)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*1, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*127, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*128, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*129, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*255, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*256, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*257, 'value', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Standard1'), Column('x'*(2**16 - 1), 'value', 0), ConsistencyLevel.ONE)
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Standard1'), Column('x'*(2**16), 'value', 0), ConsistencyLevel.ONE), InvalidRequestException)
-
-    def test_bad_calls(self):
-        _set_keyspace('Keyspace1')
-        # missing arguments
-        _expect_exception(lambda: client.insert(None, None, None, None), TApplicationException)
-        # supercolumn in a non-super CF
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Standard1', 'x'), Column('y', 'value', 0), ConsistencyLevel.ONE), InvalidRequestException)
-        # no supercolumn in a super CF
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Super1'), Column('y', 'value', 0), ConsistencyLevel.ONE), InvalidRequestException)
-        # column but no supercolumn in remove
-        _expect_exception(lambda: client.remove('key1', ColumnPath('Super1', column='x'), 0, ConsistencyLevel.ONE), InvalidRequestException)
-        # super column in non-super CF
-        _expect_exception(lambda: client.remove('key1', ColumnPath('Standard1', 'y', 'x'), 0, ConsistencyLevel.ONE), InvalidRequestException)
-        # key too long
-        _expect_exception(lambda: client.get('x' * 2**16, ColumnPath('Standard1', column='c1'), ConsistencyLevel.ONE), InvalidRequestException)
-        # empty key
-        _expect_exception(lambda: client.get('', ColumnPath('Standard1', column='c1'), ConsistencyLevel.ONE), InvalidRequestException)
-        cfmap = {'Super1': [Mutation(ColumnOrSuperColumn(super_column=c)) for c in _SUPER_COLUMNS],
-                 'Super2': [Mutation(ColumnOrSuperColumn(super_column=c)) for c in _SUPER_COLUMNS]}
-        _expect_exception(lambda: client.batch_mutate({'': cfmap}, ConsistencyLevel.ONE), InvalidRequestException)
-        # empty column name
-        _expect_exception(lambda: client.get('key1', ColumnPath('Standard1', column=''), ConsistencyLevel.ONE), InvalidRequestException)
-        # get doesn't specify column name
-        _expect_exception(lambda: client.get('key1', ColumnPath('Standard1'), ConsistencyLevel.ONE), InvalidRequestException)
-        # supercolumn in a non-super CF
-        _expect_exception(lambda: client.get('key1', ColumnPath('Standard1', 'x', 'y'), ConsistencyLevel.ONE), InvalidRequestException)
-        # get doesn't specify supercolumn name
-        _expect_exception(lambda: client.get('key1', ColumnPath('Super1'), ConsistencyLevel.ONE), InvalidRequestException)
-        # invalid CF
-        _expect_exception(lambda: get_range_slice(client, ColumnParent('S'), SlicePredicate(column_names=['', '']), '', '', 5, ConsistencyLevel.ONE), InvalidRequestException)
-        # 'x' is not a valid Long
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Super1', 'sc1'), Column('x', 'value', 0), ConsistencyLevel.ONE), InvalidRequestException)
-        # start is not a valid Long
-        p = SlicePredicate(slice_range=SliceRange('x', '', False, 1))
-        column_parent = ColumnParent('StandardLong1')
-        _expect_exception(lambda: client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE),
-                          InvalidRequestException)
-        # start > finish
-        p = SlicePredicate(slice_range=SliceRange(_i64(10), _i64(0), False, 1))
-        column_parent = ColumnParent('StandardLong1')
-        _expect_exception(lambda: client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE),
-                          InvalidRequestException)
-        # start is not a valid Long, supercolumn version
-        p = SlicePredicate(slice_range=SliceRange('x', '', False, 1))
-        column_parent = ColumnParent('Super1', 'sc1')
-        _expect_exception(lambda: client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE),
-                          InvalidRequestException)
-        # start > finish, supercolumn version
-        p = SlicePredicate(slice_range=SliceRange(_i64(10), _i64(0), False, 1))
-        column_parent = ColumnParent('Super1', 'sc1')
-        _expect_exception(lambda: client.get_slice('key1', column_parent, p, ConsistencyLevel.ONE),
-                          InvalidRequestException)
-        # start > finish, key version
-        _expect_exception(lambda: get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['']), 'z', 'a', 1, ConsistencyLevel.ONE), InvalidRequestException)
-        # ttl must be positive
-        column = Column('cttl1', 'value1', 0, 0)
-        _expect_exception(lambda: client.insert('key1', ColumnParent('Standard1'), column, ConsistencyLevel.ONE),
-                          InvalidRequestException)
-        # don't allow super_column in Deletion for standard ColumnFamily
-        deletion = Deletion(1, 'supercolumn', None)
-        mutation = Mutation(deletion=deletion)
-        mutations = {'key' : {'Standard1' : [mutation]}}
-        _expect_exception(lambda: client.batch_mutate(mutations, ConsistencyLevel.QUORUM),
-                          InvalidRequestException)
-        # 'x' is not a valid long
-        deletion = Deletion(1, 'x', None)
-        mutation = Mutation(deletion=deletion)
-        mutations = {'key' : {'Super5' : [mutation]}}
-        _expect_exception(lambda: client.batch_mutate(mutations, ConsistencyLevel.QUORUM), InvalidRequestException)
-        # counters don't support ANY
-        _expect_exception(lambda: client.add('key1', ColumnParent('Counter1', 'x'), CounterColumn('y', 1), ConsistencyLevel.ANY), InvalidRequestException)
-
-    def test_batch_insert_super(self):
-         _set_keyspace('Keyspace1')
-         cfmap = {'Super1': [Mutation(ColumnOrSuperColumn(super_column=c))
-                             for c in _SUPER_COLUMNS],
-                  'Super2': [Mutation(ColumnOrSuperColumn(super_column=c))
-                             for c in _SUPER_COLUMNS]}
-         client.batch_mutate({'key1': cfmap}, ConsistencyLevel.ONE)
-         _verify_super('Super1')
-         _verify_super('Super2')
-
-    def test_batch_insert_super_blocking(self):
-         _set_keyspace('Keyspace1')
-         cfmap = {'Super1': [Mutation(ColumnOrSuperColumn(super_column=c)) 
-                             for c in _SUPER_COLUMNS],
-                  'Super2': [Mutation(ColumnOrSuperColumn(super_column=c))
-                             for c in _SUPER_COLUMNS]}
-         client.batch_mutate({'key1': cfmap}, ConsistencyLevel.ONE)
-         _verify_super('Super1')
-         _verify_super('Super2')
-
-    def test_cf_remove_column(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        client.remove('key1', ColumnPath('Standard1', column='c1'), 1, ConsistencyLevel.ONE)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Standard1', column='c1'), ConsistencyLevel.ONE))
-        assert client.get('key1', ColumnPath('Standard1', column='c2'), ConsistencyLevel.ONE).column \
-            == Column('c2', 'value2', 0)
-        assert _big_slice('key1', ColumnParent('Standard1')) \
-            == [ColumnOrSuperColumn(column=Column('c2', 'value2', 0))]
-
-        # New insert, make sure it shows up post-remove:
-        client.insert('key1', ColumnParent('Standard1'), Column('c3', 'value3', 0), ConsistencyLevel.ONE)
-        columns = [result.column
-                   for result in _big_slice('key1', ColumnParent('Standard1'))]
-        assert columns == [Column('c2', 'value2', 0), Column('c3', 'value3', 0)], columns
-
-        # Test resurrection.  First, re-insert the value w/ older timestamp, 
-        # and make sure it stays removed
-        client.insert('key1', ColumnParent('Standard1'), Column('c1', 'value1', 0), ConsistencyLevel.ONE)
-        columns = [result.column
-                   for result in _big_slice('key1', ColumnParent('Standard1'))]
-        assert columns == [Column('c2', 'value2', 0), Column('c3', 'value3', 0)], columns
-        # Next, w/ a newer timestamp; it should come back:
-        client.insert('key1', ColumnParent('Standard1'), Column('c1', 'value1', 2), ConsistencyLevel.ONE)
-        columns = [result.column
-                   for result in _big_slice('key1', ColumnParent('Standard1'))]
-        assert columns == [Column('c1', 'value1', 2), Column('c2', 'value2', 0), Column('c3', 'value3', 0)], columns
-
-
-    def test_cf_remove(self):
-        _set_keyspace('Keyspace1')
-        
-        _insert_simple()
-        _insert_super()
-
-        # Remove the key1:Standard1 cf; verify super is unaffected
-        client.remove('key1', ColumnPath('Standard1'), 3, ConsistencyLevel.ONE)
-        assert _big_slice('key1', ColumnParent('Standard1')) == []
-        _verify_super()
-
-        # Test resurrection.  First, re-insert a value w/ older timestamp, 
-        # and make sure it stays removed:
-        client.insert('key1', ColumnParent('Standard1'), Column('c1', 'value1', 0), ConsistencyLevel.ONE)
-        assert _big_slice('key1', ColumnParent('Standard1')) == []
-        # Next, w/ a newer timestamp; it should come back:
-        client.insert('key1', ColumnParent('Standard1'), Column('c1', 'value1', 4), ConsistencyLevel.ONE)
-        result = _big_slice('key1', ColumnParent('Standard1'))
-        assert result == [ColumnOrSuperColumn(column=Column('c1', 'value1', 4))], result
-
-        # check removing the entire super cf, too.
-        client.remove('key1', ColumnPath('Super1'), 3, ConsistencyLevel.ONE)
-        assert _big_slice('key1', ColumnParent('Super1')) == []
-        assert _big_slice('key1', ColumnParent('Super1', 'sc1')) == []
-
-
-    def test_super_cf_remove_and_range_slice(self):
-        _set_keyspace('Keyspace1')
-
-        client.insert('key3', ColumnParent('Super1', 'sc1'), Column(_i64(1), 'v1', 0), ConsistencyLevel.ONE)
-        client.remove('key3', ColumnPath('Super1', 'sc1'), 5, ConsistencyLevel.ONE)
-
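-        # the removed row still shows up in the range slice as a "range ghost": its key is returned with no columns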
-        rows = {}
-        for row in get_range_slice(client, ColumnParent('Super1'), SlicePredicate(slice_range=SliceRange('', '', False, 1000)), '', '', 1000, ConsistencyLevel.ONE):
-            scs = [cosc.super_column for cosc in row.columns]
-            rows[row.key] = scs
-        assert rows == {'key3': []}, rows
-
-    def test_super_cf_remove_column(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        _insert_super()
-
-        # Make sure remove clears out what it's supposed to, and _only_ that:
-        client.remove('key1', ColumnPath('Super1', 'sc2', _i64(5)), 5, ConsistencyLevel.ONE)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Super1', 'sc2', _i64(5)), ConsistencyLevel.ONE))
-        super_columns = [result.super_column for result in _big_slice('key1', ColumnParent('Super1'))]
-        assert super_columns == [SuperColumn(name='sc1', columns=[Column(_i64(4), 'value4', 0)]),
-                                 SuperColumn(name='sc2', columns=[Column(_i64(6), 'value6', 0)])]
-        _verify_simple()
-
-        # New insert, make sure it shows up post-remove:
-        client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(7), 'value7', 0), ConsistencyLevel.ONE)
-        super_columns_expected = [SuperColumn(name='sc1', 
-                                              columns=[Column(_i64(4), 'value4', 0)]),
-                                  SuperColumn(name='sc2', 
-                                              columns=[Column(_i64(6), 'value6', 0), Column(_i64(7), 'value7', 0)])]
-
-        super_columns = [result.super_column for result in _big_slice('key1', ColumnParent('Super1'))]
-        assert super_columns == super_columns_expected, super_columns
-
-        # Test resurrection.  First, re-insert the value w/ older timestamp, 
-        # and make sure it stays removed:
-        client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 0), ConsistencyLevel.ONE)
-
-        super_columns = [result.super_column for result in _big_slice('key1', ColumnParent('Super1'))]
-        assert super_columns == super_columns_expected, super_columns
-
-        # Next, w/ a newer timestamp; it should come back
-        client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 6), ConsistencyLevel.ONE)
-        super_columns = [result.super_column for result in _big_slice('key1', ColumnParent('Super1'))]
-        super_columns_expected = [SuperColumn(name='sc1', columns=[Column(_i64(4), 'value4', 0)]), 
-                                  SuperColumn(name='sc2', columns=[Column(_i64(5), 'value5', 6), 
-                                                                   Column(_i64(6), 'value6', 0), 
-                                                                   Column(_i64(7), 'value7', 0)])]
-        assert super_columns == super_columns_expected, super_columns
-
-        # shouldn't be able to specify a column w/o a super column for remove
-        cp = ColumnPath(column_family='Super1', column='sc2')
-        e = _expect_exception(lambda: client.remove('key1', cp, 5, ConsistencyLevel.ONE), InvalidRequestException)
-        assert e.why.find("column cannot be specified without") >= 0
-
-    def test_super_cf_remove_supercolumn(self):
-        _set_keyspace('Keyspace1')
-        
-        _insert_simple()
-        _insert_super()
-
-        # Make sure remove clears out what it's supposed to, and _only_ that:
-        client.remove('key1', ColumnPath('Super1', 'sc2'), 5, ConsistencyLevel.ONE)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Super1', 'sc2', _i64(5)), ConsistencyLevel.ONE))
-        super_columns = _big_slice('key1', ColumnParent('Super1', 'sc2'))
-        assert super_columns == [], super_columns
-        super_columns_expected = [SuperColumn(name='sc1', columns=[Column(_i64(4), 'value4', 0)])]
-        super_columns = [result.super_column
-                         for result in _big_slice('key1', ColumnParent('Super1'))]
-        assert super_columns == super_columns_expected, super_columns
-        _verify_simple()
-
-        # Test resurrection.  First, re-insert the value w/ older timestamp, 
-        # and make sure it stays removed:
-        client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 1), ConsistencyLevel.ONE)
-        super_columns = [result.super_column
-                         for result in _big_slice('key1', ColumnParent('Super1'))]
-        assert super_columns == super_columns_expected, super_columns
-
-        # Next, w/ a newer timestamp; it should come back
-        client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(5), 'value5', 6), ConsistencyLevel.ONE)
-        super_columns = [result.super_column
-                         for result in _big_slice('key1', ColumnParent('Super1'))]
-        super_columns_expected = [SuperColumn(name='sc1', columns=[Column(_i64(4), 'value4', 0)]),
-                                  SuperColumn(name='sc2', columns=[Column(_i64(5), 'value5', 6)])]
-        assert super_columns == super_columns_expected, super_columns
-
-        # check slicing at the subcolumn level too
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 1000))
-        columns = [result.column
-                   for result in client.get_slice('key1', ColumnParent('Super1', 'sc2'), p, ConsistencyLevel.ONE)]
-        assert columns == [Column(_i64(5), 'value5', 6)], columns
-
-
-    def test_super_cf_resurrect_subcolumn(self):
-        _set_keyspace('Keyspace1')
-        key = 'vijay'
-        client.insert(key, ColumnParent('Super1', 'sc1'), Column(_i64(4), 'value4', 0), ConsistencyLevel.ONE)
-
-        client.remove(key, ColumnPath('Super1', 'sc1'), 1, ConsistencyLevel.ONE)
-
-        client.insert(key, ColumnParent('Super1', 'sc1'), Column(_i64(4), 'value4', 2), ConsistencyLevel.ONE)
-
-        result = client.get(key, ColumnPath('Super1', 'sc1'), ConsistencyLevel.ONE)
-        assert result.super_column.columns is not None, result.super_column
-
-
-    def test_empty_range(self):
-        _set_keyspace('Keyspace1')
-        assert get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['c1', 'c1']), '', '', 1000, ConsistencyLevel.ONE) == []
-        _insert_simple()
-        assert get_range_slice(client, ColumnParent('Super1'), SlicePredicate(column_names=['c1', 'c1']), '', '', 1000, ConsistencyLevel.ONE) == []
-
-    def test_range_with_remove(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        assert get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['c1', 'c1']), 'key1', '', 1000, ConsistencyLevel.ONE)[0].key == 'key1'
-
-        client.remove('key1', ColumnPath('Standard1', column='c1'), 1, ConsistencyLevel.ONE)
-        client.remove('key1', ColumnPath('Standard1', column='c2'), 1, ConsistencyLevel.ONE)
-        actual = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['c1', 'c2']), '', '', 1000, ConsistencyLevel.ONE)
-        assert actual == [KeySlice(columns=[], key='key1')], actual
-
-    def test_range_with_remove_cf(self):
-        _set_keyspace('Keyspace1')
-        _insert_simple()
-        assert get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['c1', 'c1']), 'key1', '', 1000, ConsistencyLevel.ONE)[0].key == 'key1'
-
-        client.remove('key1', ColumnPath('Standard1'), 1, ConsistencyLevel.ONE)
-        actual = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['c1', 'c1']), '', '', 1000, ConsistencyLevel.ONE)
-        assert actual == [KeySlice(columns=[], key='key1')], actual
-
-    def test_range_collation(self):
-        _set_keyspace('Keyspace1')
-        for key in ['-a', '-b', 'a', 'b'] + [str(i) for i in xrange(100)]:
-            client.insert(key, ColumnParent('Standard1'), Column(key, 'v', 0), ConsistencyLevel.ONE)
-
-        slices = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['-a', '-a']), '', '', 1000, ConsistencyLevel.ONE)
-        L = ['-a', '-b', '0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27','28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', 'a', 'b']
-        assert len(slices) == len(L)
-        for key, ks in zip(L, slices):
-            assert key == ks.key
-
-    def test_range_partial(self):
-        _set_keyspace('Keyspace1')
-        
-        for key in ['-a', '-b', 'a', 'b'] + [str(i) for i in xrange(100)]:
-            client.insert(key, ColumnParent('Standard1'), Column(key, 'v', 0), ConsistencyLevel.ONE)
-
-        def check_slices_against_keys(keyList, sliceList):
-            assert len(keyList) == len(sliceList), "%d vs %d" % (len(keyList), len(sliceList))
-            for key, ks in zip(keyList, sliceList):
-                assert key == ks.key
-        
-        slices = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['-a', '-a']), 'a', '', 1000, ConsistencyLevel.ONE)
-        check_slices_against_keys(['a', 'b'], slices)
-        
-        slices = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['-a', '-a']), '', '15', 1000, ConsistencyLevel.ONE)
-        check_slices_against_keys(['-a', '-b', '0', '1', '10', '11', '12', '13', '14', '15'], slices)
-
-        slices = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['-a', '-a']), '50', '51', 1000, ConsistencyLevel.ONE)
-        check_slices_against_keys(['50', '51'], slices)
-        
-        slices = get_range_slice(client, ColumnParent('Standard1'), SlicePredicate(column_names=['-a', '-a']), '1', '', 10, ConsistencyLevel.ONE)
-        check_slices_against_keys(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18'], slices)
-
-    def test_get_slice_range(self):
-        _set_keyspace('Keyspace1')
-        _insert_range()
-        _verify_range()
-        
-    def test_get_slice_super_range(self):
-        _set_keyspace('Keyspace1')
-        _insert_super_range()
-        _verify_super_range()
-
-    def test_get_range_slices_tokens(self):
-        _set_keyspace('Keyspace2')
-        for key in ['key1', 'key2', 'key3', 'key4', 'key5']:
-            for cname in ['col1', 'col2', 'col3', 'col4', 'col5']:
-                client.insert(key, ColumnParent('Super3', 'sc1'), Column(cname, 'v-' + cname, 0), ConsistencyLevel.ONE)
-
-        cp = ColumnParent('Super3', 'sc1')
-        predicate = SlicePredicate(column_names=['col1', 'col3'])
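-        # a KeyRange whose start_token equals its end_token wraps around the whole ring, so all five keys are expected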
-        range = KeyRange(start_token='55', end_token='55', count=100)
-        result = client.get_range_slices(cp, predicate, range, ConsistencyLevel.ONE)
-        assert len(result) == 5
-        assert result[0].columns[0].column.name == 'col1'
-        assert result[0].columns[1].column.name == 'col3'
-
-    def test_get_range_slice_super(self):
-        _set_keyspace('Keyspace2')
-        for key in ['key1', 'key2', 'key3', 'key4', 'key5']:
-            for cname in ['col1', 'col2', 'col3', 'col4', 'col5']:
-                client.insert(key, ColumnParent('Super3', 'sc1'), Column(cname, 'v-' + cname, 0), ConsistencyLevel.ONE)
-
-        cp = ColumnParent('Super3', 'sc1')
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['col1', 'col3']), 'key2', 'key4', 5, ConsistencyLevel.ONE)
-        assert len(result) == 3
-        assert result[0].columns[0].column.name == 'col1'
-        assert result[0].columns[1].column.name == 'col3'
-
-        cp = ColumnParent('Super3')
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['sc1']), 'key2', 'key4', 5, ConsistencyLevel.ONE)
-        assert len(result) == 3
-        assert list(set(row.columns[0].super_column.name for row in result))[0] == 'sc1'
-        
-    def test_get_range_slice(self):
-        _set_keyspace('Keyspace1')
-        for key in ['key1', 'key2', 'key3', 'key4', 'key5']:
-            for cname in ['col1', 'col2', 'col3', 'col4', 'col5']:
-                client.insert(key, ColumnParent('Standard1'), Column(cname, 'v-' + cname, 0), ConsistencyLevel.ONE)
-        cp = ColumnParent('Standard1')
-
-        # test empty slice
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['col1', 'col3']), 'key6', '', 1, ConsistencyLevel.ONE)
-        assert len(result) == 0
-
-        # test empty columns
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['a']), 'key2', '', 1, ConsistencyLevel.ONE)
-        assert len(result) == 1
-        assert len(result[0].columns) == 0
-
-        # test column_names predicate
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['col1', 'col3']), 'key2', 'key4', 5, ConsistencyLevel.ONE)
-        assert len(result) == 3, result
-        assert result[0].columns[0].column.name == 'col1'
-        assert result[0].columns[1].column.name == 'col3'
-
-        # row limiting via count.
-        result = get_range_slice(client, cp, SlicePredicate(column_names=['col1', 'col3']), 'key2', 'key4', 1, ConsistencyLevel.ONE)
-        assert len(result) == 1
-
-        # test column slice predicate
-        result = get_range_slice(client, cp, SlicePredicate(slice_range=SliceRange(start='col2', finish='col4', reversed=False, count=5)), 'key1', 'key2', 5, ConsistencyLevel.ONE)
-        assert len(result) == 2
-        assert result[0].key == 'key1'
-        assert result[1].key == 'key2'
-        assert len(result[0].columns) == 3
-        assert result[0].columns[0].column.name == 'col2'
-        assert result[0].columns[2].column.name == 'col4'
-
-        # col limiting via count
-        result = get_range_slice(client, cp, SlicePredicate(slice_range=SliceRange(start='col2', finish='col4', reversed=False, count=2)), 'key1', 'key2', 5, ConsistencyLevel.ONE)
-        assert len(result[0].columns) == 2
-
-        # and reversed 
-        result = get_range_slice(client, cp, SlicePredicate(slice_range=SliceRange(start='col4', finish='col2', reversed=True, count=5)), 'key1', 'key2', 5, ConsistencyLevel.ONE)
-        assert result[0].columns[0].column.name == 'col4'
-        assert result[0].columns[2].column.name == 'col2'
-
-        # row limiting via count
-        result = get_range_slice(client, cp, SlicePredicate(slice_range=SliceRange(start='col2', finish='col4', reversed=False, count=5)), 'key1', 'key2', 1, ConsistencyLevel.ONE)
-        assert len(result) == 1
-
-        # removed data
-        client.remove('key1', ColumnPath('Standard1', column='col1'), 1, ConsistencyLevel.ONE)
-        result = get_range_slice(client, cp, SlicePredicate(slice_range=SliceRange('', '')), 'key1', 'key2', 5, ConsistencyLevel.ONE)
-        assert len(result) == 2, result
-        assert result[0].columns[0].column.name == 'col2', result[0].columns[0].column.name
-        assert result[1].columns[0].column.name == 'col1'
-        
-    
-    def test_wrapped_range_slices(self):
-        _set_keyspace('Keyspace1')
-
-        def copp_token(key):
-            # I cheated and generated this from Java
-            return {'a': '00530000000100000001', 
-                    'b': '00540000000100000001', 
-                    'c': '00550000000100000001',
-                    'd': '00560000000100000001', 
-                    'e': '00580000000100000001'}[key]
-
-        for key in ['a', 'b', 'c', 'd', 'e']:
-            for cname in ['col1', 'col2', 'col3', 'col4', 'col5']:
-                client.insert(key, ColumnParent('Standard1'), Column(cname, 'v-' + cname, 0), ConsistencyLevel.ONE)
-        cp = ColumnParent('Standard1')
-
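-        # both queries use a wrapping range (start_token == end_token), so every key should come back in partitioner order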
-        result = client.get_range_slices(cp, SlicePredicate(column_names=['col1', 'col3']), KeyRange(start_token=copp_token('e'), end_token=copp_token('e')), ConsistencyLevel.ONE)
-        assert [row.key for row in result] == ['a', 'b', 'c', 'd', 'e',], [row.key for row in result]
-
-        result = client.get_range_slices(cp, SlicePredicate(column_names=['col1', 'col3']), KeyRange(start_token=copp_token('c'), end_token=copp_token('c')), ConsistencyLevel.ONE)
-        assert [row.key for row in result] == ['a', 'b', 'c', 'd', 'e',], [row.key for row in result]
-        
-
-    def test_get_slice_by_names(self):
-        _set_keyspace('Keyspace1')
-        _insert_range()
-        p = SlicePredicate(column_names=['c1', 'c2'])
-        result = client.get_slice('key1', ColumnParent('Standard1'), p, ConsistencyLevel.ONE) 
-        assert len(result) == 2
-        assert result[0].column.name == 'c1'
-        assert result[1].column.name == 'c2'
-
-        _insert_super()
-        p = SlicePredicate(column_names=[_i64(4)])
-        result = client.get_slice('key1', ColumnParent('Super1', 'sc1'), p, ConsistencyLevel.ONE) 
-        assert len(result) == 1
-        assert result[0].column.name == _i64(4)
-
-    def test_multiget_slice(self):
-        """Insert multiple keys and retrieve them using the multiget_slice interface"""
-
-        _set_keyspace('Keyspace1')
-        # Generate a list of 10 keys and insert them
-        num_keys = 10
-        keys = ['key'+str(i) for i in range(1, num_keys+1)]
-        _insert_multi(keys)
-
-        # Retrieve all 10 key slices
-        rows = _big_multislice(keys, ColumnParent('Standard1'))
-        # the returned row keys should match the requested keys
-        assert sorted(rows.keys()) == sorted(keys)
-
-        columns = [ColumnOrSuperColumn(c) for c in _SIMPLE_COLUMNS]
-        # Validate that the returned rows contain the requested keys and that each
-        # ColumnOrSuperColumn matches what was inserted
-        for key in keys:
-            assert key in rows
-            assert columns == rows[key]
-
-    def test_multi_count(self):
-        """Insert multiple keys and count them using the multiget interface"""
-        _set_keyspace('Keyspace1')
-
-        # Generate a list of 10 keys containing 1 to 10 columns each and insert them
-        num_keys = 10
-        for i in range(1, num_keys+1):
-          key = 'key'+str(i)
-          for j in range(1, i+1):
-            client.insert(key, ColumnParent('Standard1'), Column('c'+str(j), 'value'+str(j), 0), ConsistencyLevel.ONE)
-
-        # Count columns in all 10 keys
-        keys = ['key'+str(i) for i in range(1, num_keys+1)]
-        p = SlicePredicate(slice_range=SliceRange('', '', False, 1000))
-        counts = client.multiget_count(keys, ColumnParent('Standard1'), p, ConsistencyLevel.ONE)
-
-        # Check the returned counts
-        for i in range(1, num_keys+1):
-          key = 'key'+str(i)
-          assert counts[key] == i
-
-    def test_batch_mutate_super_deletion(self):
-        _set_keyspace('Keyspace1')
-        _insert_super('test')
-        d = Deletion(1, predicate=SlicePredicate(column_names=['sc1']))
-        cfmap = {'Super1': [Mutation(deletion=d)]}
-        client.batch_mutate({'test': cfmap}, ConsistencyLevel.ONE)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Super1', 'sc1'), ConsistencyLevel.ONE))
-
-    def test_super_reinsert(self):
-        _set_keyspace('Keyspace1')
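-        # write three subcolumns at timestamp 1, delete the whole row at 2, then write three more at 3;
-        # only the post-delete subcolumns should be returned by slices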
-        for x in xrange(3):
-            client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(x), 'value', 1), ConsistencyLevel.ONE)
-
-        client.remove('key1', ColumnPath('Super1'), 2, ConsistencyLevel.ONE)
-
-        for x in xrange(3):
-            client.insert('key1', ColumnParent('Super1', 'sc2'), Column(_i64(x + 3), 'value', 3), ConsistencyLevel.ONE)
-
-        for n in xrange(1, 4):
-            p = SlicePredicate(slice_range=SliceRange('', '', False, n))
-            slice = client.get_slice('key1', ColumnParent('Super1', 'sc2'), p, ConsistencyLevel.ONE)
-            assert len(slice) == n, "expected %s results; found %s" % (n, slice)
-
-    def test_describe_keyspace(self):
-        kspaces = client.describe_keyspaces()
-        assert len(kspaces) == 4, kspaces # ['Keyspace2', 'Keyspace1', 'system', 'system_traces']
-
-        sysks = client.describe_keyspace("system")
-        assert sysks in kspaces
-
-        ks1 = client.describe_keyspace("Keyspace1")
-        assert ks1.strategy_options['replication_factor'] == '1', ks1.strategy_options
-        for cf in ks1.cf_defs:
-            if cf.name == "Standard1":
-                cf0 = cf
-                break
-        assert cf0.comparator_type == "org.apache.cassandra.db.marshal.BytesType"
-
-    def test_describe(self):
-        assert client.describe_cluster_name() == 'Test Cluster'
-
-    def test_describe_ring(self):
-        assert list(client.describe_ring('Keyspace1'))[0].endpoints == ['127.0.0.1']
-
-    def test_describe_token_map(self):
-        # test/conf/cassandra.yaml specifies org.apache.cassandra.dht.ByteOrderedPartitioner
-        # which uses BytesToken, so this just tests that the string representation of the token
-        # matches a regex pattern for BytesToken.toString().
-        ring = client.describe_token_map().items()
-        assert len(ring) == 1
-        token, node = ring[0]
-        assert re.match("[0-9A-Fa-f]{32}", token)
-        assert node == '127.0.0.1'
-
-    def test_describe_partitioner(self):
-        # Make sure this just reads back the values from the config.
-        assert client.describe_partitioner() == "org.apache.cassandra.dht.ByteOrderedPartitioner"
-
-    def test_describe_snitch(self):
-        assert client.describe_snitch() == "org.apache.cassandra.locator.SimpleSnitch"
-
-    def test_invalid_ks_names(self):
-        def invalid_keyspace():
-            client.system_add_keyspace(KsDef('in-valid', 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor':'1'}, cf_defs=[]))
-        _expect_exception(invalid_keyspace, InvalidRequestException)
-
-    def test_invalid_strategy_class(self):
-        def add_invalid_keyspace():
-            client.system_add_keyspace(KsDef('ValidKs', 'InvalidStrategyClass', {}, cf_defs=[]))
-        exc = _expect_exception(add_invalid_keyspace, InvalidRequestException)
-        s = str(exc)
-        assert s.find("InvalidStrategyClass") > -1, s
-        assert s.find("Unable to find replication strategy") > -1, s
-
-        def update_invalid_keyspace():
-            client.system_add_keyspace(KsDef('ValidKsForUpdate', 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor':'1'}, cf_defs=[]))
-            client.system_update_keyspace(KsDef('ValidKsForUpdate', 'InvalidStrategyClass', {}, cf_defs=[]))
-
-        exc = _expect_exception(update_invalid_keyspace, InvalidRequestException)
-        s = str(exc)
-        assert s.find("InvalidStrategyClass") > -1, s
-        assert s.find("Unable to find replication strategy") > -1, s
-
-    def test_invalid_cf_names(self):
-        def invalid_cf():
-            _set_keyspace('Keyspace1')
-            newcf = CfDef('Keyspace1', 'in-valid')
-            client.system_add_column_family(newcf)
-        _expect_exception(invalid_cf, InvalidRequestException)
-        
-        def invalid_cf_inside_new_ks():
-            cf = CfDef('ValidKsName_invalid_cf', 'in-valid')
-            _set_keyspace('system')
-            client.system_add_keyspace(KsDef('ValidKsName_invalid_cf', 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor': '1'}, cf_defs=[cf]))
-        _expect_exception(invalid_cf_inside_new_ks, InvalidRequestException)
-    
-    def test_system_cf_recreate(self):
-        "ensures that keyspaces and column familes can be dropped and recreated in short order"
-        for x in range(2):
-            
-            keyspace = 'test_cf_recreate'
-            cf_name = 'recreate_cf'
-            
-            # create
-            newcf = CfDef(keyspace, cf_name)
-            newks = KsDef(keyspace, 'org.apache.cassandra.locator.SimpleStrategy', {'replication_factor':'1'}, cf_defs=[newcf])
-            client.system_add_keyspace(newks)
-            _set_keyspace(keyspace)
-            
-            # insert
-            client.insert('key0', ColumnParent(cf_name), Column('colA', 'colA-value', 0), ConsistencyLevel.ONE)
-            col1 = client.get_slice('key0', ColumnParent(cf_name), SlicePredicate(slice_range=SliceRange('', '', False, 100)), ConsistencyLevel.ONE)[0].column
-            assert col1.name == 'colA' and col1.value == 'colA-value'
-                    
-            # drop
-            client.system_drop_column_family(cf_name) 
-            
-            # recreate
-            client.system_add_column_family(newcf)
-            
-            # query
-            cosc_list = client.get_slice('key0', ColumnParent(cf_name), SlicePredicate(slice_range=SliceRange('', '', False, 100)), ConsistencyLevel.ONE)
-            # this was failing prior to CASSANDRA-1477.
-            assert len(cosc_list) == 0, 'cosc length test failed'
-            
-            client.system_drop_keyspace(keyspace)
-    
-    def test_system_keyspace_operations(self):
-        # create.  note large RF, this is OK
-        keyspace = KsDef('CreateKeyspace', 
-                         'org.apache.cassandra.locator.SimpleStrategy', 
-                         {'replication_factor': '10'},
-                         cf_defs=[CfDef('CreateKeyspace', 'CreateKsCf')])
-        client.system_add_keyspace(keyspace)
-        newks = client.describe_keyspace('CreateKeyspace')
-        assert 'CreateKsCf' in [x.name for x in newks.cf_defs]
-        
-        _set_keyspace('CreateKeyspace')
-        
-        # modify valid
-        modified_keyspace = KsDef('CreateKeyspace', 
-                                  'org.apache.cassandra.locator.OldNetworkTopologyStrategy', 
-                                  {'replication_factor': '1'},
-                                  cf_defs=[])
-        client.system_update_keyspace(modified_keyspace)
-        modks = client.describe_keyspace('CreateKeyspace')
-        assert modks.strategy_class == modified_keyspace.strategy_class
-        assert modks.strategy_options == modified_keyspace.strategy_options
-        
-        # drop
-        client.system_drop_keyspace('CreateKeyspace')
-        def get_second_ks():
-            client.describe_keyspace('CreateKeyspace')
-        _expect_exception(get_second_ks, NotFoundException)
-        
-    def test_create_then_drop_ks(self):
-        keyspace = KsDef('AddThenDrop', 
-                strategy_class='org.apache.cassandra.locator.SimpleStrategy',
-                strategy_options={'replication_factor':'1'},
-                cf_defs=[])
-        def test_existence():
-            client.describe_keyspace(keyspace.name)
-        _expect_exception(test_existence, NotFoundException)
-        client.set_keyspace('system')
-        client.system_add_keyspace(keyspace)
-        test_existence()
-        client.system_drop_keyspace(keyspace.name)
-  
-    def test_column_validators(self):
-        # columndef validation for regular CF
-        ks = 'Keyspace1'
-        _set_keyspace(ks)
-        cd = ColumnDef('col', 'LongType', None, None)
-        cf = CfDef('Keyspace1', 'ValidatorColumnFamily', column_metadata=[cd])
-        client.system_add_column_family(cf)
-        ks_def = client.describe_keyspace(ks)
-        assert 'ValidatorColumnFamily' in [x.name for x in ks_def.cf_defs]
-
-        cp = ColumnParent('ValidatorColumnFamily')
-        col0 = Column('col', _i64(42), 0)
-        col1 = Column('col', "ceci n'est pas 64bit", 0)
-        client.insert('key0', cp, col0, ConsistencyLevel.ONE)
-        e = _expect_exception(lambda: client.insert('key1', cp, col1, ConsistencyLevel.ONE), InvalidRequestException)
-        assert e.why.find("failed validation") >= 0
-
-        # columndef validation for super CF
-        scf = CfDef('Keyspace1', 'ValidatorSuperColumnFamily', column_type='Super', column_metadata=[cd])
-        client.system_add_column_family(scf)
-        ks_def = client.describe_keyspace(ks)
-        assert 'ValidatorSuperColumnFamily' in [x.name for x in ks_def.cf_defs]
-
-        scp = ColumnParent('ValidatorSuperColumnFamily','sc1')
-        client.insert('key0', scp, col0, ConsistencyLevel.ONE)
-        e = _expect_exception(lambda: client.insert('key1', scp, col1, ConsistencyLevel.ONE), InvalidRequestException)
-        assert e.why.find("failed validation") >= 0
-
-        # columndef and cfdef default validation
-        cf = CfDef('Keyspace1', 'DefaultValidatorColumnFamily', column_metadata=[cd], default_validation_class='UTF8Type')
-        client.system_add_column_family(cf)
-        ks_def = client.describe_keyspace(ks)
-        assert 'DefaultValidatorColumnFamily' in [x.name for x in ks_def.cf_defs]
-
-        dcp = ColumnParent('DefaultValidatorColumnFamily')
-        # inserting a longtype into column 'col' is valid at the columndef level
-        client.insert('key0', dcp, col0, ConsistencyLevel.ONE)
-        # inserting a UTF8type into column 'col' fails at the columndef level
-        e = _expect_exception(lambda: client.insert('key1', dcp, col1, ConsistencyLevel.ONE), InvalidRequestException)
-        assert e.why.find("failed validation") >= 0
-        
-        # inserting a longtype into column 'fcol' should fail at the cfdef level
-        col2 = Column('fcol', _i64(4224), 0)
-        e = _expect_exception(lambda: client.insert('key1', dcp, col2, ConsistencyLevel.ONE), InvalidRequestException)
-        assert e.why.find("failed validation") >= 0
-        # inserting a UTF8type into column 'fcol' is valid at the cfdef level
-        col3 = Column('fcol', "Stringin' it up in the Stringtel Stringifornia", 0)
-        client.insert('key0', dcp, col3, ConsistencyLevel.ONE)
-
-    def test_system_column_family_operations(self):
-        _set_keyspace('Keyspace1')
-        # create
-        cd = ColumnDef('ValidationColumn', 'BytesType', None, None)
-        newcf = CfDef('Keyspace1', 'NewColumnFamily', column_metadata=[cd])
-        client.system_add_column_family(newcf)
-        ks1 = client.describe_keyspace('Keyspace1')
-        assert 'NewColumnFamily' in [x.name for x in ks1.cf_defs]
-        cfid = [x.id for x in ks1.cf_defs if x.name=='NewColumnFamily'][0]
-        
-        # modify invalid
-        modified_cf = CfDef('Keyspace1', 'NewColumnFamily', column_metadata=[cd])
-        modified_cf.id = cfid
-        def fail_invalid_field():
-            modified_cf.comparator_type = 'LongType'
-            client.system_update_column_family(modified_cf)
-        _expect_exception(fail_invalid_field, InvalidRequestException)
-        
-        # modify valid
-        modified_cf.comparator_type = 'BytesType' # revert back to old value.
-        modified_cf.gc_grace_seconds = 1
-        client.system_update_column_family(modified_cf)
-        ks1 = client.describe_keyspace('Keyspace1')
-        server_cf = [x for x in ks1.cf_defs if x.name=='NewColumnFamily'][0]
-        assert server_cf
-        assert server_cf.gc_grace_seconds == 1
-        
-        # drop
-        client.system_drop_column_family('NewColumnFamily')
-        ks1 = client.describe_keyspace('Keyspace1')
-        assert 'NewColumnFamily' not in [x.name for x in ks1.cf_defs]
-        assert 'Standard1' in [x.name for x in ks1.cf_defs]
-
-        # Make a LongType CF and add a validator
-        newcf = CfDef('Keyspace1', 'NewLongColumnFamily', comparator_type='LongType')
-        client.system_add_column_family(newcf)
-
-        three = _i64(3)
-        cd = ColumnDef(three, 'LongType', None, None)
-        ks1 = client.describe_keyspace('Keyspace1')
-        modified_cf = [x for x in ks1.cf_defs if x.name=='NewLongColumnFamily'][0]
-        modified_cf.column_metadata = [cd]
-        client.system_update_column_family(modified_cf)
-
-        ks1 = client.describe_keyspace('Keyspace1')
-        server_cf = [x for x in ks1.cf_defs if x.name=='NewLongColumnFamily'][0]
-        assert server_cf.column_metadata[0].name == _i64(3), server_cf.column_metadata
-
-    def test_dynamic_indexes_creation_deletion(self):
-        _set_keyspace('Keyspace1')
-        cfdef = CfDef('Keyspace1', 'BlankCF')
-        client.system_add_column_family(cfdef)
-
-        ks1 = client.describe_keyspace('Keyspace1')
-        cfid = [x.id for x in ks1.cf_defs if x.name=='BlankCF'][0]
-        modified_cd = ColumnDef('birthdate', 'BytesType', IndexType.KEYS, None)
-        modified_cf = CfDef('Keyspace1', 'BlankCF', column_metadata=[modified_cd])
-        modified_cf.id = cfid
-        client.system_update_column_family(modified_cf)
-
-        # Add a second indexed CF ...
-        birthdate_coldef = ColumnDef('birthdate', 'BytesType', IndexType.KEYS, None)
-        age_coldef = ColumnDef('age', 'BytesType', IndexType.KEYS, 'age_index')
-        cfdef = CfDef('Keyspace1', 'BlankCF2', column_metadata=[birthdate_coldef, age_coldef])
-        client.system_add_column_family(cfdef)
- 
-        # ... and update it to have a third index
-        ks1 = client.describe_keyspace('Keyspace1')
-        cfdef = [x for x in ks1.cf_defs if x.name=='BlankCF2'][0]
-        name_coldef = ColumnDef('name', 'BytesType', IndexType.KEYS, 'name_index')
-        cfdef.column_metadata.append(name_coldef)
-        client.system_update_column_family(cfdef)
-       
-        # Now drop the indexes
-        ks1 = client.describe_keyspace('Keyspace1')
-        cfdef = [x for x in ks1.cf_defs if x.name=='BlankCF2'][0]
-        birthdate_coldef = ColumnDef('birthdate', 'BytesType', None, None)
-        age_coldef = ColumnDef('age', 'BytesType', None, None)
-        name_coldef = ColumnDef('name', 'BytesType', None, None)
-        cfdef.column_metadata = [birthdate_coldef, age_coldef, name_coldef]
-        client.system_update_column_family(cfdef)
-
-        ks1 = client.describe_keyspace('Keyspace1')
-        cfdef = [x for x in ks1.cf_defs if x.name=='BlankCF'][0]
-        birthdate_coldef = ColumnDef('birthdate', 'BytesType', None, None)
-        cfdef.column_metadata = [birthdate_coldef]
-        client.system_update_column_family(cfdef)
-        
-        client.system_drop_column_family('BlankCF')
-        client.system_drop_column_family('BlankCF2')
-
-    def test_dynamic_indexes_with_system_update_cf(self):
-        _set_keyspace('Keyspace1')
-        cd = ColumnDef('birthdate', 'BytesType', None, None)
-        newcf = CfDef('Keyspace1', 'ToBeIndexed', default_validation_class='LongType', column_metadata=[cd])
-        client.system_add_column_family(newcf)
-
-        client.insert('key1', ColumnParent('ToBeIndexed'), Column('birthdate', _i64(1), 0), ConsistencyLevel.ONE)
-        client.insert('key2', ColumnParent('ToBeIndexed'), Column('birthdate', _i64(2), 0), ConsistencyLevel.ONE)
-        client.insert('key2', ColumnParent('ToBeIndexed'), Column('b', _i64(2), 0), ConsistencyLevel.ONE)
-        client.insert('key3', ColumnParent('ToBeIndexed'), Column('birthdate', _i64(3), 0), ConsistencyLevel.ONE)
-        client.insert('key3', ColumnParent('ToBeIndexed'), Column('b', _i64(3), 0), ConsistencyLevel.ONE)
-
-        # First without index
-        cp = ColumnParent('ToBeIndexed')
-        sp = SlicePredicate(slice_range=SliceRange('', ''))
-        key_range = KeyRange('', '', None, None, [IndexExpression('birthdate', IndexOperator.EQ, _i64(1))], 100)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-        assert result[0].key == 'key1'
-        assert len(result[0].columns) == 1, result[0].columns
-
-        # add an index on 'birthdate'
-        ks1 = client.describe_keyspace('Keyspace1')
-        cfid = [x.id for x in ks1.cf_defs if x.name=='ToBeIndexed'][0]
-        modified_cd = ColumnDef('birthdate', 'BytesType', IndexType.KEYS, 'bd_index')
-        modified_cf = CfDef('Keyspace1', 'ToBeIndexed', column_metadata=[modified_cd])
-        modified_cf.id = cfid
-        client.system_update_column_family(modified_cf)
-        
-        ks1 = client.describe_keyspace('Keyspace1')
-        server_cf = [x for x in ks1.cf_defs if x.name=='ToBeIndexed'][0]
-        assert server_cf
-        assert server_cf.column_metadata[0].index_type == modified_cd.index_type
-        assert server_cf.column_metadata[0].index_name == modified_cd.index_name
-        
-        # sleep a bit to give time for the index to build.
-        time.sleep(0.5)
-        
-        # repeat query on one index expression
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-        assert result[0].key == 'key1'
-        assert len(result[0].columns) == 1, result[0].columns
-
-    def test_system_super_column_family_operations(self):
-        _set_keyspace('Keyspace1')
-        
-        # create
-        cd = ColumnDef('ValidationColumn', 'BytesType', None, None)
-        newcf = CfDef('Keyspace1', 'NewSuperColumnFamily', 'Super', column_metadata=[cd])
-        client.system_add_column_family(newcf)
-        ks1 = client.describe_keyspace('Keyspace1')
-        assert 'NewSuperColumnFamily' in [x.name for x in ks1.cf_defs]
-        
-        # drop
-        client.system_drop_column_family('NewSuperColumnFamily')
-        ks1 = client.describe_keyspace('Keyspace1')
-        assert 'NewSuperColumnFamily' not in [x.name for x in ks1.cf_defs]
-        assert 'Standard1' in [x.name for x in ks1.cf_defs]
-
-    def test_insert_ttl(self):
-        """ Test simple insertion of a column with ttl """
-        _set_keyspace('Keyspace1')
-        column = Column('cttl1', 'value1', 0, 5)
-        client.insert('key1', ColumnParent('Standard1'), column, ConsistencyLevel.ONE)
-        assert client.get('key1', ColumnPath('Standard1', column='cttl1'), ConsistencyLevel.ONE).column == column
-
-    def test_simple_expiration(self):
-        """ Test that column ttled do expires """
-        _set_keyspace('Keyspace1')
-        column = Column('cttl3', 'value1', 0, 2)
-        client.insert('key1', ColumnParent('Standard1'), column, ConsistencyLevel.ONE)
-        time.sleep(1)
-        c = client.get('key1', ColumnPath('Standard1', column='cttl3'), ConsistencyLevel.ONE).column
-        assert c == column
-        assert client.get('key1', ColumnPath('Standard1', column='cttl3'), ConsistencyLevel.ONE).column == column
-        time.sleep(2)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Standard1', column='cttl3'), ConsistencyLevel.ONE))
-    
-    def test_simple_expiration_batch_mutate(self):
-        """ Test that column ttled do expires using batch_mutate """
-        _set_keyspace('Keyspace1')
-        column = Column('cttl4', 'value1', 0, 2)
-        cfmap = {'Standard1': [Mutation(ColumnOrSuperColumn(column))]}
-        client.batch_mutate({'key1': cfmap}, ConsistencyLevel.ONE)
-        time.sleep(1)
-        c = client.get('key1', ColumnPath('Standard1', column='cttl4'), ConsistencyLevel.ONE).column
-        assert c == column
-        assert client.get('key1', ColumnPath('Standard1', column='cttl4'), ConsistencyLevel.ONE).column == column
-        time.sleep(2)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Standard1', column='cttl4'), ConsistencyLevel.ONE))
-
-    def test_update_expiring(self):
-        """ Test that updating a column with ttl override the ttl """
-        _set_keyspace('Keyspace1')
-        column1 = Column('cttl4', 'value1', 0, 1)
-        client.insert('key1', ColumnParent('Standard1'), column1, ConsistencyLevel.ONE)
-        column2 = Column('cttl4', 'value1', 1)
-        client.insert('key1', ColumnParent('Standard1'), column2, ConsistencyLevel.ONE)
-        time.sleep(1.5)
-        assert client.get('key1', ColumnPath('Standard1', column='cttl4'), ConsistencyLevel.ONE).column == column2
-
-    def test_remove_expiring(self):
-        """ Test removing a column with ttl """
-        _set_keyspace('Keyspace1')
-        column = Column('cttl5', 'value1', 0, 10)
-        client.insert('key1', ColumnParent('Standard1'), column, ConsistencyLevel.ONE)
-        client.remove('key1', ColumnPath('Standard1', column='cttl5'), 1, ConsistencyLevel.ONE)
-        _expect_missing(lambda: client.get('key1', ColumnPath('Standard1', column='cttl5'), ConsistencyLevel.ONE))
-    
-    def test_describe_ring_on_invalid_keyspace(self):
-        def req():
-            client.describe_ring('system')
-        _expect_exception(req, InvalidRequestException)
-
-    def test_incr_decr_standard_add(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 12
-        d2 = -21
-        d3 = 35
-        # insert positive and negative values and check the counts
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv1 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1
-
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d2), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv2 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == (d1+d2)
-
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d3), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv3 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv3.counter_column.value == (d1+d2+d3)
-
-    def test_incr_decr_super_add(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = -234
-        d2 = 52345
-        d3 = 3123
-
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'),  CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'),  CounterColumn('c2', d2), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv1 = client.get('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1'), ConsistencyLevel.ONE)
-        assert rv1.counter_super_column.columns[0].value == d1
-        assert rv1.counter_super_column.columns[1].value == d2
-
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'),  CounterColumn('c1', d2), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv2 = client.get('key1', ColumnPath('SuperCounter1', 'sc1', 'c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == (d1+d2)
-
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'),  CounterColumn('c1', d3), ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv3 = client.get('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        assert rv3.counter_column.value == (d1+d2+d3)
-
-    def test_incr_standard_remove(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 124
-
-        # insert value and check it exists
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv1 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1
-
-        # remove the previous column and check that it is gone
-        client.remove_counter('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key1', ColumnPath(column_family='Counter1', column='c1'))
-
-        # insert again and this time delete the whole row, check that it is gone
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv2 = client.get('key2', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == d1
-        client.remove_counter('key2', ColumnPath(column_family='Counter1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key2', ColumnPath(column_family='Counter1', column='c1'))
-
-    def test_incr_super_remove(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 52345
-
-        # insert value and check it exists
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv1 = client.get('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1
-
-        # remove the previous column and check that it is gone
-        client.remove_counter('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'))
-
-        # insert again and this time delete the whole row, check that it is gone
-        client.add('key2', ColumnParent(column_family='SuperCounter1', super_column='sc1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv2 = client.get('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == d1
-        client.remove_counter('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'))
-
-    def test_incr_decr_standard_remove(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 124
-
-        # insert value and check it exists
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv1 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1
-
-        # remove the previous column and check that it is gone
-        client.remove_counter('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key1', ColumnPath(column_family='Counter1', column='c1'))
-
-        # insert again and this time delete the whole row, check that it is gone
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv2 = client.get('key2', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == d1
-        client.remove_counter('key2', ColumnPath(column_family='Counter1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key2', ColumnPath(column_family='Counter1', column='c1'))
-
-    def test_incr_decr_super_remove(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 52345
-
-        # insert value and check it exists
-        client.add('key1', ColumnParent(column_family='SuperCounter1', super_column='sc1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv1 = client.get('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1
-
-        # remove the previous column and check that it is gone
-        client.remove_counter('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key1', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'))
-
-        # insert again and this time delete the whole row, check that it is gone
-        client.add('key2', ColumnParent(column_family='SuperCounter1', super_column='sc1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv2 = client.get('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == d1
-        client.remove_counter('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1'), ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key2', ColumnPath(column_family='SuperCounter1', super_column='sc1', column='c1'))
-        
-    def test_incr_decr_standard_batch_add(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 12
-        d2 = -21
-        update_map = {'key1': {'Counter1': [
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d1))),
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d2))),
-            ]}}
-        
-        # insert positive and negative values and check the counts
-        client.batch_mutate(update_map, ConsistencyLevel.ONE)
-        time.sleep(0.1)
-        rv1 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1+d2
-
-    def test_incr_decr_standard_batch_remove(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 12
-        d2 = -21
-
-        # insert positive and negative values and check the counts
-        update_map = {'key1': {'Counter1': [
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d1))),
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d2))),
-            ]}}
-        client.batch_mutate(update_map, ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv1 = client.get('key1', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv1.counter_column.value == d1+d2
-
-        # remove the previous column and check that it is gone
-        update_map = {'key1': {'Counter1': [
-            Mutation(deletion=Deletion(predicate=SlicePredicate(column_names=['c1']))),
-            ]}}
-        client.batch_mutate(update_map, ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key1', ColumnPath(column_family='Counter1', column='c1'))
-
-        # insert again and this time delete the whole row, check that it is gone
-        update_map = {'key2': {'Counter1': [
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d1))),
-            Mutation(column_or_supercolumn=ColumnOrSuperColumn(counter_column=CounterColumn('c1', d2))),
-            ]}}
-        client.batch_mutate(update_map, ConsistencyLevel.ONE)
-        time.sleep(5)
-        rv2 = client.get('key2', ColumnPath(column_family='Counter1', column='c1'), ConsistencyLevel.ONE)
-        assert rv2.counter_column.value == d1+d2
-
-        update_map = {'key2': {'Counter1': [
-            Mutation(deletion=Deletion()),
-            ]}}
-        client.batch_mutate(update_map, ConsistencyLevel.ONE)
-        time.sleep(5)
-        _assert_no_columnpath('key2', ColumnPath(column_family='Counter1', column='c1'))
-        
-    def test_incr_decr_standard_slice(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 12
-        d2 = -21
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c1', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c2', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c3', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c3', d2), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c4', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c5', d1), ConsistencyLevel.ONE)
-
-        time.sleep(0.1)
-        # read back via get_slice and check the accumulated counts
-        counters = client.get_slice('key1', ColumnParent('Counter1'), SlicePredicate(['c3', 'c4']), ConsistencyLevel.ONE)
-        
-        assert counters[0].counter_column.value == d1+d2
-        assert counters[1].counter_column.value == d1
-         
-    def test_incr_decr_standard_multiget_slice(self):
-        _set_keyspace('Keyspace1')
-
-        d1 = 12
-        d2 = -21
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c2', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c3', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c3', d2), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c4', d1), ConsistencyLevel.ONE)
-        client.add('key1', ColumnParent(column_family='Counter1'), CounterColumn('c5', d1), ConsistencyLevel.ONE)
-
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c2', d1), ConsistencyLevel.ONE)
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c3', d1), ConsistencyLevel.ONE)
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c3', d2), ConsistencyLevel.ONE)
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c4', d1), ConsistencyLevel.ONE)
-        client.add('key2', ColumnParent(column_family='Counter1'), CounterColumn('c5', d1), ConsistencyLevel.ONE)
-
-
-        time.sleep(0.1)
-        # read back via multiget_slice and check the accumulated counts for both keys
-        counters = client.multiget_slice(['key1', 'key2'], ColumnParent('Counter1'), SlicePredicate(['c3', 'c4']), ConsistencyLevel.ONE)
-        
-        assert counters['key1'][0].counter_column.value == d1+d2
-        assert counters['key1'][1].counter_column.value == d1   
-        assert counters['key2'][0].counter_column.value == d1+d2
-        assert counters['key2'][1].counter_column.value == d1
-
-    def test_counter_get_slice_range(self):
-        _set_keyspace('Keyspace1')
-        _insert_counter_range()
-        _verify_counter_range()
-
-    def test_counter_get_slice_super_range(self):
-        _set_keyspace('Keyspace1')
-        _insert_counter_super_range()
-        _verify_counter_super_range()
-
-    def test_index_scan(self):
-        _set_keyspace('Keyspace1')
-        client.insert('key1', ColumnParent('Indexed1'), Column('birthdate', _i64(1), 0), ConsistencyLevel.ONE)
-        client.insert('key2', ColumnParent('Indexed1'), Column('birthdate', _i64(2), 0), ConsistencyLevel.ONE)
-        client.insert('key2', ColumnParent('Indexed1'), Column('b', _i64(2), 0), ConsistencyLevel.ONE)
-        client.insert('key3', ColumnParent('Indexed1'), Column('birthdate', _i64(3), 0), ConsistencyLevel.ONE)
-        client.insert('key3', ColumnParent('Indexed1'), Column('b', _i64(3), 0), ConsistencyLevel.ONE)
-
-        # simple query on one index expression
-        cp = ColumnParent('Indexed1')
-        sp = SlicePredicate(slice_range=SliceRange('', ''))
-        key_range = KeyRange('', '', None, None, [IndexExpression('birthdate', IndexOperator.EQ, _i64(1))], 100)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-        assert result[0].key == 'key1'
-        assert len(result[0].columns) == 1, result[0].columns
-
-        # without index
-        key_range = KeyRange('', '', None, None, [IndexExpression('b', IndexOperator.EQ, _i64(1))], 100)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 0, result
-
-        # but unindexed expression added to indexed one is ok
-        key_range = KeyRange('', '', None, None, [IndexExpression('b', IndexOperator.EQ, _i64(3)), IndexExpression('birthdate', IndexOperator.EQ, _i64(3))], 100)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-        assert result[0].key == 'key3'
-        assert len(result[0].columns) == 2, result[0].columns
-        
-    def test_index_scan_uuid_names(self):
-        _set_keyspace('Keyspace1')
-        sp = SlicePredicate(slice_range=SliceRange('', ''))
-        cp = ColumnParent('Indexed3') # timeuuid name, utf8 values
-        u = uuid.UUID('00000000-0000-1000-0000-000000000000').bytes
-        u2 = uuid.UUID('00000000-0000-1000-0000-000000000001').bytes
-        client.insert('key1', ColumnParent('Indexed3'), Column(u, 'a', 0), ConsistencyLevel.ONE)
-        client.insert('key1', ColumnParent('Indexed3'), Column(u2, 'b', 0), ConsistencyLevel.ONE)
-        # name comparator + data validator of incompatible types -- see CASSANDRA-2347
-        key_range = KeyRange('', '', None, None, [IndexExpression(u, IndexOperator.EQ, 'a'), IndexExpression(u2, IndexOperator.EQ, 'b')], 100)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-
-        cp = ColumnParent('Indexed2') # timeuuid name, long values
-
-        # name must be valid (TimeUUID)
-        key_range = KeyRange('', '', None, None, [IndexExpression('foo', IndexOperator.EQ, uuid.UUID('00000000-0000-1000-0000-000000000000').bytes)], 100)
-        _expect_exception(lambda: client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE), InvalidRequestException)
-        
-        # value must be valid (TimeUUID)
-        key_range = KeyRange('', '', None, None, [IndexExpression(uuid.UUID('00000000-0000-1000-0000-000000000000').bytes, IndexOperator.EQ, "foo")], 100)
-        _expect_exception(lambda: client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE), InvalidRequestException)
-        
-    def test_index_scan_expiring(self):
-        """ Test that column ttled expires from KEYS index"""
-        _set_keyspace('Keyspace1')
-        client.insert('key1', ColumnParent('Indexed1'), Column('birthdate', _i64(1), 0, 1), ConsistencyLevel.ONE)
-        cp = ColumnParent('Indexed1')
-        sp = SlicePredicate(slice_range=SliceRange('', ''))
-        key_range = KeyRange('', '', None, None, [IndexExpression('birthdate', IndexOperator.EQ, _i64(1))], 100)
-        # query before expiration
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 1, result
-        # wait for expiration and requery
-        time.sleep(2)
-        result = client.get_range_slices(cp, sp, key_range, ConsistencyLevel.ONE)
-        assert len(result) == 0, result
-     
-    def test_column_not_found_quorum(self): 
-        _set_keyspace('Keyspace1')
-        key = 'doesntexist'
-        column_path = ColumnPath(column_family="Standard1", column="idontexist")
-        try:
-            client.get(key, column_path, ConsistencyLevel.QUORUM)
-            assert False, ('columnpath %s existed in %s when it should not' % (column_path, key))
-        except NotFoundException:
-            assert True, 'column did not exist'
-
-    def test_get_range_slice_after_deletion(self):
-        _set_keyspace('Keyspace2')
-        key = 'key1'
-        # three supercolumns, each with a "col1" subcolumn
-        for i in range(1,4):
-            client.insert(key, ColumnParent('Super3', 'sc%d' % i), Column('col1', 'val1', 0), ConsistencyLevel.ONE)
-
-        cp = ColumnParent('Super3')
-        predicate = SlicePredicate(slice_range=SliceRange('sc1', 'sc3', False, count=1))
-        k_range = KeyRange(start_key=key, end_key=key, count=1)
-
-        # validate count=1 restricts to 1 supercolumn
-        result = client.get_range_slices(cp, predicate, k_range, ConsistencyLevel.ONE)
-        assert len(result[0].columns) == 1
-
-        # remove sc1; add back subcolumn to override tombstone
-        client.remove(key, ColumnPath('Super3', 'sc1'), 1, ConsistencyLevel.ONE)
-        result = client.get_range_slices(cp, predicate, k_range, ConsistencyLevel.ONE)
-        assert len(result[0].columns) == 1
-        client.insert(key, ColumnParent('Super3', 'sc1'), Column('col1', 'val1', 2), ConsistencyLevel.ONE)
-        result = client.get_range_slices(cp, predicate, k_range, ConsistencyLevel.ONE)
-        assert len(result[0].columns) == 1, result[0].columns
-        assert result[0].columns[0].super_column.name == 'sc1'
-
-
-class TestTruncate(ThriftTester):
-    def test_truncate(self):
-        _set_keyspace('Keyspace1')
-        
-        _insert_simple()
-        _insert_super()
-
-        # truncate Standard1
-        client.truncate('Standard1')
-        assert _big_slice('key1', ColumnParent('Standard1')) == []
-
-        # truncate Super1
-        client.truncate('Super1')
-        assert _big_slice('key1', ColumnParent('Super1')) == []
-        assert _big_slice('key1', ColumnParent('Super1', 'sc1')) == []
diff --git a/test/unit/org/apache/cassandra/AbstractSerializationsTester.java b/test/unit/org/apache/cassandra/AbstractSerializationsTester.java
index 22b2424..1c97eae 100644
--- a/test/unit/org/apache/cassandra/AbstractSerializationsTester.java
+++ b/test/unit/org/apache/cassandra/AbstractSerializationsTester.java
@@ -19,13 +19,12 @@
  */
 package org.apache.cassandra;
 
-import com.google.common.io.ByteArrayDataOutput;
-import com.google.common.io.ByteStreams;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.net.MessagingService;
 
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
@@ -35,27 +34,28 @@
 
 public class AbstractSerializationsTester extends SchemaLoader
 {
-    protected static final String CUR_VER = System.getProperty("cassandra.version", "2.0");
+    protected static final String CUR_VER = System.getProperty("cassandra.version", "2.1");
     protected static final Map<String, Integer> VERSION_MAP = new HashMap<String, Integer> ()
     {{
         put("0.7", 1);
         put("1.0", 3);
         put("1.2", MessagingService.VERSION_12);
         put("2.0", MessagingService.VERSION_20);
+        put("2.1", MessagingService.VERSION_21);
     }};
 
     protected static final boolean EXECUTE_WRITES = Boolean.getBoolean("cassandra.test-serialization-writes");
 
-    protected final int getVersion()
+    protected static int getVersion()
     {
         return VERSION_MAP.get(CUR_VER);
     }
 
     protected <T> void testSerializedSize(T obj, IVersionedSerializer<T> serializer) throws IOException
     {
-        ByteArrayDataOutput out = ByteStreams.newDataOutput();
+        DataOutputBuffer out = new DataOutputBuffer();
         serializer.serialize(obj, out, getVersion());
-        assert out.toByteArray().length == serializer.serializedSize(obj, getVersion());
+        assert out.getLength() == serializer.serializedSize(obj, getVersion());
     }
 
     protected static DataInputStream getInput(String name) throws IOException
@@ -65,10 +65,10 @@
         return new DataInputStream(new FileInputStream(f));
     }
 
-    protected static DataOutputStream getOutput(String name) throws IOException
+    protected static DataOutputStreamAndChannel getOutput(String name) throws IOException
     {
         File f = new File("test/data/serialization/" + CUR_VER + "/" + name);
         f.getParentFile().mkdirs();
-        return new DataOutputStream(new FileOutputStream(f));
+        return new DataOutputStreamAndChannel(new FileOutputStream(f));
     }
 }
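A note on the testSerializedSize change above: the assertion pattern is unchanged (serialize into an in-memory buffer, then compare the bytes actually written against serializedSize()); only the buffer type moves from Guava's ByteArrayDataOutput to Cassandra's DataOutputBuffer. A minimal, self-contained sketch of that same invariant using only JDK streams follows; the Payload class and its serialize/serializedSize helpers are hypothetical stand-ins, not Cassandra APIs.

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    final class SerializedSizeCheck
    {
        // Hypothetical fixed-size payload: an int plus a long (4 + 8 bytes).
        static final class Payload
        {
            final int id;
            final long value;
            Payload(int id, long value) { this.id = id; this.value = value; }
        }

        static void serialize(Payload p, DataOutputStream out) throws IOException
        {
            out.writeInt(p.id);
            out.writeLong(p.value);
        }

        static int serializedSize(Payload p)
        {
            return Integer.BYTES + Long.BYTES; // 12 bytes
        }

        public static void main(String[] args) throws IOException
        {
            Payload p = new Payload(42, 123L);
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            serialize(p, new DataOutputStream(bytes));
            // The invariant the test asserts: declared size == bytes actually written.
            assert bytes.size() == serializedSize(p) : "size mismatch";
        }
    }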
diff --git a/test/unit/org/apache/cassandra/EmbeddedServer.java b/test/unit/org/apache/cassandra/EmbeddedServer.java
index c948cfa..25754ea 100644
--- a/test/unit/org/apache/cassandra/EmbeddedServer.java
+++ b/test/unit/org/apache/cassandra/EmbeddedServer.java
@@ -18,7 +18,6 @@
  */
 package org.apache.cassandra;
 
-import java.io.IOException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
@@ -44,7 +43,7 @@
     static ExecutorService executor = Executors.newSingleThreadExecutor();
 
     @BeforeClass
-    public static void startCassandra() throws IOException
+    public static void startCassandra()
 
     {
         executor.execute(new Runnable()
diff --git a/test/unit/org/apache/cassandra/MethodComparator.java b/test/unit/org/apache/cassandra/MethodComparator.java
index 8cc163a..6422d5a 100644
--- a/test/unit/org/apache/cassandra/MethodComparator.java
+++ b/test/unit/org/apache/cassandra/MethodComparator.java
@@ -75,9 +75,9 @@
 
     private MethodPosition getIndexOfMethodPosition(final Method method)
     {
-        final Class aClass = method.getDeclaringClass();
         if (method.getAnnotation(Ignore.class) == null)
         {
+            final Class<?> aClass = method.getDeclaringClass();
             return getIndexOfMethodPosition(aClass, method.getName());
         }
         else
@@ -86,7 +86,7 @@
         }
     }
 
-    private MethodPosition getIndexOfMethodPosition(final Class aClass, final String methodName)
+    private MethodPosition getIndexOfMethodPosition(final Class<?> aClass, final String methodName)
     {
         MethodPosition methodPosition;
         for (final char methodSeparator : METHOD_SEPARATORS)
@@ -100,7 +100,7 @@
         return new NullMethodPosition();
     }
 
-    private MethodPosition getIndexOfMethodPosition(final Class aClass, final String methodName, final char methodSeparator)
+    private MethodPosition getIndexOfMethodPosition(final Class<?> aClass, final String methodName, final char methodSeparator)
     {
         final InputStream inputStream = aClass.getResourceAsStream(aClass.getSimpleName() + ".class");
         final LineNumberReader lineNumberReader = new LineNumberReader(new InputStreamReader(inputStream));
diff --git a/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java b/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java
new file mode 100644
index 0000000..9023b11
--- /dev/null
+++ b/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.YamlConfigurationLoader;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+import java.io.File;
+
+
+public class OffsetAwareConfigurationLoader extends YamlConfigurationLoader
+{
+
+    static final String OFFSET_PROPERTY = "cassandra.test.offsetseed";
+    int offset = 0;
+
+    public OffsetAwareConfigurationLoader()
+    {
+        String offsetStr = System.getProperty(OFFSET_PROPERTY);
+
+        if (offsetStr == null)
+            throw new RuntimeException("offset property is not set: "+OFFSET_PROPERTY);
+
+        offset = Integer.valueOf(offsetStr);
+
+        assert offset >= 0;
+    }
+
+    @Override
+    public Config loadConfig() throws ConfigurationException
+    {
+        Config config = super.loadConfig();
+
+
+        config.rpc_port += offset;
+        config.native_transport_port += offset;
+        config.storage_port += offset;
+
+        config.commitlog_directory += File.pathSeparator + offset;
+        config.saved_caches_directory += File.pathSeparator + offset;
+        for (int i = 0; i < config.data_file_directories.length; i++)
+            config.data_file_directories[i] += File.pathSeparator + offset;
+
+
+        return config;
+    }
+}
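The OffsetAwareConfigurationLoader added above shifts ports and data directories by a per-JVM offset so that several test Cassandra instances can run side by side without colliding. It requires the cassandra.test.offsetseed system property, which would typically be passed on the JVM command line, e.g. -Dcassandra.test.offsetseed=2 (the value 2 is only an example). Below is a minimal sketch of the property-reading and port-shifting pattern it relies on, using only JDK calls; the base port is illustrative and not read from cassandra.yaml.

    final class OffsetSeedExample
    {
        static final String OFFSET_PROPERTY = "cassandra.test.offsetseed";

        public static void main(String[] args)
        {
            String raw = System.getProperty(OFFSET_PROPERTY);
            if (raw == null)
                throw new RuntimeException("offset property is not set: " + OFFSET_PROPERTY);

            int offset = Integer.parseInt(raw); // fails loudly on a non-numeric value
            if (offset < 0)
                throw new IllegalArgumentException("offset must be non-negative: " + offset);

            int baseRpcPort = 9160; // illustrative base value only
            System.out.println("rpc_port would become " + (baseRpcPort + offset));
        }
    }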
diff --git a/test/unit/org/apache/cassandra/SchemaLoader.java b/test/unit/org/apache/cassandra/SchemaLoader.java
index 7dea52c..1f6ffa9 100644
--- a/test/unit/org/apache/cassandra/SchemaLoader.java
+++ b/test/unit/org/apache/cassandra/SchemaLoader.java
@@ -18,23 +18,22 @@
 package org.apache.cassandra;
 
 import java.io.File;
-import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.charset.StandardCharsets;
 import java.util.*;
 
-import org.apache.cassandra.db.index.PerRowSecondaryIndexTest;
-import org.apache.cassandra.db.index.SecondaryIndex;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.index.PerRowSecondaryIndexTest;
+import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.Gossiper;
@@ -44,21 +43,28 @@
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 public class SchemaLoader
 {
     private static Logger logger = LoggerFactory.getLogger(SchemaLoader.class);
 
     @BeforeClass
-    public static void loadSchema() throws IOException, ConfigurationException
+    public static void loadSchema() throws ConfigurationException
     {
-        loadSchema(false);
+        prepareServer();
+
+        // Migrations aren't happy if gossiper is not started.  Even if we don't use migrations though,
+        // some tests now expect us to start gossip for them.
+        startGossiper();
+
+        // if you're messing with low-level sstable stuff, it can be useful to inject the schema directly
+        // Schema.instance.load(schemaDefinition());
+        for (KSMetaData ksm : schemaDefinition())
+            MigrationManager.announceNewKeyspace(ksm);
     }
 
-    public static void loadSchema(boolean withOldCfIds) throws IOException, ConfigurationException
+    public static void prepareServer()
     {
         // Cleanup first
         cleanupAndLeaveDirs();
@@ -73,13 +79,7 @@
             }
         });
 
-        // Migrations aren't happy if gossiper is not started.  Even if we don't use migrations though,
-        // some tests now expect us to start gossip for them.
-        startGossiper();
-        // if you're messing with low-level sstable stuff, it can be useful to inject the schema directly
-        // Schema.instance.load(schemaDefinition(withOldCfIds));
-        for (KSMetaData ksm : schemaDefinition(withOldCfIds))
-            MigrationManager.announceNewKeyspace(ksm);
+        Keyspace.setInitialized();
     }
 
     public static void startGossiper()
@@ -93,7 +93,7 @@
         Gossiper.instance.stop();
     }
 
-    public static Collection<KSMetaData> schemaDefinition(boolean withOldCfIds) throws ConfigurationException
+    public static Collection<KSMetaData> schemaDefinition() throws ConfigurationException
     {
         List<KSMetaData> schema = new ArrayList<KSMetaData>();
 
@@ -106,6 +106,7 @@
         String ks6 = "Keyspace6";
         String ks_kcs = "KeyCacheSpace";
         String ks_rcs = "RowCacheSpace";
+        String ks_ccs = "CounterCacheSpace";
         String ks_nocommit = "NoCommitlogSpace";
         String ks_prsi = "PerRowSecondaryIndex";
         String ks_cql = "cql_keyspace";
@@ -117,8 +118,6 @@
         Map<String, String> opts_rf3 = KSMetaData.optsWithRF(3);
         Map<String, String> opts_rf5 = KSMetaData.optsWithRF(5);
 
-        ColumnFamilyType st = ColumnFamilyType.Standard;
-        ColumnFamilyType su = ColumnFamilyType.Super;
         AbstractType bytes = BytesType.instance;
 
         AbstractType<?> composite = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{BytesType.instance, TimeUUIDType.instance, IntegerType.instance}));
@@ -130,12 +129,6 @@
         aliases.put((byte)'T', ReversedType.getInstance(TimeUUIDType.instance));
         AbstractType<?> dynamicComposite = DynamicCompositeType.getInstance(aliases);
 
-        // these column definitions will be applied to the jdbc utf8 and integer column families respectively.
-        Map<ByteBuffer, ColumnDefinition> integerColumn = new HashMap<ByteBuffer, ColumnDefinition>();
-        integerColumn.put(IntegerType.instance.fromString("42"), ColumnDefinition.regularDef(IntegerType.instance.fromString("42"), UTF8Type.instance, null));
-        Map<ByteBuffer, ColumnDefinition> utf8Column = new HashMap<ByteBuffer, ColumnDefinition>();
-        utf8Column.put(UTF8Type.instance.fromString("fortytwo"), ColumnDefinition.regularDef(UTF8Type.instance.fromString("fortytwo"), IntegerType.instance, null));
-
         // Make it easy to test compaction
         Map<String, String> compactionOptions = new HashMap<String, String>();
         compactionOptions.put("tombstone_compaction_interval", "1");
@@ -155,12 +148,7 @@
                                            standardCFMD(ks1, "StandardGCGS0").gcGraceSeconds(0),
                                            standardCFMD(ks1, "StandardLong1"),
                                            standardCFMD(ks1, "StandardLong2"),
-                                           new CFMetaData(ks1,
-                                                          "ValuesWithQuotes",
-                                                          st,
-                                                          BytesType.instance,
-                                                          null)
-                                                   .defaultValidator(UTF8Type.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "ValuesWithQuotes", BytesType.instance).defaultValidator(UTF8Type.instance),
                                            superCFMD(ks1, "Super1", LongType.instance),
                                            superCFMD(ks1, "Super2", LongType.instance),
                                            superCFMD(ks1, "Super3", LongType.instance),
@@ -169,66 +157,34 @@
                                            superCFMD(ks1, "Super6", LexicalUUIDType.instance, UTF8Type.instance),
                                            indexCFMD(ks1, "Indexed1", true),
                                            indexCFMD(ks1, "Indexed2", false),
-                                           new CFMetaData(ks1,
-                                                          "StandardInteger1",
-                                                          st,
-                                                          IntegerType.instance,
-                                                          null),
-                                           new CFMetaData(ks1,
-                                                          "StandardLong3",
-                                                          st,
-                                                          LongType.instance,
-                                                          null),
-                                           new CFMetaData(ks1,
-                                                          "Counter1",
-                                                          st,
-                                                          bytes,
-                                                          null)
-                                                   .defaultValidator(CounterColumnType.instance),
-                                           new CFMetaData(ks1,
-                                                          "SuperCounter1",
-                                                          su,
-                                                          bytes,
-                                                          bytes)
-                                                   .defaultValidator(CounterColumnType.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "StandardInteger1", IntegerType.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "StandardLong3", IntegerType.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "Counter1", bytes).defaultValidator(CounterColumnType.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "SuperCounter1", bytes, bytes).defaultValidator(CounterColumnType.instance),
                                            superCFMD(ks1, "SuperDirectGC", BytesType.instance).gcGraceSeconds(0),
-                                           jdbcCFMD(ks1, "JdbcInteger", IntegerType.instance).columnMetadata(integerColumn),
-                                           jdbcCFMD(ks1, "JdbcUtf8", UTF8Type.instance).columnMetadata(utf8Column),
+                                           jdbcSparseCFMD(ks1, "JdbcInteger", IntegerType.instance).addColumnDefinition(integerColumn(ks1, "JdbcInteger")),
+                                           jdbcSparseCFMD(ks1, "JdbcUtf8", UTF8Type.instance).addColumnDefinition(utf8Column(ks1, "JdbcUtf8")),
                                            jdbcCFMD(ks1, "JdbcLong", LongType.instance),
                                            jdbcCFMD(ks1, "JdbcBytes", bytes),
                                            jdbcCFMD(ks1, "JdbcAscii", AsciiType.instance),
-                                           new CFMetaData(ks1,
-                                                          "StandardComposite",
-                                                          st,
-                                                          composite,
-                                                          null),
-                                           new CFMetaData(ks1,
-                                                          "StandardComposite2",
-                                                          st,
-                                                          compositeMaxMin,
-                                                          null),
-                                           new CFMetaData(ks1,
-                                                          "StandardDynamicComposite",
-                                                          st,
-                                                          dynamicComposite,
-                                                          null),
+                                           CFMetaData.denseCFMetaData(ks1, "StandardComposite", composite),
+                                           CFMetaData.denseCFMetaData(ks1, "StandardComposite2", compositeMaxMin),
+                                           CFMetaData.denseCFMetaData(ks1, "StandardDynamicComposite", dynamicComposite),
                                            standardCFMD(ks1, "StandardLeveled")
                                                                                .compactionStrategyClass(LeveledCompactionStrategy.class)
                                                                                .compactionStrategyOptions(leveledOptions),
                                            standardCFMD(ks1, "legacyleveled")
                                                                                .compactionStrategyClass(LeveledCompactionStrategy.class)
                                                                                .compactionStrategyOptions(leveledOptions),
+                                           standardCFMD(ks1, "StandardLowIndexInterval").minIndexInterval(8)
+                                                                                        .maxIndexInterval(256)
+                                                                                        .caching(CachingOptions.NONE),
+
                                            standardCFMD(ks1, "UUIDKeys").keyValidator(UUIDType.instance),
-                                           new CFMetaData(ks1,
-                                                          "MixedTypes",
-                                                          st,
-                                                          LongType.instance,
-                                                          null).keyValidator(UUIDType.instance).defaultValidator(BooleanType.instance),
-                                           new CFMetaData(ks1,
-                                                          "MixedTypesComposite",
-                                                          st,
-                                                          composite,
-                                                          null).keyValidator(composite).defaultValidator(BooleanType.instance)));
+                                           CFMetaData.denseCFMetaData(ks1, "MixedTypes", LongType.instance).keyValidator(UUIDType.instance).defaultValidator(BooleanType.instance),
+                                           CFMetaData.denseCFMetaData(ks1, "MixedTypesComposite", composite).keyValidator(composite).defaultValidator(BooleanType.instance),
+                                           standardCFMD(ks1, "AsciiKeys").keyValidator(AsciiType.instance)
+        ));
 
         // Keyspace 2
         schema.add(KSMetaData.testMetadata(ks2,
@@ -241,8 +197,8 @@
                                            superCFMD(ks2, "Super3", bytes),
                                            superCFMD(ks2, "Super4", TimeUUIDType.instance),
                                            indexCFMD(ks2, "Indexed1", true),
-                                           compositeIndexCFMD(ks2, "Indexed2", true, withOldCfIds),
-                                           compositeIndexCFMD(ks2, "Indexed3", true, withOldCfIds).gcGraceSeconds(0)));
+                                           compositeIndexCFMD(ks2, "Indexed2", true),
+                                           compositeIndexCFMD(ks2, "Indexed3", true).gcGraceSeconds(0)));
 
         // Keyspace 3
         schema.add(KSMetaData.testMetadata(ks3,
@@ -263,11 +219,7 @@
                                            standardCFMD(ks4, "Standard3"),
                                            superCFMD(ks4, "Super3", bytes),
                                            superCFMD(ks4, "Super4", TimeUUIDType.instance),
-                                           new CFMetaData(ks4,
-                                                          "Super5",
-                                                          su,
-                                                          TimeUUIDType.instance,
-                                                          bytes)));
+                                           CFMetaData.denseCFMetaData(ks4, "Super5", TimeUUIDType.instance, bytes)));
 
         // Keyspace 5
         schema.add(KSMetaData.testMetadata(ks5,
@@ -295,8 +247,19 @@
         schema.add(KSMetaData.testMetadata(ks_rcs,
                                            simple,
                                            opts_rf1,
-                                           standardCFMD(ks_rcs, "CFWithoutCache").caching(CFMetaData.Caching.NONE),
-                                           standardCFMD(ks_rcs, "CachedCF").caching(CFMetaData.Caching.ALL)));
+                                           standardCFMD(ks_rcs, "CFWithoutCache").caching(CachingOptions.NONE),
+                                           standardCFMD(ks_rcs, "CachedCF").caching(CachingOptions.ALL),
+                                           standardCFMD(ks_rcs, "CachedIntCF").
+                                                   defaultValidator(IntegerType.instance).
+                                                   caching(new CachingOptions(new CachingOptions.KeyCache(CachingOptions.KeyCache.Type.ALL),
+                                                                                  new CachingOptions.RowCache(CachingOptions.RowCache.Type.HEAD, 100)))));
+
+        // CounterCacheSpace
+        schema.add(KSMetaData.testMetadata(ks_ccs,
+                                           simple,
+                                           opts_rf1,
+                                           standardCFMD(ks_ccs, "Counter1").defaultValidator(CounterColumnType.instance),
+                                           standardCFMD(ks_ccs, "Counter2").defaultValidator(CounterColumnType.instance)));
 
         schema.add(KSMetaData.testMetadataNotDurable(ks_nocommit,
                                                      simple,
@@ -307,7 +270,7 @@
         schema.add(KSMetaData.testMetadata(ks_prsi,
                                            simple,
                                            opts_rf1,
-                                           perRowIndexedCFMD(ks_prsi, "Indexed1", withOldCfIds)));
+                                           perRowIndexedCFMD(ks_prsi, "Indexed1")));
 
         // CQLKeyspace
         schema.add(KSMetaData.testMetadata(ks_cql,
@@ -349,23 +312,43 @@
         return schema;
     }
 
-    private static CFMetaData perRowIndexedCFMD(String ksName, String cfName, boolean withOldCfIds)
+    private static ColumnDefinition integerColumn(String ksName, String cfName)
+    {
+        return new ColumnDefinition(ksName,
+                                    cfName,
+                                    new ColumnIdentifier(IntegerType.instance.fromString("42"), IntegerType.instance),
+                                    UTF8Type.instance,
+                                    null,
+                                    null,
+                                    null,
+                                    null,
+                                    ColumnDefinition.Kind.REGULAR);
+    }
+
+    private static ColumnDefinition utf8Column(String ksName, String cfName)
+    {
+        return new ColumnDefinition(ksName,
+                                    cfName,
+                                    new ColumnIdentifier("fortytwo", true),
+                                    UTF8Type.instance,
+                                    null,
+                                    null,
+                                    null,
+                                    null,
+                                    ColumnDefinition.Kind.REGULAR);
+    }
+
+    private static CFMetaData perRowIndexedCFMD(String ksName, String cfName)
     {
         final Map<String, String> indexOptions = Collections.singletonMap(
                                                       SecondaryIndex.CUSTOM_INDEX_OPTION_NAME,
                                                       PerRowSecondaryIndexTest.TestIndex.class.getName());
-        return standardCFMD(ksName, cfName)
-                .keyValidator(AsciiType.instance)
-                .columnMetadata(new HashMap<ByteBuffer, ColumnDefinition>()
-                {{
-                        ByteBuffer cName = ByteBuffer.wrap("indexed".getBytes(StandardCharsets.UTF_8));
-                        put(cName, new ColumnDefinition(cName,
-                                AsciiType.instance,
-                                IndexType.CUSTOM,
-                                indexOptions,
-                                ByteBufferUtil.bytesToHex(cName),
-                                null, ColumnDefinition.Type.REGULAR));
-                }});
+
+        CFMetaData cfm =  CFMetaData.sparseCFMetaData(ksName, cfName, BytesType.instance).keyValidator(AsciiType.instance);
+
+        ByteBuffer cName = ByteBufferUtil.bytes("indexed");
+        return cfm.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(cfm, cName, AsciiType.instance, null)
+                                                                .setIndex("indexe1", IndexType.CUSTOM, indexOptions));
     }
 
     private static void useCompression(List<KSMetaData> schema)
@@ -381,7 +364,7 @@
 
     private static CFMetaData standardCFMD(String ksName, String cfName)
     {
-        return new CFMetaData(ksName, cfName, ColumnFamilyType.Standard, BytesType.instance, null);
+        return CFMetaData.denseCFMetaData(ksName, cfName, BytesType.instance);
     }
     private static CFMetaData superCFMD(String ksName, String cfName, AbstractType subcc)
     {
@@ -389,39 +372,36 @@
     }
     private static CFMetaData superCFMD(String ksName, String cfName, AbstractType cc, AbstractType subcc)
     {
-        return new CFMetaData(ksName, cfName, ColumnFamilyType.Super, cc, subcc);
+        return CFMetaData.denseCFMetaData(ksName, cfName, cc, subcc);
     }
     private static CFMetaData indexCFMD(String ksName, String cfName, final Boolean withIdxType) throws ConfigurationException
     {
-        return standardCFMD(ksName, cfName)
-               .keyValidator(AsciiType.instance)
-               .columnMetadata(new HashMap<ByteBuffer, ColumnDefinition>()
-                   {{
-                        ByteBuffer cName = ByteBuffer.wrap("birthdate".getBytes(StandardCharsets.UTF_8));
-                        IndexType keys = withIdxType ? IndexType.KEYS : null;
-                        put(cName, ColumnDefinition.regularDef(cName, LongType.instance, null).setIndex(withIdxType ? ByteBufferUtil.bytesToHex(cName) : null, keys, null));
-                    }});
+        CFMetaData cfm = CFMetaData.sparseCFMetaData(ksName, cfName, BytesType.instance).keyValidator(AsciiType.instance);
+
+        ByteBuffer cName = ByteBufferUtil.bytes("birthdate");
+        IndexType keys = withIdxType ? IndexType.KEYS : null;
+        return cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, cName, LongType.instance, null)
+                                                       .setIndex(withIdxType ? ByteBufferUtil.bytesToHex(cName) : null, keys, null));
     }
-    private static CFMetaData compositeIndexCFMD(String ksName, String cfName, final Boolean withIdxType, boolean withOldCfIds) throws ConfigurationException
+    private static CFMetaData compositeIndexCFMD(String ksName, String cfName, final Boolean withIdxType) throws ConfigurationException
     {
         final CompositeType composite = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{UTF8Type.instance, UTF8Type.instance})); 
-        return new CFMetaData(ksName,
-                cfName,
-                ColumnFamilyType.Standard,
-                composite,
-                null)
-               .columnMetadata(new HashMap<ByteBuffer, ColumnDefinition>()
-                {{
-                   ByteBuffer cName = ByteBuffer.wrap("col1".getBytes(StandardCharsets.UTF_8));
-                   IndexType idxType = withIdxType ? IndexType.COMPOSITES : null;
-                   put(cName, ColumnDefinition.regularDef(cName, UTF8Type.instance, 1)
-                                              .setIndex(withIdxType ? "col1_idx" : null, idxType, Collections.<String, String>emptyMap()));
-                }});
+        CFMetaData cfm = CFMetaData.sparseCFMetaData(ksName, cfName, composite);
+
+        ByteBuffer cName = ByteBufferUtil.bytes("col1");
+        IndexType idxType = withIdxType ? IndexType.COMPOSITES : null;
+        return cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, cName, UTF8Type.instance, 1)
+                                                       .setIndex(withIdxType ? "col1_idx" : null, idxType, Collections.<String, String>emptyMap()));
     }
     
     private static CFMetaData jdbcCFMD(String ksName, String cfName, AbstractType comp)
     {
-        return new CFMetaData(ksName, cfName, ColumnFamilyType.Standard, comp, null).defaultValidator(comp);
+        return CFMetaData.denseCFMetaData(ksName, cfName, comp).defaultValidator(comp);
+    }
+
+    private static CFMetaData jdbcSparseCFMD(String ksName, String cfName, AbstractType comp)
+    {
+        return CFMetaData.sparseCFMetaData(ksName, cfName, comp).defaultValidator(comp);
     }
 
     public static void cleanupAndLeaveDirs()
@@ -461,25 +441,25 @@
         DatabaseDescriptor.createAllDirectories();
     }
 
-    protected void insertData(String keyspace, String columnFamily, int offset, int numberOfRows) throws IOException
+    protected void insertData(String keyspace, String columnFamily, int offset, int numberOfRows)
     {
         for (int i = offset; i < offset + numberOfRows; i++)
         {
             ByteBuffer key = ByteBufferUtil.bytes("key" + i);
-            RowMutation rowMutation = new RowMutation(keyspace, key);
-            rowMutation.add(columnFamily, ByteBufferUtil.bytes("col" + i), ByteBufferUtil.bytes("val" + i), System.currentTimeMillis());
-            rowMutation.applyUnsafe();
+            Mutation mutation = new Mutation(keyspace, key);
+            mutation.add(columnFamily, Util.cellname("col" + i), ByteBufferUtil.bytes("val" + i), System.currentTimeMillis());
+            mutation.applyUnsafe();
         }
     }
 
     /* usually used to populate the cache */
-    protected void readData(String keyspace, String columnFamily, int offset, int numberOfRows) throws IOException
+    protected void readData(String keyspace, String columnFamily, int offset, int numberOfRows)
     {
         ColumnFamilyStore store = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
         for (int i = offset; i < offset + numberOfRows; i++)
         {
             DecoratedKey key = Util.dk("key" + i);
-            store.getColumnFamily(QueryFilter.getNamesFilter(key, columnFamily, FBUtilities.singleton(ByteBufferUtil.bytes("col" + i), store.getComparator()), System.currentTimeMillis()));
+            store.getColumnFamily(Util.namesQueryFilter(store, key, "col" + i));
         }
     }
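SchemaLoader.loadSchema() is now split so that prepareServer() handles directory cleanup and Keyspace.setInitialized(), while announcing the test schema happens separately. A hypothetical test class that only needs the server prepared (the class name is illustrative, not part of this patch) could therefore do:

    import org.junit.BeforeClass;

    import org.apache.cassandra.SchemaLoader;

    public class DirectoriesOnlyTest
    {
        @BeforeClass
        public static void setup()
        {
            // Cleans the test directories and marks keyspaces initialized,
            // without announcing the full test schema.
            SchemaLoader.prepareServer();
        }
    }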
 
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index a71dc48..1015be6 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -28,25 +28,29 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
 import java.util.UUID;
 import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.compaction.AbstractCompactionTask;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CounterId;
@@ -62,6 +66,11 @@
         return StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(key));
     }
 
+    public static DecoratedKey dk(String key, AbstractType type)
+    {
+        return StorageService.getPartitioner().decorateKey(type.fromString(key));
+    }
+
     public static DecoratedKey dk(ByteBuffer key)
     {
         return StorageService.getPartitioner().decorateKey(key);
@@ -74,22 +83,43 @@
 
     public static RowPosition rp(String key, IPartitioner partitioner)
     {
-        return RowPosition.forKey(ByteBufferUtil.bytes(key), partitioner);
+        return RowPosition.ForKey.get(ByteBufferUtil.bytes(key), partitioner);
     }
 
-    public static Column column(String name, String value, long timestamp)
+    public static CellName cellname(ByteBuffer... bbs)
     {
-        return new Column(ByteBufferUtil.bytes(name), ByteBufferUtil.bytes(value), timestamp);
+        if (bbs.length == 1)
+            return CellNames.simpleDense(bbs[0]);
+        else
+            return CellNames.compositeDense(bbs);
     }
 
-    public static Column expiringColumn(String name, String value, long timestamp, int ttl)
+    public static CellName cellname(String... strs)
     {
-        return new ExpiringColumn(ByteBufferUtil.bytes(name), ByteBufferUtil.bytes(value), timestamp, ttl);
+        ByteBuffer[] bbs = new ByteBuffer[strs.length];
+        for (int i = 0; i < strs.length; i++)
+            bbs[i] = ByteBufferUtil.bytes(strs[i]);
+        return cellname(bbs);
     }
 
-    public static Column counterColumn(String name, long value, long timestamp)
+    public static CellName cellname(int i)
     {
-        return new CounterUpdateColumn(ByteBufferUtil.bytes(name), value, timestamp);
+        return CellNames.simpleDense(ByteBufferUtil.bytes(i));
+    }
+
+    public static CellName cellname(long l)
+    {
+        return CellNames.simpleDense(ByteBufferUtil.bytes(l));
+    }
+
+    public static Cell column(String name, String value, long timestamp)
+    {
+        return new BufferCell(cellname(name), ByteBufferUtil.bytes(value), timestamp);
+    }
+
+    public static Cell expiringColumn(String name, String value, long timestamp, int ttl)
+    {
+        return new BufferExpiringCell(cellname(name), ByteBufferUtil.bytes(value), timestamp, ttl);
     }
 
     public static Token token(String key)
@@ -112,11 +142,11 @@
         return new Bounds<RowPosition>(rp(left), rp(right));
     }
 
-    public static void addMutation(RowMutation rm, String columnFamilyName, String superColumnName, long columnName, String value, long timestamp)
+    public static void addMutation(Mutation rm, String columnFamilyName, String superColumnName, long columnName, String value, long timestamp)
     {
-        ByteBuffer cname = superColumnName == null
-                         ? getBytes(columnName)
-                         : CompositeType.build(ByteBufferUtil.bytes(superColumnName), getBytes(columnName));
+        CellName cname = superColumnName == null
+                       ? CellNames.simpleDense(getBytes(columnName))
+                       : CellNames.compositeDense(ByteBufferUtil.bytes(superColumnName), getBytes(columnName));
         rm.add(columnFamilyName, cname, ByteBufferUtil.bytes(value), timestamp);
     }
 
@@ -138,12 +168,12 @@
         return bb;
     }
 
-    public static List<Row> getRangeSlice(ColumnFamilyStore cfs) throws IOException, ExecutionException, InterruptedException
+    public static List<Row> getRangeSlice(ColumnFamilyStore cfs)
     {
         return getRangeSlice(cfs, null);
     }
 
-    public static List<Row> getRangeSlice(ColumnFamilyStore cfs, ByteBuffer superColumn) throws IOException, ExecutionException, InterruptedException
+    public static List<Row> getRangeSlice(ColumnFamilyStore cfs, ByteBuffer superColumn)
     {
         IDiskAtomFilter filter = superColumn == null
                                ? new IdentityQueryFilter()
@@ -156,16 +186,16 @@
     /**
      * Writes out a bunch of mutations for a single column family.
      *
-     * @param mutations A group of RowMutations for the same keyspace and column family.
+     * @param mutations A group of Mutations for the same keyspace and column family.
      * @return The ColumnFamilyStore that was used.
      */
-    public static ColumnFamilyStore writeColumnFamily(List<IMutation> mutations) throws IOException, ExecutionException, InterruptedException
+    public static ColumnFamilyStore writeColumnFamily(List<Mutation> mutations)
     {
         IMutation first = mutations.get(0);
         String keyspaceName = first.getKeyspaceName();
         UUID cfid = first.getColumnFamilyIds().iterator().next();
 
-        for (IMutation rm : mutations)
+        for (Mutation rm : mutations)
             rm.apply();
 
         ColumnFamilyStore store = Keyspace.open(keyspaceName).getColumnFamilyStore(cfid);
@@ -173,7 +203,7 @@
         return store;
     }
 
-    public static ColumnFamily getColumnFamily(Keyspace keyspace, DecoratedKey key, String cfName) throws IOException
+    public static ColumnFamily getColumnFamily(Keyspace keyspace, DecoratedKey key, String cfName)
     {
         ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(cfName);
         assert cfStore != null : "Column family " + cfName + " has not been defined";
@@ -281,4 +311,69 @@
 
         assert thrown : exception.getName() + " not received";
     }
+
+    public static ByteBuffer serializeForSSTable(ColumnFamily cf)
+    {
+        try
+        {
+            DataOutputBuffer out = new DataOutputBuffer();
+            DeletionTime.serializer.serialize(cf.deletionInfo().getTopLevelDeletion(), out);
+            out.writeInt(cf.getColumnCount());
+            new ColumnIndex.Builder(cf, ByteBufferUtil.EMPTY_BYTE_BUFFER, out).build(cf);
+            return ByteBuffer.wrap(out.toByteArray());
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key)
+    {
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
+        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+    }
+
+    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key, String... names)
+    {
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
+        for (String str : names)
+            s.add(cellname(str));
+        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+    }
+
+    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key, CellName... names)
+    {
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
+        for (CellName n : names)
+            s.add(n);
+        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+    }
+
+    public static NamesQueryFilter namesFilter(ColumnFamilyStore cfs, String... names)
+    {
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
+        for (String str : names)
+            s.add(cellname(str));
+        return new NamesQueryFilter(s);
+    }
+
+    public static String string(ByteBuffer bb)
+    {
+        try
+        {
+            return ByteBufferUtil.string(bb);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static RangeTombstone tombstone(String start, String finish, long timestamp, int localtime)
+    {
+        Composite startName = CellNames.simpleDense(ByteBufferUtil.bytes(start));
+        Composite endName = CellNames.simpleDense(ByteBufferUtil.bytes(finish));
+        return new RangeTombstone(startName, endName, timestamp, localtime);
+    }
 }
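The helpers added to Util above (cellname, namesQueryFilter, tombstone, ...) wrap the 2.1 cell-name machinery so tests stay terse. A hedged usage sketch, assuming the SchemaLoader fixtures have already been loaded; the keyspace, table and column names are the usual test fixtures and the wrapper class is illustrative.

    import org.apache.cassandra.Util;
    import org.apache.cassandra.db.ColumnFamilyStore;
    import org.apache.cassandra.db.DecoratedKey;
    import org.apache.cassandra.db.Keyspace;
    import org.apache.cassandra.db.filter.QueryFilter;

    final class UtilHelpersSketch
    {
        static void readTwoCells()
        {
            ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
            DecoratedKey key = Util.dk("key1");

            // Build a names filter over two cells and read them back through the store.
            QueryFilter filter = Util.namesQueryFilter(cfs, key, "col1", "col2");
            cfs.getColumnFamily(filter);
        }
    }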
diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
new file mode 100644
index 0000000..28afef1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cache;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class AutoSavingCacheTest extends SchemaLoader
+{
+    @Test
+    public void testSerializeAndLoadKeyCache() throws Exception
+    {
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
+        for (int i = 0; i < 2; i++)
+        {
+            Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"));
+            rm.add("Standard1", Util.cellname("c1"), ByteBufferUtil.bytes(i), 0);
+            rm.apply();
+            cfs.forceBlockingFlush();
+        }
+
+        Assert.assertEquals(2, cfs.getSSTables().size());
+
+        // preheat key cache
+        for (SSTableReader sstable : cfs.getSSTables())
+            sstable.getPosition(Util.dk("key1"), SSTableReader.Operator.EQ);
+
+        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = CacheService.instance.keyCache;
+
+        // serialize to file
+        keyCache.submitWrite(keyCache.size()).get();
+        keyCache.clear();
+
+        Assert.assertEquals(0, keyCache.size());
+
+        // then load saved
+        keyCache.loadSaved(cfs);
+        Assert.assertEquals(2, keyCache.size());
+        for (SSTableReader sstable : cfs.getSSTables())
+            Assert.assertNotNull(keyCache.get(new KeyCacheKey(cfs.metadata.cfId, sstable.descriptor, ByteBufferUtil.bytes("key1"))));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cache/CacheProviderTest.java b/test/unit/org/apache/cassandra/cache/CacheProviderTest.java
index 189e888..71d4f80 100644
--- a/test/unit/org/apache/cassandra/cache/CacheProviderTest.java
+++ b/test/unit/org/apache/cassandra/cache/CacheProviderTest.java
@@ -29,10 +29,10 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
 import org.apache.cassandra.db.ColumnFamily;
 
 import com.googlecode.concurrentlinkedhashmap.Weighers;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
 
 import static org.apache.cassandra.Util.column;
 import static org.junit.Assert.*;
@@ -51,7 +51,7 @@
     private void simpleCase(ColumnFamily cf, ICache<MeasureableString, IRowCacheEntry> cache)
     {
         cache.put(key1, cf);
-        assert cache.get(key1) != null;
+        assertNotNull(cache.get(key1));
 
         assertDigests(cache.get(key1), cf);
         cache.put(key2, cf);
@@ -65,8 +65,8 @@
     private void assertDigests(IRowCacheEntry one, ColumnFamily two)
     {
         // CF does not implement .equals
-        assert one instanceof ColumnFamily;
-        assert ColumnFamily.digest((ColumnFamily)one).equals(ColumnFamily.digest(two));
+        assertTrue(one instanceof ColumnFamily);
+        assertEquals(ColumnFamily.digest((ColumnFamily)one), ColumnFamily.digest(two));
     }
 
     // TODO this isn't terribly useful
@@ -100,7 +100,7 @@
 
     private ColumnFamily createCF()
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(keyspaceName, cfName);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspaceName, cfName);
         cf.addColumn(column("vijay", "great", 1));
         cf.addColumn(column("awesome", "vijay", 1));
         return cf;
@@ -142,7 +142,7 @@
             this.string = input;
         }
 
-        public long memorySize()
+        public long unsharedHeapSize()
         {
             return string.length();
         }
diff --git a/test/unit/org/apache/cassandra/cache/ObjectSizeTest.java b/test/unit/org/apache/cassandra/cache/ObjectSizeTest.java
deleted file mode 100644
index 4bb9b20..0000000
--- a/test/unit/org/apache/cassandra/cache/ObjectSizeTest.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package org.apache.cassandra.cache;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.nio.ByteBuffer;
-import java.util.UUID;
-
-import org.junit.Assert;
-
-import org.apache.cassandra.db.ColumnIndex;
-import org.apache.cassandra.db.DeletionTime;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.github.jamm.MemoryMeter;
-import org.junit.Test;
-
-public class ObjectSizeTest
-{
-    public static final MemoryMeter meter = new MemoryMeter().omitSharedBufferOverhead();
-
-    @Test
-    public void testArraySizes()
-    {
-        long size = ObjectSizes.getArraySize(0, 1);
-        long size2 = meter.measureDeep(new byte[0]);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testBiggerArraySizes()
-    {
-        long size = ObjectSizes.getArraySize(0, 1);
-        long size2 = meter.measureDeep(new byte[0]);
-        Assert.assertEquals(size, size2);
-
-        size = ObjectSizes.getArraySize(8, 1);
-        size2 = meter.measureDeep(new byte[8]);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testKeyCacheKey()
-    {
-        KeyCacheKey key = new KeyCacheKey(null, ByteBuffer.wrap(new byte[0]));
-        long size = key.memorySize();
-        long size2 = meter.measureDeep(key);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testKeyCacheValue()
-    {
-        RowIndexEntry entry = new RowIndexEntry(123);
-        long size = entry.memorySize();
-        long size2 = meter.measureDeep(entry);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testKeyCacheValueWithDelInfo()
-    {
-        RowIndexEntry entry = RowIndexEntry.create(123, new DeletionTime(123, 123), ColumnIndex.nothing());
-        long size = entry.memorySize();
-        long size2 = meter.measureDeep(entry);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testRowCacheKey()
-    {
-        UUID id = UUID.randomUUID();
-        RowCacheKey key = new RowCacheKey(id, ByteBuffer.wrap(new byte[11]));
-        long size = key.memorySize();
-        long size2 = meter.measureDeep(key) - meter.measureDeep(id);
-        Assert.assertEquals(size, size2);
-    }
-
-    @Test
-    public void testRowCacheSentinel()
-    {
-        RowCacheSentinel sentinel = new RowCacheSentinel(123);
-        long size = sentinel.memorySize();
-        long size2 = meter.measureDeep(sentinel);
-        Assert.assertEquals(size, size2);
-    }
-}
diff --git a/test/unit/org/apache/cassandra/cli/CliTest.java b/test/unit/org/apache/cassandra/cli/CliTest.java
index 5789fac..38f567b 100644
--- a/test/unit/org/apache/cassandra/cli/CliTest.java
+++ b/test/unit/org/apache/cassandra/cli/CliTest.java
@@ -19,7 +19,7 @@
 package org.apache.cassandra.cli;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.service.EmbeddedCassandraService;
 import org.apache.cassandra.thrift.*;
@@ -227,7 +227,7 @@
         ByteArrayOutputStream outStream = new ByteArrayOutputStream();
 
         // checking if we can connect to the running cassandra node on localhost
-        CliMain.connect("127.0.0.1", 9170);
+        CliMain.connect("127.0.0.1", DatabaseDescriptor.getRpcPort());
 
         // setting new output stream
         CliMain.sessionState.setOut(new PrintStream(outStream));
@@ -254,8 +254,8 @@
             // System.out.println("Result:\n" + result);
             if (statement.startsWith("show schema"))
                 assertEquals(errStream.toString() + "processing" + statement,
-                             "\nWARNING: CQL3 tables are intentionally omitted from 'show schema' output.\n"
-                             + "See https://issues.apache.org/jira/browse/CASSANDRA-4377 for details.\n\n",
+                             "\nWARNING: CQL3 tables are intentionally omitted from 'show schema' output." + String.format("%n")
+                             + "See https://issues.apache.org/jira/browse/CASSANDRA-4377 for details.\n" + String.format("%n"),
                              errStream.toString());
             else
                 assertEquals(errStream.toString() + " processing " + statement, "", errStream.toString());
diff --git a/test/unit/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutorTest.java b/test/unit/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutorTest.java
index aba84ea..5040a24 100644
--- a/test/unit/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutorTest.java
+++ b/test/unit/org/apache/cassandra/concurrent/DebuggableThreadPoolExecutorTest.java
@@ -31,7 +31,7 @@
 public class DebuggableThreadPoolExecutorTest
 {
     @Test
-    public void testSerialization() throws InterruptedException
+    public void testSerialization()
     {
         LinkedBlockingQueue<Runnable> q = new LinkedBlockingQueue<Runnable>(1);
         DebuggableThreadPoolExecutor executor = new DebuggableThreadPoolExecutor(1,
diff --git a/test/unit/org/apache/cassandra/concurrent/WaitQueueTest.java b/test/unit/org/apache/cassandra/concurrent/WaitQueueTest.java
new file mode 100644
index 0000000..3e7cb7b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/concurrent/WaitQueueTest.java
@@ -0,0 +1,158 @@
+package org.apache.cassandra.concurrent;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+import org.junit.*;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import static org.junit.Assert.*;
+
+public class WaitQueueTest
+{
+
+    @Test
+    public void testSerial() throws InterruptedException
+    {
+        testSerial(new WaitQueue());
+    }
+    public void testSerial(final WaitQueue queue) throws InterruptedException
+    {
+        Thread[] ts = new Thread[4];
+        for (int i = 0 ; i < ts.length ; i++)
+            ts[i] = new Thread(new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                WaitQueue.Signal wait = queue.register();
+                try
+                {
+                    wait.await();
+                } catch (InterruptedException e)
+                {
+                    e.printStackTrace();
+                }
+            }
+        });
+        for (int i = 0 ; i < ts.length ; i++)
+            ts[i].start();
+        Thread.sleep(100);
+        queue.signal();
+        queue.signal();
+        queue.signal();
+        queue.signal();
+        for (int i = 0 ; i < ts.length ; i++)
+        {
+            ts[i].join(100);
+            assertFalse(queue.getClass().getName(), ts[i].isAlive());
+        }
+    }
+
+
+    @Test
+    public void testCondition1() throws InterruptedException
+    {
+        testCondition1(new WaitQueue());
+    }
+
+    public void testCondition1(final WaitQueue queue) throws InterruptedException
+    {
+        final AtomicBoolean cond1 = new AtomicBoolean(false);
+        final AtomicBoolean fail = new AtomicBoolean(false);
+        Thread t1 = new Thread(new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                try
+                {
+                    Thread.sleep(200);
+                } catch (InterruptedException e)
+                {
+                    e.printStackTrace();
+                }
+                WaitQueue.Signal wait = queue.register();
+                if (!cond1.get())
+                {
+                    System.err.println("Condition should have already been met");
+                    fail.set(true);
+                }
+            }
+        });
+        t1.start();
+        Thread.sleep(50);
+        cond1.set(true);
+        Thread.sleep(300);
+        queue.signal();
+        t1.join(300);
+        assertFalse(queue.getClass().getName(), t1.isAlive());
+        assertFalse(fail.get());
+    }
+
+    @Test
+    public void testCondition2() throws InterruptedException
+    {
+        testCondition2(new WaitQueue());
+    }
+    public void testCondition2(final WaitQueue queue) throws InterruptedException
+    {
+        final AtomicBoolean condition = new AtomicBoolean(false);
+        final AtomicBoolean fail = new AtomicBoolean(false);
+        Thread t = new Thread(new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                WaitQueue.Signal wait = queue.register();
+                if (condition.get())
+                {
+                    System.err.println("Condition should not have been met yet");
+                    fail.set(true);
+                }
+
+                try
+                {
+                    Thread.sleep(200);
+                    wait.await();
+                } catch (InterruptedException e)
+                {
+                    e.printStackTrace();
+                }
+                if (!condition.get())
+                {
+                    System.err.println("Woke up when condition not met");
+                    fail.set(true);
+                }
+            }
+        });
+        t.start();
+        Thread.sleep(50);
+        condition.set(true);
+        queue.signal();
+        t.join(300);
+        assertFalse(queue.getClass().getName(), t.isAlive());
+        assertFalse(fail.get());
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/config/CFMetaDataTest.java b/test/unit/org/apache/cassandra/config/CFMetaDataTest.java
index b0cafc4..71f21a2 100644
--- a/test/unit/org/apache/cassandra/config/CFMetaDataTest.java
+++ b/test/unit/org/apache/cassandra/config/CFMetaDataTest.java
@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.HashMap;
+import java.util.HashSet;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.cql3.QueryProcessor;
@@ -92,7 +93,7 @@
         assertEquals(thriftCfDef.name, converted.name);
         assertEquals(thriftCfDef.default_validation_class, converted.default_validation_class);
         assertEquals(thriftCfDef.comment, converted.comment);
-        assertEquals(thriftCfDef.column_metadata, converted.column_metadata);
+        assertEquals(new HashSet<>(thriftCfDef.column_metadata), new HashSet<>(converted.column_metadata));
     }
 
     @Test
@@ -105,10 +106,11 @@
                 CFMetaData cfm = cfs.metadata;
                 if (!cfm.isThriftCompatible())
                     continue;
+
                 checkInverses(cfm);
 
                 // Testing with compression to catch #3558
-                CFMetaData withCompression = CFMetaData.rename(cfm, cfm.cfName); // basically a clone
+                CFMetaData withCompression = cfm.copy();
                 withCompression.compressionParameters(new CompressionParameters(SnappyCompressor.instance, 32768, new HashMap<String, String>()));
                 checkInverses(withCompression);
             }
@@ -122,14 +124,14 @@
         // Test thrift conversion
         CFMetaData before = cfm;
         CFMetaData after = CFMetaData.fromThriftForUpdate(before.toThrift(), before);
-        assertThat(after, is(before));
+        assert before.equals(after) : String.format("%n%s%n!=%n%s", before, after);
 
         // Test schema conversion
-        RowMutation rm = cfm.toSchema(System.currentTimeMillis());
+        Mutation rm = cfm.toSchema(System.currentTimeMillis());
         ColumnFamily serializedCf = rm.getColumnFamily(Schema.instance.getId(Keyspace.SYSTEM_KS, SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF));
         ColumnFamily serializedCD = rm.getColumnFamily(Schema.instance.getId(Keyspace.SYSTEM_KS, SystemKeyspace.SCHEMA_COLUMNS_CF));
         UntypedResultSet.Row result = QueryProcessor.resultify("SELECT * FROM system.schema_columnfamilies", new Row(k, serializedCf)).one();
-        CFMetaData newCfm = CFMetaData.addColumnDefinitionsFromSchema(CFMetaData.fromSchemaNoColumnsNoTriggers(result), new Row(k, serializedCD));
-        assertThat(newCfm, is(cfm));
+        CFMetaData newCfm = CFMetaData.fromSchemaNoTriggers(result, ColumnDefinition.resultify(new Row(k, serializedCD)));
+        assert cfm.equals(newCfm) : String.format("%n%s%n!=%n%s", cfm, newCfm);
     }
 }
diff --git a/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java b/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java
index fa7343c..890c46c 100644
--- a/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java
+++ b/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java
@@ -20,12 +20,11 @@
  *
  */
 
-
+import org.junit.Assert;
 import org.junit.Test;
 
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.thrift.IndexType;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ColumnDefinitionTest
@@ -33,20 +32,22 @@
     @Test
     public void testSerializeDeserialize() throws Exception
     {
-        ColumnDefinition cd0 = ColumnDefinition.regularDef(ByteBufferUtil.bytes("TestColumnDefinitionName0"), BytesType.instance, null)
+        CFMetaData cfm = CFMetaData.denseCFMetaData("ks", "cf", UTF8Type.instance);
+
+        ColumnDefinition cd0 = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("TestColumnDefinitionName0"), BytesType.instance, null)
                                                .setIndex("random index name 0", IndexType.KEYS, null);
 
-        ColumnDefinition cd1 = ColumnDefinition.regularDef(ByteBufferUtil.bytes("TestColumnDefinition1"), LongType.instance, null);
+        ColumnDefinition cd1 = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("TestColumnDefinition1"), LongType.instance, null);
 
-        testSerializeDeserialize(cd0);
-        testSerializeDeserialize(cd1);
+        testSerializeDeserialize(cfm, cd0);
+        testSerializeDeserialize(cfm, cd1);
     }
 
-    protected void testSerializeDeserialize(ColumnDefinition cd) throws Exception
+    protected void testSerializeDeserialize(CFMetaData cfm, ColumnDefinition cd) throws Exception
     {
-        ColumnDefinition newCd = ColumnDefinition.fromThrift(cd.toThrift(), false);
-        assert cd != newCd;
-        assert cd.hashCode() == newCd.hashCode();
-        assert cd.equals(newCd);
+        ColumnDefinition newCd = ColumnDefinition.fromThrift(cfm.ksName, cfm.cfName, cfm.comparator.asAbstractType(), null, cd.toThrift());
+        Assert.assertNotSame(cd, newCd);
+        Assert.assertEquals(cd.hashCode(), newCd.hashCode());
+        Assert.assertEquals(cd, newCd);
     }
 }
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
index 7d1c82b..f6d4ad4 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
@@ -18,27 +18,27 @@
 */
 package org.apache.cassandra.config;
 
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.service.MigrationManager;
 
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import static org.junit.Assert.*;
-
-
-import java.io.IOException;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class DatabaseDescriptorTest
 {
     @Test
-    public void testCFMetaDataSerialization() throws IOException, ConfigurationException, InvalidRequestException
+    public void testCFMetaDataSerialization() throws ConfigurationException, InvalidRequestException
     {
         // test serialization of all defined test CFs.
         for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
@@ -46,21 +46,21 @@
             for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(keyspaceName).values())
             {
                 CFMetaData cfmDupe = CFMetaData.fromThrift(cfm.toThrift());
-                assert cfmDupe != null;
-                assert cfmDupe.equals(cfm);
+                assertNotNull(cfmDupe);
+                assertEquals(cfm, cfmDupe);
             }
         }
     }
 
     @Test
-    public void testKSMetaDataSerialization() throws IOException, ConfigurationException
+    public void testKSMetaDataSerialization() throws ConfigurationException
     {
         for (KSMetaData ksm : Schema.instance.getKeyspaceDefinitions())
         {
             // Not testing round-trip on the KsDef via serDe() because maps
             KSMetaData ksmDupe = KSMetaData.fromThrift(ksm.toThrift());
-            assert ksmDupe != null;
-            assert ksmDupe.equals(ksm);
+            assertNotNull(ksmDupe);
+            assertEquals(ksm, ksmDupe);
         }
     }
 
@@ -70,9 +70,10 @@
     {
         SchemaLoader.cleanupAndLeaveDirs();
         DatabaseDescriptor.loadSchemas();
-        assert Schema.instance.getNonSystemKeyspaces().size() == 0;
+        assertEquals(0, Schema.instance.getNonSystemKeyspaces().size());
 
         Gossiper.instance.start((int)(System.currentTimeMillis() / 1000));
+        Keyspace.setInitialized();
 
         try
         {
@@ -80,19 +81,19 @@
             MigrationManager.announceNewKeyspace(KSMetaData.testMetadata("ks0", SimpleStrategy.class, KSMetaData.optsWithRF(3)));
             MigrationManager.announceNewKeyspace(KSMetaData.testMetadata("ks1", SimpleStrategy.class, KSMetaData.optsWithRF(3)));
 
-            assert Schema.instance.getKSMetaData("ks0") != null;
-            assert Schema.instance.getKSMetaData("ks1") != null;
+            assertNotNull(Schema.instance.getKSMetaData("ks0"));
+            assertNotNull(Schema.instance.getKSMetaData("ks1"));
 
             Schema.instance.clearKeyspaceDefinition(Schema.instance.getKSMetaData("ks0"));
             Schema.instance.clearKeyspaceDefinition(Schema.instance.getKSMetaData("ks1"));
 
-            assert Schema.instance.getKSMetaData("ks0") == null;
-            assert Schema.instance.getKSMetaData("ks1") == null;
+            assertNull(Schema.instance.getKSMetaData("ks0"));
+            assertNull(Schema.instance.getKSMetaData("ks1"));
 
             DatabaseDescriptor.loadSchemas();
 
-            assert Schema.instance.getKSMetaData("ks0") != null;
-            assert Schema.instance.getKSMetaData("ks1") != null;
+            assertNotNull(Schema.instance.getKSMetaData("ks0"));
+            assertNotNull(Schema.instance.getKSMetaData("ks1"));
         }
         finally
         {
@@ -106,6 +107,7 @@
         // By default, we should load from the yaml
         Config config = DatabaseDescriptor.loadConfig();
         assertEquals("Test Cluster", config.cluster_name);
+        Keyspace.setInitialized();
 
         // Now try custom loader
         ConfigurationLoader testLoader = new TestLoader();
diff --git a/test/unit/org/apache/cassandra/config/DefsTest.java b/test/unit/org/apache/cassandra/config/DefsTest.java
index 943745a..94738ac 100644
--- a/test/unit/org/apache/cassandra/config/DefsTest.java
+++ b/test/unit/org/apache/cassandra/config/DefsTest.java
@@ -19,16 +19,13 @@
 package org.apache.cassandra.config;
 
 import java.io.File;
-import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.ExecutionException;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.marshal.TimeUUIDType;
@@ -39,10 +36,10 @@
 import org.apache.cassandra.locator.OldNetworkTopologyStrategy;
 import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.thrift.IndexType;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
+import static org.apache.cassandra.Util.cellname;
 
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
@@ -52,51 +49,48 @@
     @Test
     public void testCFMetaDataApply() throws ConfigurationException
     {
-        Map<ByteBuffer, ColumnDefinition> indexes = new HashMap<ByteBuffer, ColumnDefinition>();
-        for (int i = 0; i < 5; i++)
-        {
-            ByteBuffer name = ByteBuffer.wrap(new byte[] { (byte)i });
-            indexes.put(name, ColumnDefinition.regularDef(name, BytesType.instance, null).setIndex(Integer.toString(i), IndexType.KEYS, null));
-        }
         CFMetaData cfm = new CFMetaData("Keyspace1",
                                         "TestApplyCFM_CF",
                                         ColumnFamilyType.Standard,
-                                        BytesType.instance);
+                                        new SimpleDenseCellNameType(BytesType.instance));
+
+        for (int i = 0; i < 5; i++)
+        {
+            ByteBuffer name = ByteBuffer.wrap(new byte[] { (byte)i });
+            cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, name, BytesType.instance, null).setIndex(Integer.toString(i), IndexType.KEYS, null));
+        }
 
         cfm.comment("No comment")
            .readRepairChance(0.5)
-           .replicateOnWrite(false)
            .gcGraceSeconds(100000)
-           .defaultValidator(null)
            .minCompactionThreshold(500)
-           .maxCompactionThreshold(500)
-           .columnMetadata(indexes);
+           .maxCompactionThreshold(500);
 
         // we'll be adding this one later. make sure it's not already there.
         assert cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 5 })) == null;
 
-        CFMetaData cfNew = cfm.clone();
+        CFMetaData cfNew = cfm.copy();
 
         // add one.
-        ColumnDefinition addIndexDef = ColumnDefinition.regularDef(ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance, null)
+        ColumnDefinition addIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance, null)
                                                        .setIndex("5", IndexType.KEYS, null);
         cfNew.addColumnDefinition(addIndexDef);
 
         // remove one.
-        ColumnDefinition removeIndexDef = ColumnDefinition.regularDef(ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance, null)
+        ColumnDefinition removeIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance, null)
                                                           .setIndex("0", IndexType.KEYS, null);
         assert cfNew.removeColumnDefinition(removeIndexDef);
 
         cfm.apply(cfNew);
 
-        for (int i = 1; i < indexes.size(); i++)
+        for (int i = 1; i < cfm.allColumns().size(); i++)
             assert cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 1 })) != null;
         assert cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 0 })) == null;
         assert cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 5 })) != null;
     }
 
     @Test
-    public void testInvalidNames() throws IOException
+    public void testInvalidNames()
     {
         String[] valid = {"1", "a", "_1", "b_", "__", "1_a"};
         for (String s : valid)
@@ -107,8 +101,9 @@
             assert !CFMetaData.isNameValid(s);
     }
 
+    @Ignore
     @Test
-    public void saveAndRestore() throws IOException
+    public void saveAndRestore()
     {
         /*
         // verify dump and reload.
@@ -127,7 +122,7 @@
     }
 
     @Test
-    public void addNewCfToBogusKeyspace() throws InterruptedException
+    public void addNewCfToBogusKeyspace()
     {
         CFMetaData newCf = addTestCF("MadeUpKeyspace", "NewCF", "new cf");
         try
@@ -141,7 +136,7 @@
     }
 
     @Test
-    public void addNewCfWithNullComment() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void addNewCfWithNullComment() throws ConfigurationException
     {
         final String ks = "Keyspace1";
         final String cf = "BrandNewCfWithNull";
@@ -157,7 +152,7 @@
     }
 
     @Test
-    public void addNewCF() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void addNewCF() throws ConfigurationException
     {
         final String ks = "Keyspace1";
         final String cf = "BrandNewCf";
@@ -172,22 +167,23 @@
         assert Schema.instance.getKSMetaData(ks).cfMetaData().get(newCf.cfName).equals(newCf);
 
         // now read and write to it.
+        CellName col0 = cellname("col0");
         DecoratedKey dk = Util.dk("key0");
-        RowMutation rm = new RowMutation(ks, dk.key);
-        rm.add(cf, ByteBufferUtil.bytes("col0"), ByteBufferUtil.bytes("value0"), 1L);
+        Mutation rm = new Mutation(ks, dk.getKey());
+        rm.add(cf, col0, ByteBufferUtil.bytes("value0"), 1L);
         rm.apply();
         ColumnFamilyStore store = Keyspace.open(ks).getColumnFamilyStore(cf);
         assert store != null;
         store.forceBlockingFlush();
 
-        ColumnFamily cfam = store.getColumnFamily(QueryFilter.getNamesFilter(dk, cf, FBUtilities.singleton(ByteBufferUtil.bytes("col0"), store.getComparator()), System.currentTimeMillis()));
-        assert cfam.getColumn(ByteBufferUtil.bytes("col0")) != null;
-        Column col = cfam.getColumn(ByteBufferUtil.bytes("col0"));
+        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
+        assert cfam.getColumn(col0) != null;
+        Cell col = cfam.getColumn(col0);
         assert ByteBufferUtil.bytes("value0").equals(col.value());
     }
 
     @Test
-    public void dropCf() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void dropCf() throws ConfigurationException
     {
         DecoratedKey dk = Util.dk("dropCf");
         // sanity
@@ -197,9 +193,9 @@
         assert cfm != null;
 
         // write some data, force a flush, then verify that files exist on disk.
-        RowMutation rm = new RowMutation(ks.name, dk.key);
+        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, ByteBufferUtil.bytes(("col" + i)), ByteBufferUtil.bytes("anyvalue"), 1L);
+            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
         rm.apply();
         ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
         assert store != null;
@@ -211,11 +207,11 @@
         assert !Schema.instance.getKSMetaData(ks.name).cfMetaData().containsKey(cfm.cfName);
 
         // any write should fail.
-        rm = new RowMutation(ks.name, dk.key);
+        rm = new Mutation(ks.name, dk.getKey());
         boolean success = true;
         try
         {
-            rm.add("Standard1", ByteBufferUtil.bytes("col0"), ByteBufferUtil.bytes("value0"), 1L);
+            rm.add("Standard1", cellname("col0"), ByteBufferUtil.bytes("value0"), 1L);
             rm.apply();
         }
         catch (Throwable th)
@@ -233,7 +229,7 @@
     }
 
     @Test
-    public void addNewKS() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void addNewKS() throws ConfigurationException
     {
         DecoratedKey dk = Util.dk("key0");
         CFMetaData newCf = addTestCF("NewKeyspace1", "AddedStandard1", "A new cf for a new ks");
@@ -246,21 +242,22 @@
         assert Schema.instance.getKSMetaData(newCf.ksName).equals(newKs);
 
         // test reads and writes.
-        RowMutation rm = new RowMutation(newCf.ksName, dk.key);
-        rm.add(newCf.cfName, ByteBufferUtil.bytes("col0"), ByteBufferUtil.bytes("value0"), 1L);
+        CellName col0 = cellname("col0");
+        Mutation rm = new Mutation(newCf.ksName, dk.getKey());
+        rm.add(newCf.cfName, col0, ByteBufferUtil.bytes("value0"), 1L);
         rm.apply();
         ColumnFamilyStore store = Keyspace.open(newCf.ksName).getColumnFamilyStore(newCf.cfName);
         assert store != null;
         store.forceBlockingFlush();
 
-        ColumnFamily cfam = store.getColumnFamily(QueryFilter.getNamesFilter(dk, newCf.cfName, FBUtilities.singleton(ByteBufferUtil.bytes("col0"), store.getComparator()), System.currentTimeMillis()));
-        assert cfam.getColumn(ByteBufferUtil.bytes("col0")) != null;
-        Column col = cfam.getColumn(ByteBufferUtil.bytes("col0"));
+        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
+        assert cfam.getColumn(col0) != null;
+        Cell col = cfam.getColumn(col0);
         assert ByteBufferUtil.bytes("value0").equals(col.value());
     }
 
     @Test
-    public void dropKS() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void dropKS() throws ConfigurationException
     {
         DecoratedKey dk = Util.dk("dropKs");
         // sanity
@@ -270,9 +267,9 @@
         assert cfm != null;
 
         // write some data, force a flush, then verify that files exist on disk.
-        RowMutation rm = new RowMutation(ks.name, dk.key);
+        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, ByteBufferUtil.bytes(("col" + i)), ByteBufferUtil.bytes("anyvalue"), 1L);
+            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
         rm.apply();
         ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
         assert store != null;
@@ -284,11 +281,11 @@
         assert Schema.instance.getKSMetaData(ks.name) == null;
 
         // write should fail.
-        rm = new RowMutation(ks.name, dk.key);
+        rm = new Mutation(ks.name, dk.getKey());
         boolean success = true;
         try
         {
-            rm.add("Standard1", ByteBufferUtil.bytes("col0"), ByteBufferUtil.bytes("value0"), 1L);
+            rm.add("Standard1", cellname("col0"), ByteBufferUtil.bytes("value0"), 1L);
             rm.apply();
         }
         catch (Throwable th)
@@ -311,7 +308,7 @@
     }
 
     @Test
-    public void dropKSUnflushed() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void dropKSUnflushed() throws ConfigurationException
     {
         DecoratedKey dk = Util.dk("dropKs");
         // sanity
@@ -321,9 +318,9 @@
         assert cfm != null;
 
         // write some data
-        RowMutation rm = new RowMutation(ks.name, dk.key);
+        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, ByteBufferUtil.bytes(("col" + i)), ByteBufferUtil.bytes("anyvalue"), 1L);
+            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
         rm.apply();
 
         MigrationManager.announceKeyspaceDrop(ks.name);
@@ -332,7 +329,7 @@
     }
 
     @Test
-    public void createEmptyKsAddNewCf() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void createEmptyKsAddNewCf() throws ConfigurationException
     {
         assert Schema.instance.getKSMetaData("EmptyKeyspace") == null;
 
@@ -353,22 +350,23 @@
         assert Schema.instance.getKSMetaData(newKs.name).cfMetaData().get(newCf.cfName).equals(newCf);
 
         // now read and write to it.
+        CellName col0 = cellname("col0");
         DecoratedKey dk = Util.dk("key0");
-        RowMutation rm = new RowMutation(newKs.name, dk.key);
-        rm.add(newCf.cfName, ByteBufferUtil.bytes("col0"), ByteBufferUtil.bytes("value0"), 1L);
+        Mutation rm = new Mutation(newKs.name, dk.getKey());
+        rm.add(newCf.cfName, col0, ByteBufferUtil.bytes("value0"), 1L);
         rm.apply();
         ColumnFamilyStore store = Keyspace.open(newKs.name).getColumnFamilyStore(newCf.cfName);
         assert store != null;
         store.forceBlockingFlush();
 
-        ColumnFamily cfam = store.getColumnFamily(QueryFilter.getNamesFilter(dk, newCf.cfName, FBUtilities.singleton(ByteBufferUtil.bytes("col0"), store.getComparator()), System.currentTimeMillis()));
-        assert cfam.getColumn(ByteBufferUtil.bytes("col0")) != null;
-        Column col = cfam.getColumn(ByteBufferUtil.bytes("col0"));
+        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
+        assert cfam.getColumn(col0) != null;
+        Cell col = cfam.getColumn(col0);
         assert ByteBufferUtil.bytes("value0").equals(col.value());
     }
 
     @Test
-    public void testUpdateKeyspace() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void testUpdateKeyspace() throws ConfigurationException
     {
         // create a keyspace to serve as existing.
         CFMetaData cf = addTestCF("UpdatedKeyspace", "AddedStandard1", "A new cf for a new ks");
@@ -400,7 +398,7 @@
     }
 
     @Test
-    public void testUpdateColumnFamilyNoIndexes() throws ConfigurationException, IOException, ExecutionException, InterruptedException
+    public void testUpdateColumnFamilyNoIndexes() throws ConfigurationException
     {
         // create a keyspace with a cf to update.
         CFMetaData cf = addTestCF("UpdatedCfKs", "Standard1added", "A new cf that will be updated");
@@ -412,8 +410,7 @@
         assert Schema.instance.getCFMetaData(cf.ksName, cf.cfName) != null;
 
         // updating certain fields should fail.
-        CFMetaData newCfm = cf.clone();
-        newCfm.columnMetadata(new HashMap<ByteBuffer, ColumnDefinition>());
+        CFMetaData newCfm = cf.copy();
         newCfm.defaultValidator(BytesType.instance);
         newCfm.minCompactionThreshold(5);
         newCfm.maxCompactionThreshold(31);
@@ -446,7 +443,7 @@
         assert Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getDefaultValidator() == UTF8Type.instance;
 
         // Change cfId
-        newCfm = new CFMetaData(cf.ksName, cf.cfName, cf.cfType, cf.comparator, UUID.randomUUID());
+        newCfm = new CFMetaData(cf.ksName, cf.cfName, cf.cfType, cf.comparator);
         CFMetaData.copyOpts(newCfm, cf);
         try
         {
@@ -486,7 +483,7 @@
         catch (ConfigurationException expected) {}
 
         // Change comparator
-        newCfm = new CFMetaData(cf.ksName, cf.cfName, cf.cfType, TimeUUIDType.instance);
+        newCfm = new CFMetaData(cf.ksName, cf.cfName, cf.cfType, new SimpleDenseCellNameType(TimeUUIDType.instance));
         CFMetaData.copyOpts(newCfm, cf);
         try
         {
@@ -497,25 +494,25 @@
     }
 
     @Test
-    public void testDropIndex() throws IOException, ExecutionException, InterruptedException, ConfigurationException
+    public void testDropIndex() throws ConfigurationException
     {
         // persist keyspace definition in the system keyspace
         Schema.instance.getKSMetaData("Keyspace6").toSchema(System.currentTimeMillis()).apply();
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace6").getColumnFamilyStore("Indexed1");
 
         // insert some data.  save the sstable descriptor so we can make sure it's marked for delete after the drop
-        RowMutation rm = new RowMutation("Keyspace6", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        Mutation rm = new Mutation("Keyspace6", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", cellname("notbirthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
         rm.apply();
-        ColumnFamilyStore cfs = Keyspace.open("Keyspace6").getColumnFamilyStore("Indexed1");
         cfs.forceBlockingFlush();
         ColumnFamilyStore indexedCfs = cfs.indexManager.getIndexForColumn(ByteBufferUtil.bytes("birthdate")).getIndexCfs();
         Descriptor desc = indexedCfs.getSSTables().iterator().next().descriptor;
 
         // drop the index
-        CFMetaData meta = cfs.metadata.clone();
+        CFMetaData meta = cfs.metadata.copy();
         ColumnDefinition cdOld = meta.regularColumns().iterator().next();
-        ColumnDefinition cdNew = ColumnDefinition.regularDef(cdOld.name, cdOld.getValidator(), null);
+        ColumnDefinition cdNew = ColumnDefinition.regularDef(meta, cdOld.name.bytes, cdOld.type, null);
         meta.addOrReplaceColumnDefinition(cdNew);
         MigrationManager.announceColumnFamilyUpdate(meta, false);
 
@@ -527,7 +524,7 @@
 
     private CFMetaData addTestCF(String ks, String cf, String comment)
     {
-        CFMetaData newCFMD = new CFMetaData(ks, cf, ColumnFamilyType.Standard, UTF8Type.instance);
+        CFMetaData newCFMD = new CFMetaData(ks, cf, ColumnFamilyType.Standard, new SimpleDenseCellNameType(UTF8Type.instance));
         newCFMD.comment(comment)
                .readRepairChance(0.0);
 
diff --git a/test/unit/org/apache/cassandra/cql3/AlterTableTest.java b/test/unit/org/apache/cassandra/cql3/AlterTableTest.java
new file mode 100644
index 0000000..f5747ed
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/AlterTableTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+public class AlterTableTest extends CQLTester
+{
+    @Test
+    public void testAddList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text);");
+        execute("ALTER TABLE %s ADD myCollection list<text>;");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', ['first element']);");
+
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test", list("first element")));
+    }
+
+    @Test
+    public void testDropList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text, myCollection list<text>);");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', ['first element']);");
+        execute("ALTER TABLE %s DROP myCollection;");
+
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test"));
+    }
+    @Test
+    public void testAddMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text);");
+        execute("ALTER TABLE %s ADD myCollection map<text, text>;");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', { '1' : 'first element'});");
+
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test", map("1", "first element")));
+    }
+
+    @Test
+    public void testDropMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text, myCollection map<text, text>);");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', { '1' : 'first element'});");
+        execute("ALTER TABLE %s DROP myCollection;");
+
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test"));
+    }
+
+    @Test
+    public void testDropListAndAddListWithSameName() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text, myCollection list<text>);");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', ['first element']);");
+        execute("ALTER TABLE %s DROP myCollection;");
+        execute("ALTER TABLE %s ADD myCollection list<text>;");
+
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test", null));
+        execute("UPDATE %s set myCollection = ['second element'] WHERE id = 'test';");
+        assertRows(execute("SELECT * FROM %s;"), row("test", "first test", list("second element")));
+    }
+    @Test
+    public void testDropListAndAddMapWithSameName() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, content text, myCollection list<text>);");
+        execute("INSERT INTO %s (id, content , myCollection) VALUES ('test', 'first test', ['first element']);");
+        execute("ALTER TABLE %s DROP myCollection;");
+
+        assertInvalid("ALTER TABLE %s ADD myCollection map<int, int>;");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/BatchTests.java b/test/unit/org/apache/cassandra/cql3/BatchTests.java
new file mode 100644
index 0000000..27d407e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/BatchTests.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import com.datastax.driver.core.BatchStatement;
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+
+public class BatchTests
+{
+    private static EmbeddedCassandraService cassandra;
+
+    private static Cluster cluster;
+    private static Session session;
+
+
+    private static PreparedStatement counter;
+    private static PreparedStatement noncounter;
+
+    @BeforeClass()
+    public static void setup() throws ConfigurationException, IOException
+    {
+        cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build();
+        session = cluster.connect();
+
+        session.execute("drop keyspace if exists junit;");
+        session.execute("create keyspace junit WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };");
+        session.execute("CREATE TABLE junit.noncounter (\n" +
+                "  id int PRIMARY KEY,\n" +
+                "  val text\n" +
+                ");");
+        session.execute("CREATE TABLE junit.counter (\n" +
+                "  id int PRIMARY KEY,\n" +
+                "  val counter,\n" +
+                ");");
+
+
+        noncounter = session.prepare("insert into junit.noncounter(id, val)values(?,?)");
+        counter = session.prepare("update junit.counter set val = val + ? where id = ?");
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testMixedInCounterBatch()
+    {
+       sendBatch(BatchStatement.Type.COUNTER, true, true);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testMixedInLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.LOGGED, true, true);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testMixedInUnLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.UNLOGGED, true, true);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testNonCounterInCounterBatch()
+    {
+        sendBatch(BatchStatement.Type.COUNTER, false, true);
+    }
+
+    @Test
+    public void testNonCounterInLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.LOGGED, false, true);
+    }
+
+    @Test
+    public void testNonCounterInUnLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.UNLOGGED, false, true);
+    }
+
+    @Test
+    public void testCounterInCounterBatch()
+    {
+        sendBatch(BatchStatement.Type.COUNTER, true, false);
+    }
+
+    @Test
+    public void testCounterInUnLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.UNLOGGED, true, false);
+    }
+
+
+    @Test(expected = InvalidQueryException.class)
+    public void testCounterInLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.LOGGED, true, false);
+    }
+
+
+
+    public void sendBatch(BatchStatement.Type type, boolean addCounter, boolean addNonCounter)
+    {
+
+        assert addCounter || addNonCounter;
+        BatchStatement b = new BatchStatement(type);
+
+        for (int i = 0; i < 10; i++)
+        {
+            if (addNonCounter)
+                b.add(noncounter.bind(i, "foo"));
+
+            if (addCounter)
+                b.add(counter.bind((long)i, i));
+        }
+
+        session.execute(b);
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
new file mode 100644
index 0000000..6e4a5a9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -0,0 +1,678 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableSet;
+import org.junit.AfterClass;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Base class for CQL tests.
+ */
+public abstract class CQLTester
+{
+    protected static final Logger logger = LoggerFactory.getLogger(CQLTester.class);
+
+    public static final String KEYSPACE = "cql_test_keyspace";
+    private static final boolean USE_PREPARED_VALUES = Boolean.valueOf(System.getProperty("cassandra.test.use_prepared", "true"));
+    private static final AtomicInteger seqNumber = new AtomicInteger();
+
+    static
+    {
+        // Once per-JVM is enough
+        SchemaLoader.prepareServer();
+    }
+
+    private String currentTable;
+    private final Set<String> currentTypes = new HashSet<>();
+
+    @BeforeClass
+    public static void setUpClass() throws Throwable
+    {
+        schemaChange(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE));
+    }
+
+    @AfterClass
+    public static void tearDownClass()
+    {
+    }
+
+    @After
+    public void afterTest() throws Throwable
+    {
+        if (currentTable == null)
+            return;
+
+        final String tableToDrop = currentTable;
+        final Set<String> typesToDrop = currentTypes.isEmpty() ? Collections.emptySet() : new HashSet(currentTypes);
+        currentTable = null;
+        currentTypes.clear();
+
+        // We want to clean up after the test, but dropping a table is rather slow, so just do that asynchronously
+        StorageService.optionalTasks.execute(new Runnable()
+        {
+            public void run()
+            {
+                try
+                {
+                    schemaChange(String.format("DROP TABLE %s.%s", KEYSPACE, tableToDrop));
+
+                    for (String typeName : typesToDrop)
+                        schemaChange(String.format("DROP TYPE %s.%s", KEYSPACE, typeName));
+
+                    // Dropping doesn't delete the sstables. It's not a huge deal but it's cleaner to clean up after ourselves.
+                    // That said, we shouldn't delete blindly before the SSTableDeletingTasks for the table we drop
+                    // have run or they will be unhappy. Since those tasks are scheduled on StorageService.tasks and that's
+                    // mono-threaded, just push a task on the queue to find out when it's empty. Not perfect but good enough.
+
+                    final CountDownLatch latch = new CountDownLatch(1);
+                    StorageService.tasks.execute(new Runnable()
+                    {
+                            public void run()
+                            {
+                                latch.countDown();
+                            }
+                    });
+                    latch.await(2, TimeUnit.SECONDS);
+
+                    removeAllSSTables(KEYSPACE, tableToDrop);
+                }
+                catch (Exception e)
+                {
+                    throw new RuntimeException(e);
+                }
+            }
+        });
+    }
+
+    public void flush()
+    {
+        try
+        {
+            if (currentTable != null)
+                Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable).forceFlush().get();
+        }
+        catch (InterruptedException e)
+        {
+            throw new RuntimeException(e);
+        }
+        catch (ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public boolean usePrepared()
+    {
+        return USE_PREPARED_VALUES;
+    }
+
+    private static void removeAllSSTables(String ks, String table)
+    {
+        // clean up data directory which are stored as data directory/keyspace/data files
+        for (File d : Directories.getKSChildDirectories(ks))
+        {
+            if (d.exists() && d.getName().contains(table))
+                FileUtils.deleteRecursive(d);
+        }
+    }
+
+    protected String keyspace()
+    {
+        return KEYSPACE;
+    }
+
+    protected String currentTable()
+    {
+        return currentTable;
+    }
+
+    protected String createType(String query)
+    {
+        String typeName = "type_" + seqNumber.getAndIncrement();
+        String fullQuery = String.format(query, KEYSPACE + "." + typeName);
+        currentTypes.add(typeName);
+        logger.info(fullQuery);
+        schemaChange(fullQuery);
+        return typeName;
+    }
+
+    protected void createTable(String query)
+    {
+        currentTable = "table_" + seqNumber.getAndIncrement();
+        String fullQuery = String.format(query, KEYSPACE + "." + currentTable);
+        logger.info(fullQuery);
+        schemaChange(fullQuery);
+    }
+
+    protected void alterTable(String query)
+    {
+        String fullQuery = String.format(query, KEYSPACE + "." + currentTable);
+        logger.info(fullQuery);
+        schemaChange(fullQuery);
+    }
+
+    protected void createIndex(String query)
+    {
+        String fullQuery = String.format(query, KEYSPACE + "." + currentTable);
+        logger.info(fullQuery);
+        schemaChange(fullQuery);
+    }
+
+    private static void schemaChange(String query)
+    {
+        try
+        {
+            // executeOnceInternal don't work for schema changes
+            QueryProcessor.executeOnceInternal(query);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException("Error setting schema for test (query was: " + query + ")", e);
+        }
+    }
+
+    protected CFMetaData currentTableMetadata()
+    {
+        return Schema.instance.getCFMetaData(KEYSPACE, currentTable);
+    }
+
+    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    {
+        try
+        {
+            query = currentTable == null ? query : String.format(query, KEYSPACE + "." + currentTable);
+
+            UntypedResultSet rs;
+            if (USE_PREPARED_VALUES)
+            {
+                logger.info("Executing: {} with values {}", query, formatAllValues(values));
+                rs = QueryProcessor.executeOnceInternal(query, transformValues(values));
+            }
+            else
+            {
+                query = replaceValues(query, values);
+                logger.info("Executing: {}", query);
+                rs = QueryProcessor.executeOnceInternal(query);
+            }
+            if (rs != null)
+                logger.info("Got {} rows", rs.size());
+            return rs;
+        }
+        catch (RuntimeException e)
+        {
+            Throwable cause = e.getCause() != null ? e.getCause() : e;
+            logger.info("Got error: {}", cause.getMessage() == null ? cause.toString() : cause.getMessage());
+            throw cause;
+        }
+    }
+
+    protected void assertRows(UntypedResultSet result, Object[]... rows)
+    {
+        if (result == null)
+        {
+            if (rows.length > 0)
+                Assert.fail(String.format("No rows returned by query but %d expected", rows.length));
+            return;
+        }
+
+        List<ColumnSpecification> meta = result.metadata();
+        Iterator<UntypedResultSet.Row> iter = result.iterator();
+        int i = 0;
+        while (iter.hasNext() && i < rows.length)
+        {
+            Object[] expected = rows[i++];
+            UntypedResultSet.Row actual = iter.next();
+
+            Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), meta.size(), expected.length);
+
+            for (int j = 0; j < meta.size(); j++)
+            {
+                ColumnSpecification column = meta.get(j);
+                Object expectedValue = expected[j];
+                ByteBuffer expectedByteValue = makeByteBuffer(expected[j], (AbstractType)column.type);
+                ByteBuffer actualValue = actual.getBytes(column.name.toString());
+
+                if (!Objects.equal(expectedByteValue, actualValue))
+                    Assert.fail(String.format("Invalid value for row %d column %d (%s of type %s), expected <%s> but got <%s>",
+                                              i, j, column.name, column.type.asCQL3Type(), formatValue(expectedByteValue, column.type), formatValue(actualValue, column.type)));
+            }
+        }
+
+        if (iter.hasNext())
+        {
+            while (iter.hasNext())
+            {
+                iter.next();
+                i++;
+            }
+            Assert.fail(String.format("Got fewer rows than expected. Expected %d but got %d.", rows.length, i));
+        }
+
+        Assert.assertTrue(String.format("Got more rows than expected. Expected %d but got %d", rows.length, i), i == rows.length);
+    }
+
+    protected void assertAllRows(Object[]... rows) throws Throwable
+    {
+        assertRows(execute("SELECT * FROM %s"), rows);
+    }
+
+    protected Object[] row(Object... expected)
+    {
+        return expected;
+    }
+
+    protected void assertEmpty(UntypedResultSet result) throws Throwable
+    {
+        if (result != null && result.size() != 0)
+            throw new InvalidRequestException(String.format("Expected empty result but got %d rows", result.size()));
+    }
+
+    protected void assertInvalid(String query, Object... values) throws Throwable
+    {
+        try
+        {
+            execute(query, values);
+            String q = USE_PREPARED_VALUES
+                     ? query + " (values: " + formatAllValues(values) + ")"
+                     : replaceValues(query, values);
+            Assert.fail("Query should be invalid but no error was thrown. Query is: " + q);
+        }
+        catch (InvalidRequestException e)
+        {
+            // This is what we expect
+        }
+    }
+
+    protected void assertInvalidSyntax(String query, Object... values) throws Throwable
+    {
+        try
+        {
+            execute(query, values);
+            String q = USE_PREPARED_VALUES
+                     ? query + " (values: " + formatAllValues(values) + ")"
+                     : replaceValues(query, values);
+            Assert.fail("Query should have invalid syntax but no error was thrown. Query is: " + q);
+        }
+        catch (SyntaxException e)
+        {
+            // This is what we expect
+        }
+    }
+
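+    // Inlines the provided values into the query string in place of the '?' markers
+    // (used for the non-prepared execution path and for error messages).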
+    private static String replaceValues(String query, Object[] values)
+    {
+        StringBuilder sb = new StringBuilder();
+        int last = 0;
+        int i = 0;
+        int idx;
+        while ((idx = query.indexOf('?', last)) > 0)
+        {
+            if (i >= values.length)
+                throw new IllegalArgumentException(String.format("Not enough values provided. The query has at least %d variables but only %d values provided", i + 1, values.length));
+
+            sb.append(query.substring(last, idx));
+
+            Object value = values[i++];
+
+            // When we have a .. IN ? .., we use a list for the value because that's what's expected when the value is serialized.
+            // When we format it as a string, however, we need to special-case it to use parentheses. Hackish but convenient.
+            if (idx >= 3 && value instanceof List && query.substring(idx - 3, idx).equalsIgnoreCase("IN "))
+            {
+                List l = (List)value;
+                sb.append("(");
+                for (int j = 0; j < l.size(); j++)
+                {
+                    if (j > 0)
+                        sb.append(", ");
+                    sb.append(formatForCQL(l.get(j)));
+                }
+                sb.append(")");
+            }
+            else
+            {
+                sb.append(formatForCQL(value));
+            }
+            last = idx + 1;
+        }
+        sb.append(query.substring(last));
+        return sb.toString();
+    }
+
+    // We're really only returning ByteBuffers, but this makes the type system happy
+    private static Object[] transformValues(Object[] values)
+    {
+        // We could partly rely on QueryProcessor.executeOnceInternal doing type conversion for us, but
+        // it would complain with a ClassCastException if we pass, say, a string where an int is expected
+        // (since it bases conversion on what the value should be, not what it is). For testing, we sometimes
+        // want to pass a value of the wrong type and assert that this properly raises an InvalidRequestException,
+        // and executeOnceInternal would get in the way. So instead, we pre-convert everything to bytes here
+        // based on the value.
+        // Besides, we need to handle things like TupleValue that executeOnceInternal doesn't know about.
+
+        Object[] buffers = new ByteBuffer[values.length];
+        for (int i = 0; i < values.length; i++)
+        {
+            Object value = values[i];
+            if (value == null)
+            {
+                buffers[i] = null;
+                continue;
+            }
+
+            buffers[i] = typeFor(value).decompose(serializeTuples(value));
+        }
+        return buffers;
+    }
+
+    private static Object serializeTuples(Object value)
+    {
+        if (value instanceof TupleValue)
+        {
+            return ((TupleValue)value).toByteBuffer();
+        }
+
+        // We need to reach inside collections for TupleValue and transform them into ByteBuffers,
+        // since otherwise the decompose method of the collection AbstractType won't know what
+        // to do with them.
+        if (value instanceof List)
+        {
+            List l = (List)value;
+            List n = new ArrayList(l.size());
+            for (Object o : l)
+                n.add(serializeTuples(o));
+            return n;
+        }
+
+        if (value instanceof Set)
+        {
+            Set s = (Set)value;
+            Set n = new LinkedHashSet(s.size());
+            for (Object o : s)
+                n.add(serializeTuples(o));
+            return n;
+        }
+
+        if (value instanceof Map)
+        {
+            Map m = (Map)value;
+            Map n = new LinkedHashMap(m.size());
+            for (Object entry : m.entrySet())
+                n.put(serializeTuples(((Map.Entry)entry).getKey()), serializeTuples(((Map.Entry)entry).getValue()));
+            return n;
+        }
+        return value;
+    }
+
+    private static String formatAllValues(Object[] values)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("[");
+        for (int i = 0; i < values.length; i++)
+        {
+            if (i > 0)
+                sb.append(", ");
+            sb.append(formatForCQL(values[i]));
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+
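+    // Formats a single value as a CQL literal (quoting strings, prefixing blobs with 0x,
+    // recursing into collections and tuples) so it can be inlined into a query string.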
+    private static String formatForCQL(Object value)
+    {
+        if (value == null)
+            return "null";
+
+        if (value instanceof TupleValue)
+            return ((TupleValue)value).toCQLString();
+
+        // We need to reach inside collections for TupleValue. Besides, for some reason the format
+        // of collections that CollectionType.getString gives us is not at all 'CQL compatible'.
+        if (value instanceof Collection || value instanceof Map)
+        {
+            StringBuilder sb = new StringBuilder();
+            if (value instanceof List)
+            {
+                List l = (List)value;
+                sb.append("[");
+                for (int i = 0; i < l.size(); i++)
+                {
+                    if (i > 0)
+                        sb.append(", ");
+                    sb.append(formatForCQL(l.get(i)));
+                }
+                sb.append("]");
+            }
+            else if (value instanceof Set)
+            {
+                Set s = (Set)value;
+                sb.append("{");
+                Iterator iter = s.iterator();
+                while (iter.hasNext())
+                {
+                    sb.append(formatForCQL(iter.next()));
+                    if (iter.hasNext())
+                        sb.append(", ");
+                }
+                sb.append("}");
+            }
+            else
+            {
+                Map m = (Map)value;
+                sb.append("{");
+                Iterator iter = m.entrySet().iterator();
+                while (iter.hasNext())
+                {
+                    Map.Entry entry = (Map.Entry)iter.next();
+                    sb.append(formatForCQL(entry.getKey())).append(": ").append(formatForCQL(entry.getValue()));
+                    if (iter.hasNext())
+                        sb.append(", ");
+                }
+                sb.append("}");
+            }
+            return sb.toString();
+        }
+
+        AbstractType type = typeFor(value);
+        String s = type.getString(type.decompose(value));
+
+        if (type instanceof UTF8Type)
+            return String.format("'%s'", s.replaceAll("'", "''"));
+
+        if (type instanceof BytesType)
+            return "0x" + s;
+
+        return s;
+    }
+
+    private static ByteBuffer makeByteBuffer(Object value, AbstractType type)
+    {
+        if (value == null)
+            return null;
+
+        if (value instanceof TupleValue)
+            return ((TupleValue)value).toByteBuffer();
+
+        if (value instanceof ByteBuffer)
+            return (ByteBuffer)value;
+
+        return type.decompose(value);
+    }
+
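+    // Renders a serialized value in a readable form for assertion failure messages.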
+    private static String formatValue(ByteBuffer bb, AbstractType<?> type)
+    {
+        if (bb == null)
+            return "null";
+
+        if (type instanceof CollectionType)
+        {
+            // CollectionType overrides getString() to use hexToBytes. We can't change that
+            // without breaking SSTable2json, but the serializers for collections have the
+            // right getString, so we use them directly instead.
+            TypeSerializer ser = type.getSerializer();
+            return ser.toString(ser.deserialize(bb));
+        }
+
+        return type.getString(bb);
+    }
+
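+    // Convenience factories for building tuple and collection values in tests,
+    // e.g. map("k1", "v1", "k2", "v2") or set("a", "b"); they pair with typeFor() below.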
+    protected Object tuple(Object...values)
+    {
+        return new TupleValue(values);
+    }
+
+    protected Object list(Object...values)
+    {
+        return Arrays.asList(values);
+    }
+
+    protected Object set(Object...values)
+    {
+        return ImmutableSet.copyOf(values);
+    }
+
+    protected Object map(Object...values)
+    {
+        if (values.length % 2 != 0)
+            throw new IllegalArgumentException();
+
+        int size = values.length / 2;
+        Map m = new LinkedHashMap(size);
+        for (int i = 0; i < size; i++)
+            m.put(values[2 * i], values[(2 * i) + 1]);
+        return m;
+    }
+
+    // Attempts to find an AbstractType for a value (for serialization/printing purposes).
+    // Works as long as we use types we know of, which is good enough for testing.
+    private static AbstractType typeFor(Object value)
+    {
+        if (value instanceof ByteBuffer || value instanceof TupleValue || value == null)
+            return BytesType.instance;
+
+        if (value instanceof Integer)
+            return Int32Type.instance;
+
+        if (value instanceof Long)
+            return LongType.instance;
+
+        if (value instanceof Float)
+            return FloatType.instance;
+
+        if (value instanceof Double)
+            return DoubleType.instance;
+
+        if (value instanceof String)
+            return UTF8Type.instance;
+
+        if (value instanceof Boolean)
+            return BooleanType.instance;
+
+        if (value instanceof List)
+        {
+            List l = (List)value;
+            AbstractType elt = l.isEmpty() ? BytesType.instance : typeFor(l.get(0));
+            return ListType.getInstance(elt);
+        }
+
+        if (value instanceof Set)
+        {
+            Set s = (Set)value;
+            AbstractType elt = s.isEmpty() ? BytesType.instance : typeFor(s.iterator().next());
+            return SetType.getInstance(elt);
+        }
+
+        if (value instanceof Map)
+        {
+            Map m = (Map)value;
+            AbstractType keys, values;
+            if (m.isEmpty())
+            {
+                keys = BytesType.instance;
+                values = BytesType.instance;
+            }
+            else
+            {
+                Map.Entry entry = (Map.Entry)m.entrySet().iterator().next();
+                keys = typeFor(entry.getKey());
+                values = typeFor(entry.getValue());
+            }
+            return MapType.getInstance(keys, values);
+        }
+
+        throw new IllegalArgumentException("Unsupported value type (value is " + value + ")");
+    }
+
+    private static class TupleValue
+    {
+        private final Object[] values;
+
+        TupleValue(Object[] values)
+        {
+            this.values = values;
+        }
+
+        public ByteBuffer toByteBuffer()
+        {
+            ByteBuffer[] bbs = new ByteBuffer[values.length];
+            for (int i = 0; i < values.length; i++)
+                bbs[i] = makeByteBuffer(values[i], typeFor(values[i]));
+            return TupleType.buildValue(bbs);
+        }
+
+        public String toCQLString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append("(");
+            for (int i = 0; i < values.length; i++)
+            {
+                if (i > 0)
+                    sb.append(", ");
+                sb.append(formatForCQL(values[i]));
+            }
+            sb.append(")");
+            return sb.toString();
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/CollectionsTest.java b/test/unit/org/apache/cassandra/cql3/CollectionsTest.java
new file mode 100644
index 0000000..2380c38
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CollectionsTest.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class CollectionsTest extends CQLTester
+{
+    //@Test
+    //public void testMapBulkRemoval() throws Throwable
+    //{
+    //    createTable("CREATE TABLE %s (k int PRIMARY KEY, m map<text, text>)");
+
+    //    execute("INSERT INTO %s(k, m) VALUES (?, ?)", 0, map("k1", "v1", "k2", "v2", "k3", "v3"));
+
+    //    assertRows(execute("SELECT * FROM %s"),
+    //        row(0, map("k1", "v1", "k2", "v2", "k3", "v3"))
+    //    );
+
+    //    execute("UPDATE %s SET m = m - ? WHERE k = ?", set("k2"), 0);
+
+    //    assertRows(execute("SELECT * FROM %s"),
+    //        row(0, map("k1", "v1", "k3", "v3"))
+    //    );
+
+    //    execute("UPDATE %s SET m = m + ?, m = m - ? WHERE k = ?", map("k4", "v4"), set("k3"), 0);
+
+    //    assertRows(execute("SELECT * FROM %s"),
+    //        row(0, map("k1", "v1", "k4", "v4"))
+    //    );
+    //}
+
+    @Test
+    public void testInvalidCollectionsMix() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<text>, s set<text>, m map<text, text>)");
+
+        // Note: we force the non-prepared form for some of those tests because a list and a set
+        // have the same serialized format in practice and CQLTester doesn't validate the type
+        // of what's passed as a value in the prepared case, so the queries would work (which is ok,
+        // CQLTester is just a "dumb" client).
+
+        assertInvalid("UPDATE %s SET l = l + { 'a', 'b' } WHERE k = 0");
+        assertInvalid("UPDATE %s SET l = l - { 'a', 'b' } WHERE k = 0");
+        assertInvalid("UPDATE %s SET l = l + ? WHERE k = 0", map("a", "b", "c", "d"));
+        assertInvalid("UPDATE %s SET l = l - ? WHERE k = 0", map("a", "b", "c", "d"));
+
+        assertInvalid("UPDATE %s SET s = s + [ 'a', 'b' ] WHERE k = 0");
+        assertInvalid("UPDATE %s SET s = s - [ 'a', 'b' ] WHERE k = 0");
+        assertInvalid("UPDATE %s SET s = s + ? WHERE k = 0", map("a", "b", "c", "d"));
+        assertInvalid("UPDATE %s SET s = s - ? WHERE k = 0", map("a", "b", "c", "d"));
+
+        assertInvalid("UPDATE %s SET m = m + ? WHERE k = 0", list("a", "b"));
+        assertInvalid("UPDATE %s SET m = m - [ 'a', 'b' ] WHERE k = 0");
+        assertInvalid("UPDATE %s SET m = m + ? WHERE k = 0", set("a", "b"));
+        assertInvalid("UPDATE %s SET m = m - ? WHERE k = 0", map("a", "b", "c", "d"));
+    }
+
+    //@Test
+    //public void testSets() throws Throwable
+    //{
+    //    createTable("CREATE TABLE %s (k int PRIMARY KEY, s set<text>)");
+
+    //    execute("INSERT INTO %s(k, s) VALUES (0, ?)", set("v1", "v2", "v3", "v4"));
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row(set("v1", "v2", "v3", "v4"))
+    //    );
+
+    //    execute("DELETE s[?] FROM %s WHERE k = 0", "v1");
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row(set("v2", "v3", "v4"))
+    //    );
+
+    //    // Full overwrite
+    //    execute("UPDATE %s SET s = ? WHERE k = 0", set("v6", "v5"));
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row(set("v5", "v6"))
+    //    );
+
+    //    execute("UPDATE %s SET s = s + ? WHERE k = 0", set("v7"));
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row(set("v5", "v6", "v7"))
+    //    );
+
+    //    execute("UPDATE %s SET s = s - ? WHERE k = 0", set("v6", "v5"));
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row(set("v7"))
+    //    );
+
+    //    execute("DELETE s FROM %s WHERE k = 0");
+
+    //    assertRows(execute("SELECT s FROM %s WHERE k = 0"),
+    //        row((Object)null)
+    //    );
+    //}
+
+    //@Test
+    //public void testMaps() throws Throwable
+    //{
+    //    createTable("CREATE TABLE %s (k int PRIMARY KEY, m map<text, int>)");
+
+    //    execute("INSERT INTO %s(k, m) VALUES (0, ?)", map("v1", 1, "v2", 2));
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row(map("v1", 1, "v2", 2))
+    //    );
+
+    //    execute("UPDATE %s SET m[?] = ?, m[?] = ? WHERE k = 0", "v3", 3, "v4", 4);
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row(map("v1", 1, "v2", 2, "v3", 3, "v4", 4))
+    //    );
+
+    //    execute("DELETE m[?] FROM %s WHERE k = 0", "v1");
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row(map("v2", 2, "v3", 3, "v4", 4))
+    //    );
+
+    //    // Full overwrite
+    //    execute("UPDATE %s SET m = ? WHERE k = 0", map("v6", 6, "v5", 5));
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row(map("v5", 5, "v6", 6))
+    //    );
+
+    //    execute("UPDATE %s SET m = m + ? WHERE k = 0", map("v7", 7));
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row(map("v5", 5, "v6", 6, "v7", 7))
+    //    );
+
+    //    // The empty map is parsed as an empty set (because we don't have enough info at parsing
+    //    // time when we see a {}) and special-cased later. This test checks that this works properly.
+    //    execute("UPDATE %s SET m = {} WHERE k = 0");
+
+    //    assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+    //        row((Object)null)
+    //    );
+    //}
+
+    //@Test
+    //public void testLists() throws Throwable
+    //{
+    //    createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<text>)");
+
+    //    execute("INSERT INTO %s(k, l) VALUES (0, ?)", list("v1", "v2", "v3"));
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v1", "v2", "v3"))
+    //    );
+
+    //    execute("DELETE l[?] FROM %s WHERE k = 0", 1);
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v1", "v3"))
+    //    );
+
+    //    execute("UPDATE %s SET l[?] = ? WHERE k = 0", 1, "v4");
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v1", "v4"))
+    //    );
+
+    //    // Full overwrite
+    //    execute("UPDATE %s SET l = ? WHERE k = 0", list("v6", "v5"));
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v6", "v5"))
+    //    );
+
+    //    execute("UPDATE %s SET l = l + ? WHERE k = 0", list("v7", "v8"));
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v6", "v5", "v7", "v8"))
+    //    );
+
+    //    execute("UPDATE %s SET l = ? + l WHERE k = 0", list("v9"));
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v9", "v6", "v5", "v7", "v8"))
+    //    );
+
+    //    execute("UPDATE %s SET l = l - ? WHERE k = 0", list("v5", "v8"));
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row(list("v9", "v6", "v7"))
+    //    );
+
+    //    execute("DELETE l FROM %s WHERE k = 0");
+
+    //    assertRows(execute("SELECT l FROM %s WHERE k = 0"),
+    //        row((Object)null)
+    //    );
+    //}
+}
diff --git a/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java b/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java
new file mode 100644
index 0000000..2071a33
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java
@@ -0,0 +1,585 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.BufferCell;
+import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.serializers.Int32Serializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.fail;
+
+public class ColumnConditionTest
+{
+    public static final ByteBuffer ZERO = Int32Type.instance.fromString("0");
+    public static final ByteBuffer ONE = Int32Type.instance.fromString("1");
+    public static final ByteBuffer TWO = Int32Type.instance.fromString("2");
+
+    public static final ByteBuffer A = AsciiType.instance.fromString("a");
+    public static final ByteBuffer B = AsciiType.instance.fromString("b");
+
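+    // Wraps the column value (when non-null) in a single Cell and evaluates the bound
+    // condition against it using the Int32 comparator.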
+    private static boolean isSatisfiedBy(ColumnCondition.Bound bound, ByteBuffer conditionValue, ByteBuffer columnValue) throws InvalidRequestException
+    {
+        Cell cell = null;
+        if (columnValue != null)
+        {
+            CompoundSparseCellNameType nameType = new CompoundSparseCellNameType(Collections.EMPTY_LIST);
+            ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), Int32Type.instance, null, null, null, null, null);
+            cell = new BufferCell(nameType.create(Composites.EMPTY, definition), columnValue);
+        }
+        return bound.isSatisfiedByValue(conditionValue, cell, Int32Type.instance, bound.operator, 1234);
+    }
+
+    private static void assertThrowsIRE(ColumnCondition.Bound bound, ByteBuffer conditionValue, ByteBuffer columnValue)
+    {
+        try
+        {
+            isSatisfiedBy(bound, conditionValue, columnValue);
+            fail("Expected InvalidRequestException was not thrown");
+        } catch (InvalidRequestException e) { }
+    }
+
+    @Test
+    public void testSimpleBoundIsSatisfiedByValue() throws InvalidRequestException
+    {
+        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), Int32Type.instance, null, null, null, null, null);
+
+        // EQ
+        ColumnCondition condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.EQ);
+        ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT);
+        assertTrue(isSatisfiedBy(bound, ONE, ONE));
+        assertFalse(isSatisfiedBy(bound, ZERO, ONE));
+        assertFalse(isSatisfiedBy(bound, TWO, ONE));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertFalse(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertTrue(isSatisfiedBy(bound, null, null));
+        assertFalse(isSatisfiedBy(bound, ONE, null));
+        assertFalse(isSatisfiedBy(bound, null, ONE));
+
+        // NEQ
+        condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.NEQ);
+        bound = condition.bind(QueryOptions.DEFAULT);
+        assertFalse(isSatisfiedBy(bound, ONE, ONE));
+        assertTrue(isSatisfiedBy(bound, ZERO, ONE));
+        assertTrue(isSatisfiedBy(bound, TWO, ONE));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertTrue(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertFalse(isSatisfiedBy(bound, null, null));
+        assertTrue(isSatisfiedBy(bound, ONE, null));
+        assertTrue(isSatisfiedBy(bound, null, ONE));
+
+        // LT
+        condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.LT);
+        bound = condition.bind(QueryOptions.DEFAULT);
+        assertFalse(isSatisfiedBy(bound, ONE, ONE));
+        assertFalse(isSatisfiedBy(bound, ZERO, ONE));
+        assertTrue(isSatisfiedBy(bound, TWO, ONE));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertTrue(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertThrowsIRE(bound, null, ONE);
+        assertFalse(isSatisfiedBy(bound, ONE, null));
+
+        // LTE
+        condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.LTE);
+        bound = condition.bind(QueryOptions.DEFAULT);
+        assertTrue(isSatisfiedBy(bound, ONE, ONE));
+        assertFalse(isSatisfiedBy(bound, ZERO, ONE));
+        assertTrue(isSatisfiedBy(bound, TWO, ONE));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertTrue(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertThrowsIRE(bound, null, ONE);
+        assertFalse(isSatisfiedBy(bound, ONE, null));
+
+        // GT
+        condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.GT);
+        bound = condition.bind(QueryOptions.DEFAULT);
+        assertFalse(isSatisfiedBy(bound, ONE, ONE));
+        assertTrue(isSatisfiedBy(bound, ZERO, ONE));
+        assertFalse(isSatisfiedBy(bound, TWO, ONE));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertFalse(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertThrowsIRE(bound, null, ONE);
+        assertFalse(isSatisfiedBy(bound, ONE, null));
+
+        // GTE
+        condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Relation.Type.GTE);
+        bound = condition.bind(QueryOptions.DEFAULT);
+        assertTrue(isSatisfiedBy(bound, ONE, ONE));
+        assertTrue(isSatisfiedBy(bound, ZERO, ONE));
+        assertFalse(isSatisfiedBy(bound, TWO, ONE));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE));
+        assertFalse(isSatisfiedBy(bound, ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertThrowsIRE(bound, null, ONE);
+        assertFalse(isSatisfiedBy(bound, ONE, null));
+    }
+
+    private static List<ByteBuffer> list(ByteBuffer... values)
+    {
+        return Arrays.asList(values);
+    }
+
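+    // Builds list-style cells (indexed by position) from the column values and checks
+    // whether the bound condition applies to them.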
+    private static boolean listAppliesTo(ColumnCondition.CollectionBound bound, List<ByteBuffer> conditionValues, List<ByteBuffer> columnValues)
+    {
+        CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b int, c list<int>)", "ks");
+        Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
+        typeMap.put(ByteBufferUtil.bytes("c"), ListType.getInstance(Int32Type.instance));
+        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
+        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("c"), ListType.getInstance(Int32Type.instance), 0, ColumnDefinition.Kind.REGULAR);
+
+        List<Cell> cells = new ArrayList<>(columnValues == null ? 0 : columnValues.size());
+        if (columnValues != null)
+        {
+            for (int i = 0; i < columnValues.size(); i++)
+            {
+                ByteBuffer key = Int32Serializer.instance.serialize(i);
+                ByteBuffer value = columnValues.get(i);
+                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, key), value));
+            }
+        }
+
+        return bound.listAppliesTo(ListType.getInstance(Int32Type.instance), cells == null ? null : cells.iterator(), conditionValues, bound.operator);
+    }
+
+    @Test
+    // sets use the same check as lists
+    public void testListCollectionBoundAppliesTo() throws InvalidRequestException
+    {
+        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), ListType.getInstance(Int32Type.instance), null, null, null, null, null);
+
+        // EQ
+        ColumnCondition condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.EQ);
+        ColumnCondition.CollectionBound bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list()));
+        assertFalse(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list()));
+
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // NEQ
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.NEQ);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list()));
+        assertTrue(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list()));
+
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LT
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.LT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list()));
+        assertFalse(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list()));
+
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LTE
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.LTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list()));
+        assertFalse(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list()));
+
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GT
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.GT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(), list()));
+        assertTrue(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list()));
+
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GTE
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.GTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list()));
+        assertTrue(listAppliesTo(bound, list(ZERO), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ZERO)));
+        assertTrue(listAppliesTo(bound, list(ONE), list(ONE, ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE, ONE), list(ONE)));
+        assertTrue(listAppliesTo(bound, list(), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list()));
+
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(listAppliesTo(bound, list(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(listAppliesTo(bound, list(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+    }
+
+    private static Set<ByteBuffer> set(ByteBuffer... values)
+    {
+        Set<ByteBuffer> results = new HashSet<>(values.length);
+        results.addAll(Arrays.asList(values));
+        return results;
+    }
+
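+    // Builds set-style cells (element as cell name, empty value) from the column values
+    // and checks whether the bound condition applies to them.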
+    private static boolean setAppliesTo(ColumnCondition.CollectionBound bound, Set<ByteBuffer> conditionValues, List<ByteBuffer> columnValues)
+    {
+        CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b int, c set<int>)", "ks");
+        Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
+        typeMap.put(ByteBufferUtil.bytes("c"), SetType.getInstance(Int32Type.instance));
+        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
+        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("c"), SetType.getInstance(Int32Type.instance), 0, ColumnDefinition.Kind.REGULAR);
+
+        List<Cell> cells = new ArrayList<>(columnValues == null ? 0 : columnValues.size());
+        if (columnValues != null)
+        {
+            for (int i = 0; i < columnValues.size(); i++)
+            {
+                ByteBuffer key = columnValues.get(i);
+                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, key), ByteBufferUtil.EMPTY_BYTE_BUFFER));
+            }
+        }
+
+        return bound.setAppliesTo(SetType.getInstance(Int32Type.instance), cells == null ? null : cells.iterator(), conditionValues, bound.operator);
+    }
+
+    @Test
+    public void testSetCollectionBoundAppliesTo() throws InvalidRequestException
+    {
+        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), SetType.getInstance(Int32Type.instance), null, null, null, null, null);
+
+        // EQ
+        ColumnCondition condition = ColumnCondition.condition(definition, null, new Sets.Value(set(ONE)), Relation.Type.EQ);
+        ColumnCondition.CollectionBound bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list()));
+        assertFalse(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertFalse(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list()));
+
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // NEQ
+        condition = ColumnCondition.condition(definition, null, new Sets.Value(set(ONE)), Relation.Type.NEQ);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list()));
+        assertTrue(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertTrue(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list()));
+
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LT
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.LT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list()));
+        assertFalse(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertTrue(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list()));
+
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LTE
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.LTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list()));
+        assertFalse(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertTrue(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list()));
+
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GT
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.GT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertFalse(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(), list()));
+        assertTrue(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertFalse(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list()));
+
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GTE
+        condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Relation.Type.GTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list()));
+        assertTrue(setAppliesTo(bound, set(ZERO), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ZERO)));
+        assertTrue(setAppliesTo(bound, set(ONE), list(ONE, TWO)));
+        assertFalse(setAppliesTo(bound, set(ONE, TWO), list(ONE)));
+        assertTrue(setAppliesTo(bound, set(), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list()));
+
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ONE)));
+        assertFalse(setAppliesTo(bound, set(ONE), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(setAppliesTo(bound, set(ByteBufferUtil.EMPTY_BYTE_BUFFER), list(ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+    }
+
+    // values should be a list of key, value, key, value, ...
+    private static Map<ByteBuffer, ByteBuffer> map(ByteBuffer... values)
+    {
+        Map<ByteBuffer, ByteBuffer> map = new TreeMap<>();
+        for (int i = 0; i < values.length; i += 2)
+            map.put(values[i], values[i + 1]);
+
+        return map;
+    }
+
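+    // Builds map-style cells (entry key as cell name, entry value as cell value) from the
+    // column values and checks whether the bound condition applies to them.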
+    private static boolean mapAppliesTo(ColumnCondition.CollectionBound bound, Map<ByteBuffer, ByteBuffer> conditionValues, Map<ByteBuffer, ByteBuffer> columnValues)
+    {
+        CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b map<int, int>)", "ks");
+        Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
+        typeMap.put(ByteBufferUtil.bytes("b"), MapType.getInstance(Int32Type.instance, Int32Type.instance));
+        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
+        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("b"), MapType.getInstance(Int32Type.instance, Int32Type.instance), 0, ColumnDefinition.Kind.REGULAR);
+
+        List<Cell> cells = new ArrayList<>(columnValues == null ? 0 : columnValues.size());
+        if (columnValues != null)
+        {
+            for (Map.Entry<ByteBuffer, ByteBuffer> entry : columnValues.entrySet())
+                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, entry.getKey()), entry.getValue()));
+        }
+
+        return bound.mapAppliesTo(MapType.getInstance(Int32Type.instance, Int32Type.instance), cells.iterator(), conditionValues, bound.operator);
+    }
+
+    @Test
+    public void testMapCollectionBoundIsSatisfiedByValue() throws InvalidRequestException
+    {
+        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("b", true), MapType.getInstance(Int32Type.instance, Int32Type.instance), null, null, null, null, null);
+
+        Map<ByteBuffer, ByteBuffer> placeholderMap = new TreeMap<>();
+        placeholderMap.put(ONE, ONE);
+        Maps.Value placeholder = new Maps.Value(placeholderMap);
+
+        // EQ
+        ColumnCondition condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.EQ);
+        ColumnCondition.CollectionBound bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map()));
+        assertFalse(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // NEQ
+        condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.NEQ);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map()));
+        assertTrue(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LT
+        condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.LT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map()));
+        assertFalse(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // LTE
+        condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.LTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map()));
+        assertFalse(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GT
+        condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.GT);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(), map()));
+        assertTrue(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertFalse(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+
+        // GTE
+        condition = ColumnCondition.condition(definition, null, placeholder, Relation.Type.GTE);
+        bound = (ColumnCondition.CollectionBound) condition.bind(QueryOptions.DEFAULT);
+
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map()));
+        assertTrue(mapAppliesTo(bound, map(ZERO, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ZERO, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ZERO), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ZERO)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ONE, TWO, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE, TWO, ONE), map(ONE, ONE)));
+        assertTrue(mapAppliesTo(bound, map(), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map()));
+
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ONE)));
+        assertFalse(mapAppliesTo(bound, map(ONE, ONE), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+        assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
+        assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/cql3/ContainsRelationTest.java b/test/unit/org/apache/cassandra/cql3/ContainsRelationTest.java
new file mode 100644
index 0000000..605f3ed
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ContainsRelationTest.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class ContainsRelationTest extends CQLTester
+{
+    @Test
+    public void testSetContains() throws Throwable
+    {
+        createTable("CREATE TABLE %s (account text, id int, categories set<text>, PRIMARY KEY (account, id))");
+        createIndex("CREATE INDEX ON %s(categories)");
+
+        execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, set("lmn"));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "xyz", "lmn"));
+
+        assertRows(execute("SELECT * FROM %s WHERE categories CONTAINS ?", "lmn"),
+            row("test", 5, set("lmn"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "lmn"),
+            row("test", 5, set("lmn"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ?", "test", 5, "lmn"),
+                   row("test", 5, set("lmn"))
+        );
+    }
+
+    @Test
+    public void testListContains() throws Throwable
+    {
+        createTable("CREATE TABLE %s (account text, id int, categories list<text>, PRIMARY KEY (account, id))");
+        createIndex("CREATE INDEX ON %s(categories)");
+
+        execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, list("lmn"));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "xyz", "lmn"));
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?;", "test", "lmn"),
+            row("test", 5, list("lmn"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE categories CONTAINS ?", "lmn"),
+            row("test", 5, list("lmn"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ?;", "test", 5, "lmn"),
+                   row("test", 5, list("lmn"))
+        );
+    }
+
+    @Test
+    public void testMapKeyContains() throws Throwable
+    {
+        createTable("CREATE TABLE %s (account text, id int, categories map<text,text>, PRIMARY KEY (account, id))");
+        createIndex("CREATE INDEX ON %s(keys(categories))");
+
+        execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, map("lmn", "foo"));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "xyz", "lmn"));
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "test", "lmn"),
+            row("test", 5, map("lmn", "foo"))
+        );
+        assertRows(execute("SELECT * FROM %s WHERE categories CONTAINS KEY ?", "lmn"),
+            row("test", 5, map("lmn", "foo"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS KEY ?", "test", 5, "lmn"),
+                   row("test", 5, map("lmn", "foo"))
+        );
+    }
+
+    @Test
+    public void testMapValueContains() throws Throwable
+    {
+        createTable("CREATE TABLE %s (account text, id int, categories map<text,text>, PRIMARY KEY (account, id))");
+        createIndex("CREATE INDEX ON %s(categories)");
+
+        execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, map("lmn", "foo"));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "xyz", "foo"));
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "foo"),
+            row("test", 5, map("lmn", "foo"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE categories CONTAINS ?", "foo"),
+            row("test", 5, map("lmn", "foo"))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ?", "test", 5, "foo"),
+                   row("test", 5, map("lmn", "foo"))
+        );
+    }
+
+    // See CASSANDRA-7525
+    @Test
+    public void testQueryMultipleIndexTypes() throws Throwable
+    {
+        createTable("CREATE TABLE %s (account text, id int, categories map<text,text>, PRIMARY KEY (account, id))");
+
+        // create a regular index on 'id' and a values index on the 'categories' map
+        createIndex("CREATE INDEX id_index ON %s(id)");
+        createIndex("CREATE INDEX categories_values_index ON %s(categories)");
+
+        execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, map("lmn", "foo"));
+
+        assertRows(execute("SELECT * FROM %s WHERE categories CONTAINS ? AND id = ? ALLOW FILTERING", "foo", 5),
+                row("test", 5, map("lmn", "foo"))
+        );
+
+        assertRows(
+            execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ? AND id = ? ALLOW FILTERING", "test", "foo", 5),
+            row("test", 5, map("lmn", "foo"))
+        );
+    }
+
+    // See CASSANDRA-8033
+    @Test
+    public void testFilterForContains() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, v set<int>, PRIMARY KEY ((k1, k2)))");
+        createIndex("CREATE INDEX ON %s(k2)");
+
+        execute("INSERT INTO %s (k1, k2, v) VALUES (?, ?, ?)", 0, 0, set(1, 2, 3));
+        execute("INSERT INTO %s (k1, k2, v) VALUES (?, ?, ?)", 0, 1, set(2, 3, 4));
+        execute("INSERT INTO %s (k1, k2, v) VALUES (?, ?, ?)", 1, 0, set(3, 4, 5));
+        execute("INSERT INTO %s (k1, k2, v) VALUES (?, ?, ?)", 1, 1, set(4, 5, 6));
+
+        assertRows(execute("SELECT * FROM %s WHERE k2 = ?", 1),
+            row(0, 1, set(2, 3, 4)),
+            row(1, 1, set(4, 5, 6))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k2 = ? AND v CONTAINS ? ALLOW FILTERING", 1, 6),
+            row(1, 1, set(4, 5, 6))
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE k2 = ? AND v CONTAINS ? ALLOW FILTERING", 1, 7));
+    }
+
+    // See CASSANDRA-8073
+    @Test
+    public void testIndexLookupWithClusteringPrefix() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d set<int>, PRIMARY KEY (a, b, c))");
+        createIndex("CREATE INDEX ON %s(d)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, set(1, 2, 3));
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, set(3, 4, 5));
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, set(1, 2, 3));
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, set(3, 4, 5));
+
+        assertRows(execute("SELECT * FROM %s WHERE a=? AND b=? AND d CONTAINS ?", 0, 1, 3),
+            row(0, 1, 0, set(1, 2, 3)),
+            row(0, 1, 1, set(3, 4, 5))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a=? AND b=? AND d CONTAINS ?", 0, 1, 2),
+            row(0, 1, 0, set(1, 2, 3))
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a=? AND b=? AND d CONTAINS ?", 0, 1, 5),
+            row(0, 1, 1, set(3, 4, 5))
+        );
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/CqlParserTest.java b/test/unit/org/apache/cassandra/cql3/CqlParserTest.java
new file mode 100644
index 0000000..84509e8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CqlParserTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+import org.antlr.runtime.ANTLRStringStream;
+import org.antlr.runtime.BaseRecognizer;
+import org.antlr.runtime.CharStream;
+import org.antlr.runtime.CommonTokenStream;
+import org.antlr.runtime.RecognitionException;
+import org.antlr.runtime.TokenStream;
+
+import static org.junit.Assert.*;
+
+public class CqlParserTest
+{
+    @Test
+    public void testAddErrorListener() throws Exception
+    {
+        SyntaxErrorCounter firstCounter = new SyntaxErrorCounter();
+        SyntaxErrorCounter secondCounter = new SyntaxErrorCounter();
+
+        CharStream stream = new ANTLRStringStream("SELECT * FORM test;");
+        CqlLexer lexer = new CqlLexer(stream);
+
+        TokenStream tokenStream = new CommonTokenStream(lexer);
+        CqlParser parser = new CqlParser(tokenStream);
+        parser.addErrorListener(firstCounter);
+        parser.addErrorListener(secondCounter);
+
+        parser.query();
+
+        // ANTLR 3.5 reports 2 errors for the query above (missing FROM and missing EOF).
+        assertTrue(firstCounter.count > 0);
+        assertTrue(secondCounter.count > 0);
+    }
+
+    @Test
+    public void testRemoveErrorListener() throws Exception
+    {
+        SyntaxErrorCounter firstCounter = new SyntaxErrorCounter();
+        SyntaxErrorCounter secondCounter = new SyntaxErrorCounter();
+
+        CharStream stream = new ANTLRStringStream("SELECT * FORM test;");
+        CqlLexer lexer = new CqlLexer(stream);
+
+        TokenStream tokenStream = new CommonTokenStream(lexer);
+        CqlParser parser = new CqlParser(tokenStream);
+        parser.addErrorListener(firstCounter);
+        parser.addErrorListener(secondCounter);
+        parser.removeErrorListener(secondCounter);
+
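+        // Since the second listener was removed, only the first counter should record the syntax errors below.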
+        parser.query();
+
+        assertTrue(firstCounter.count > 0);
+        assertEquals(0, secondCounter.count);
+    }
+
+    private static final class SyntaxErrorCounter implements ErrorListener
+    {
+        private int count;
+
+        @Override
+        public void syntaxError(BaseRecognizer recognizer, String[] tokenNames, RecognitionException e)
+        {
+            count++;
+        }
+
+        @Override
+        public void syntaxError(BaseRecognizer recognizer, String errorMsg)
+        {
+            count++;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/CrcCheckChanceTest.java b/test/unit/org/apache/cassandra/cql3/CrcCheckChanceTest.java
new file mode 100644
index 0000000..b9d23cd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CrcCheckChanceTest.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import junit.framework.Assert;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.junit.Test;
+
+
+public class CrcCheckChanceTest extends CQLTester
+{
+    @Test
+    public void testChangingCrcCheckChance() throws Throwable
+    {
+        //Start with crc_check_chance of 99%
+        createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance' : 0.99}");
+
+        execute("CREATE INDEX foo ON %s(v)");
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+
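+        // Grab the base table's ColumnFamilyStore and the store backing the secondary index created above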
+        ColumnFamilyStore cfs = Keyspace.open(CQLTester.KEYSPACE).getColumnFamilyStore(currentTable());
+        ColumnFamilyStore indexCfs = cfs.indexManager.getIndexesBackedByCfs().iterator().next();
+        cfs.forceBlockingFlush();
+
+        Assert.assertEquals(0.99, cfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals(0.99, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals(0.99, indexCfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals(0.99, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+
+
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
+                row("p1", "k1", "sv1", "v1"),
+                row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE v=?", "v1"),
+                row("p1", "k1", "sv1", "v1")
+        );
+
+
+
+        // Write a few SSTables, then compact
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+        cfs.forceBlockingFlush();
+
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+        cfs.forceBlockingFlush();
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+        cfs.forceBlockingFlush();
+
+        cfs.forceMajorCompaction();
+
+        // Verify that when we alter the value, the live sstable readers hold the new one
+        alterTable("ALTER TABLE %s WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance': 0.01}");
+
+        Assert.assertEquals( 0.01, cfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals( 0.01, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals( 0.01, indexCfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals( 0.01, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
+                row("p1", "k1", "sv1", "v1"),
+                row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE v=?", "v1"),
+                row("p1", "k1", "sv1", "v1")
+        );
+
+
+        //Verify the call used by JMX still works
+        cfs.setCrcCheckChance(0.03);
+        Assert.assertEquals( 0.03, cfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals( 0.03, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals( 0.03, indexCfs.metadata.compressionParameters.getCrcCheckChance());
+        Assert.assertEquals( 0.03, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+
+    }
+}
+
diff --git a/src/java/org/apache/cassandra/tools/NodeToolHelp.java b/test/unit/org/apache/cassandra/cql3/CreateTableTest.java
similarity index 70%
rename from src/java/org/apache/cassandra/tools/NodeToolHelp.java
rename to test/unit/org/apache/cassandra/cql3/CreateTableTest.java
index c89e48c..14d2c2b 100644
--- a/src/java/org/apache/cassandra/tools/NodeToolHelp.java
+++ b/test/unit/org/apache/cassandra/cql3/CreateTableTest.java
@@ -15,22 +15,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.tools;
+package org.apache.cassandra.cql3;
 
-import java.util.List;
+import org.junit.Test;
 
-public class NodeToolHelp
+import static junit.framework.Assert.assertFalse;
+
+public class CreateTableTest extends CQLTester
 {
-    public List<NodeToolCommand> commands;
-
-    public static class NodeToolCommand
+    @Test
+    public void testCQL3PartitionKeyOnlyTable()
     {
-        public String name;
-        public String help;
-
-        public String toString()
-        {
-            return name;
-        }
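+        // A table with only a CQL3 partition key should not be reported as Thrift compatible.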
+        createTable("CREATE TABLE %s (id text PRIMARY KEY);");
+        assertFalse(currentTableMetadata().isThriftCompatible());
     }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/CreateTriggerStatementTest.java b/test/unit/org/apache/cassandra/cql3/CreateTriggerStatementTest.java
new file mode 100644
index 0000000..6557c16
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CreateTriggerStatementTest.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.TriggerDefinition;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.triggers.ITrigger;
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class CreateTriggerStatementTest extends CQLTester
+{
+    @Test
+    public void testCreateTrigger() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
+        execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+        execute("CREATE TRIGGER trigger_2 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_2", TestTrigger.class);
+        assertInvalid("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        execute("CREATE TRIGGER \"Trigger 3\" ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("Trigger 3", TestTrigger.class);
+    }
+
+    @Test
+    public void testCreateTriggerIfNotExists() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+
+        execute("CREATE TRIGGER IF NOT EXISTS trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+
+        execute("CREATE TRIGGER IF NOT EXISTS trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+    }
+
+    @Test
+    public void testDropTrigger() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
+
+        execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+
+        execute("DROP TRIGGER trigger_1 ON %s");
+        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+
+        execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+
+        assertInvalid("DROP TRIGGER trigger_2 ON %s");
+
+        execute("CREATE TRIGGER \"Trigger 3\" ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("Trigger 3", TestTrigger.class);
+
+        execute("DROP TRIGGER \"Trigger 3\" ON %s");
+        assertTriggerDoesNotExists("Trigger 3", TestTrigger.class);
+    }
+
+    @Test
+    public void testDropTriggerIfExists() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
+
+        execute("DROP TRIGGER IF EXISTS trigger_1 ON %s");
+        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+
+        execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
+        assertTriggerExists("trigger_1", TestTrigger.class);
+
+        execute("DROP TRIGGER IF EXISTS trigger_1 ON %s");
+        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+    }
+
+    private void assertTriggerExists(String name, Class<?> clazz)
+    {
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), currentTable()).copy();
+        assertTrue("the trigger does not exist", cfm.containsTriggerDefinition(TriggerDefinition.create(name,
+                clazz.getName())));
+    }
+
+    private void assertTriggerDoesNotExists(String name, Class<?> clazz)
+    {
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), currentTable()).copy();
+        assertFalse("the trigger exists", cfm.containsTriggerDefinition(TriggerDefinition.create(name,
+                clazz.getName())));
+    }
+
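+    // Minimal no-op trigger used by the tests above; it contributes no additional mutations.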
+    public static class TestTrigger implements ITrigger
+    {
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        {
+            return Collections.emptyList();
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/DeleteTest.java b/test/unit/org/apache/cassandra/cql3/DeleteTest.java
new file mode 100644
index 0000000..c8aa660
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/DeleteTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.ResultSetFuture;
+import com.datastax.driver.core.Session;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+import org.junit.Assert;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class DeleteTest extends SchemaLoader
+{
+    private static EmbeddedCassandraService cassandra;
+
+    private static Cluster cluster;
+    private static Session session;
+    private static PreparedStatement pstmtI;
+    private static PreparedStatement pstmtU;
+    private static PreparedStatement pstmtD;
+    private static PreparedStatement pstmt1;
+    private static PreparedStatement pstmt2;
+    private static PreparedStatement pstmt3;
+    private static PreparedStatement pstmt4;
+    private static PreparedStatement pstmt5;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        Schema.instance.clear();
+
+        cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build();
+        session = cluster.connect();
+
+        session.execute("drop keyspace if exists junit;");
+        session.execute("create keyspace junit WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 2 };");
+        session.execute("CREATE TABLE junit.tpc_base (\n" +
+                "  id int ,\n" +
+                "  cid int ,\n" +
+                "  val text ,\n" +
+                "  PRIMARY KEY ( ( id ), cid )\n" +
+                ");");
+        session.execute("CREATE TABLE junit.tpc_inherit_a (\n" +
+                "  id int ,\n" +
+                "  cid int ,\n" +
+                "  inh_a text ,\n" +
+                "  val text ,\n" +
+                "  PRIMARY KEY ( ( id ), cid )\n" +
+                ");");
+        session.execute("CREATE TABLE junit.tpc_inherit_b (\n" +
+                "  id int ,\n" +
+                "  cid int ,\n" +
+                "  inh_b text ,\n" +
+                "  val text ,\n" +
+                "  PRIMARY KEY ( ( id ), cid )\n" +
+                ");");
+        session.execute("CREATE TABLE junit.tpc_inherit_b2 (\n" +
+                "  id int ,\n" +
+                "  cid int ,\n" +
+                "  inh_b text ,\n" +
+                "  inh_b2 text ,\n" +
+                "  val text ,\n" +
+                "  PRIMARY KEY ( ( id ), cid )\n" +
+                ");");
+        session.execute("CREATE TABLE junit.tpc_inherit_c (\n" +
+                "  id int ,\n" +
+                "  cid int ,\n" +
+                "  inh_c text ,\n" +
+                "  val text ,\n" +
+                "  PRIMARY KEY ( ( id ), cid )\n" +
+                ");");
+
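+        // Prepared insert/update/delete statements target junit.tpc_inherit_b; pstmt1..pstmt5 select from each of the five tables.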
+        pstmtI = session.prepare("insert into junit.tpc_inherit_b ( id, cid, inh_b, val) values (?, ?, ?, ?)");
+        pstmtU = session.prepare("update junit.tpc_inherit_b set inh_b=?, val=? where id=? and cid=?");
+        pstmtD = session.prepare("delete from junit.tpc_inherit_b where id=? and cid=?");
+        pstmt1 = session.prepare("select id, cid, val from junit.tpc_base where id=? and cid=?");
+        pstmt2 = session.prepare("select id, cid, inh_a, val from junit.tpc_inherit_a where id=? and cid=?");
+        pstmt3 = session.prepare("select id, cid, inh_b, val from junit.tpc_inherit_b where id=? and cid=?");
+        pstmt4 = session.prepare("select id, cid, inh_b, inh_b2, val from junit.tpc_inherit_b2 where id=? and cid=?");
+        pstmt5 = session.prepare("select id, cid, inh_c, val from junit.tpc_inherit_c where id=? and cid=?");
+    }
+
+    @AfterClass
+    public static void tearDown() throws Exception
+    {
+        cluster.close();
+    }
+
+    @Test
+    public void lostDeletesTest()
+    {
+
+        for (int i = 0; i < 500; i++)
+        {
+            session.execute(pstmtI.bind(1, 1, "inhB", "valB"));
+
+            ResultSetFuture[] futures = load();
+
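+            // load() queries tpc_base, tpc_inherit_a, tpc_inherit_b, tpc_inherit_b2 and tpc_inherit_c in that order;
+            // only the tpc_inherit_b row written above should be present.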
+            Assert.assertTrue(futures[0].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[1].getUninterruptibly().isExhausted());
+            Assert.assertNotNull(futures[2].getUninterruptibly().one());
+            Assert.assertTrue(futures[3].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[4].getUninterruptibly().isExhausted());
+
+            session.execute(pstmtU.bind("inhBu", "valBu", 1, 1));
+
+            futures = load();
+
+            Assert.assertTrue(futures[0].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[1].getUninterruptibly().isExhausted());
+            Assert.assertNotNull(futures[2].getUninterruptibly().one());
+            Assert.assertTrue(futures[3].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[4].getUninterruptibly().isExhausted());
+
+            session.execute(pstmtD.bind(1, 1));
+
+            futures = load();
+
+            Assert.assertTrue(futures[0].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[1].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[2].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[3].getUninterruptibly().isExhausted());
+            Assert.assertTrue(futures[4].getUninterruptibly().isExhausted());
+        }
+    }
+
+    private ResultSetFuture[] load() {
+        return new ResultSetFuture[]{
+                session.executeAsync(pstmt1.bind(1, 1)),
+                session.executeAsync(pstmt2.bind(1, 1)),
+                session.executeAsync(pstmt3.bind(1, 1)),
+                session.executeAsync(pstmt4.bind(1, 1)),
+                session.executeAsync(pstmt5.bind(1, 1))
+        };
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/ErrorCollectorTest.java b/test/unit/org/apache/cassandra/cql3/ErrorCollectorTest.java
new file mode 100644
index 0000000..4f5db34
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ErrorCollectorTest.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.antlr.runtime.CharStream;
+import org.antlr.runtime.Token;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class ErrorCollectorTest
+{
+    @Test
+    public void testAppendSnippetWithEmptyQuery()
+    {
+        String query = ";";
+
+        ErrorCollector collector = new ErrorCollector(query);
+
+        StringBuilder builder = new StringBuilder();
+
+        Token from = new MockToken(1, 0, ";");
+        Token to = new MockToken(1, 0, ";");
+        Token offending = new MockToken(1, 0, ";");
+
+        collector.appendSnippet(builder, from, to, offending);
+
+        String expected = " ([;])";
+
+        assertEquals(expected, builder.toString());
+    }
+
+    @Test
+    public void testAppendSnippetWithOneLines()
+    {
+        String query = "select * from users where user_name = ''test'';";
+
+        ErrorCollector collector = new ErrorCollector(query);
+
+        StringBuilder builder = new StringBuilder();
+
+        Token from = new MockToken(1, 25, " ");
+        Token to = new MockToken(1, 46, ";");
+        Token offending = new MockToken(1, 40, "test");
+
+        collector.appendSnippet(builder, from, to, offending);
+
+        String expected = " (... user_name = ''[test]'';)";
+
+        assertEquals(expected, builder.toString());
+    }
+
+    @Test
+    public void testAppendSnippetOnSecondLine()
+    {
+        String query = "select * from users\n" +
+                "where user_name = ''test'';";
+
+        ErrorCollector collector = new ErrorCollector(query);
+
+        StringBuilder builder = new StringBuilder();
+
+        Token from = new MockToken(2, 5, " ");
+        Token to = new MockToken(2, 26, ";");
+        Token offending = new MockToken(2, 20, "test");
+
+        collector.appendSnippet(builder, from, to, offending);
+
+        String expected = " (... user_name = ''[test]'';)";
+
+        assertEquals(expected, builder.toString());
+    }
+
+    @Test
+    public void testAppendSnippetWithSnippetOverTwoLines()
+    {
+        String query = "select * from users where user_name \n" +
+                "= ''test'';";
+
+        ErrorCollector collector = new ErrorCollector(query);
+
+        StringBuilder builder = new StringBuilder();
+
+        Token from = new MockToken(1, 20, "where");
+        Token to = new MockToken(2, 9, "'");
+        Token offending = new MockToken(2, 4, "test");
+
+        collector.appendSnippet(builder, from, to, offending);
+
+        String expected = " (...where user_name = ''[test]''...)";
+
+        assertEquals(expected, builder.toString());
+    }
+
+    private final static class MockToken implements Token
+    {
+        /**
+         * The line number on which this token was matched; line=1..n
+         */
+        private int line;
+
+        /**
+         * The index of the first character relative to the beginning of the line 0..n-1
+         */
+        private int charPositionInLine;
+
+        /**
+         * The text of the token
+         */
+        private String text;
+
+        public MockToken(int line, int charPositionInLine, String text)
+        {
+            this.line = line;
+            this.charPositionInLine = charPositionInLine;
+            this.text = text;
+        }
+
+        @Override
+        public int getChannel()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public int getCharPositionInLine()
+        {
+            return charPositionInLine;
+        }
+
+        @Override
+        public CharStream getInputStream()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public int getLine()
+        {
+            return line;
+        }
+
+        @Override
+        public String getText()
+        {
+            return text;
+        }
+
+        @Override
+        public int getTokenIndex()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public int getType()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public void setChannel(int channel)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public void setCharPositionInLine(int charPositionInLine)
+        {
+            this.charPositionInLine = charPositionInLine;
+        }
+
+        @Override
+        public void setInputStream(CharStream inputStream)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public void setLine(int line)
+        {
+            this.line = line;
+        }
+
+        @Override
+        public void setText(String text)
+        {
+            this.text = text;
+        }
+
+        @Override
+        public void setTokenIndex(int tokenIndex)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public void setType(int type)
+        {
+            throw new UnsupportedOperationException();
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/MultiColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/MultiColumnRelationTest.java
index 498d332..bcf4f27 100644
--- a/test/unit/org/apache/cassandra/cql3/MultiColumnRelationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/MultiColumnRelationTest.java
@@ -17,1164 +17,522 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.MD5Digest;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import static org.apache.cassandra.cql3.QueryProcessor.process;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertEquals;
-import static com.google.common.collect.Lists.newArrayList;
-import static org.junit.Assert.fail;
-
-public class MultiColumnRelationTest
+public class MultiColumnRelationTest extends CQLTester
 {
-    private static final Logger logger = LoggerFactory.getLogger(MultiColumnRelationTest.class);
-    static ClientState clientState;
-    static String keyspace = "multi_column_relation_test";
-
-    @BeforeClass
-    public static void setUpClass() throws Throwable
+    @Test
+    public void testSingleClusteringInvalidQueries() throws Throwable
     {
-        SchemaLoader.loadSchema();
-        executeSchemaChange("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_partition (a int PRIMARY KEY, b int)");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.compound_partition (a int, b int, c int, PRIMARY KEY ((a, b)))");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_clustering (a int, b int, c int, PRIMARY KEY (a, b))");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.multiple_clustering (a int, b int, c int, d int, PRIMARY KEY (a, b, c, d))");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.multiple_clustering_reversed (a int, b int, c int, d int, PRIMARY KEY (a, b, c, d)) WITH CLUSTERING ORDER BY (b DESC, c ASC, d DESC)");
-        clientState = ClientState.forInternalCalls();
-    }
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
 
-    @AfterClass
-    public static void stopGossiper()
-    {
-        Gossiper.instance.stop();
-    }
-
-    private static void executeSchemaChange(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        } catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-    private static UntypedResultSet execute(String query) throws Throwable
-    {
-        try
-        {
-            return processInternal(String.format(query, keyspace));
-        } catch (RuntimeException exc)
-        {
-            if (exc.getCause() != null)
-                throw exc.getCause();
-            throw exc;
-        }
-    }
-
-    private MD5Digest prepare(String query) throws RequestValidationException
-    {
-        ResultMessage.Prepared prepared = QueryProcessor.prepare(String.format(query, keyspace), clientState, false);
-        return prepared.statementId;
-    }
-
-    private UntypedResultSet executePrepared(MD5Digest statementId, QueryOptions options) throws RequestValidationException, RequestExecutionException
-    {
-        CQLStatement statement = QueryProcessor.instance.getPrepared(statementId);
-        ResultMessage message = statement.executeInternal(QueryState.forInternalCalls(), options);
-
-        if (message instanceof ResultMessage.Rows)
-            return new UntypedResultSet(((ResultMessage.Rows)message).result);
-        else
-            return null;
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testMixMultiColumnRelationsAndSingleColumn() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a = 1 AND (b) in ((2),(3)) AND c > 4");
-    }
-
-    @Test(expected=SyntaxException.class)
-    public void testEmptyIdentifierTuple() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE () = (1, 2)");
-    }
-
-    @Test(expected=SyntaxException.class)
-    public void testEmptyValueTuple() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE (b, c) > ()");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testDifferentTupleLengths() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE (b, c) > (1, 2, 3)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testNullInTuple() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE (b, c) > (1, null)");
+        assertInvalidSyntax("SELECT * FROM %s WHERE () = (?, ?)", 1, 2);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b) = (?) AND (b) > (?)", 0, 0);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b) > (?) AND (b) > (?)", 0, 1);
+        assertInvalid("SELECT * FROM %s WHERE (a, b) = (?, ?)", 0, 0);
     }
 
     @Test
-    public void testEmptyIN() throws Throwable
+    public void testMultiClusteringInvalidQueries() throws Throwable
     {
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ()");
-        assertTrue(results.isEmpty());
-    }
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c, d))");
 
-    @Test(expected=InvalidRequestException.class)
-    public void testNullInINValues() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((1, 2, null))");
-        assertTrue(results.isEmpty());
-    }
+        assertInvalidSyntax("SELECT * FROM %s WHERE a = 0 AND (b, c) > ()");
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c) > (?, ?, ?)", 1, 2, 3);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c) > (?, ?)", 1, null);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testPartitionKeyInequality() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_partition WHERE (a) > (1)");
-    }
+        // Wrong order of columns
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (d, c, b) = (?, ?, ?)", 0, 0, 0);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (d, c, b) > (?, ?, ?)", 0, 0, 0);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testPartitionKeyEquality() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_partition WHERE (a) = (0)");
-    }
+        // Wrong number of values
+        assertInvalid("SELECT * FROM %s WHERE a=0 AND (b, c, d) IN ((?, ?))", 0, 1);
+        assertInvalid("SELECT * FROM %s WHERE a=0 AND (b, c, d) IN ((?, ?, ?, ?, ?))", 0, 1, 2, 3, 4);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testRestrictNonPrimaryKey() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_partition WHERE (b) = (0)");
-    }
+        // Missing first clustering column
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (c, d) = (?, ?)", 0, 0);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (c, d) > (?, ?)", 0, 0);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testMixEqualityAndInequality() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = (0) AND (b) > (0)");
-    }
+        // Nulls
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c, d) IN ((?, ?, ?))", 1, 2, null);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testMixMultipleInequalitiesOnSameBound() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (0) AND (b) > (1)");
-    }
+        // Wrong type for 'd'
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c, d) = (?, ?, ?)", 1, 2, "foobar");
 
-    @Test(expected=InvalidRequestException.class)
-    public void testClusteringColumnsOutOfOrderInInequality() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (d, c, b) > (0, 0, 0)");
-    }
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND b = (?, ?, ?)", 1, 2, 3);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testSkipClusteringColumnInEquality() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (c, d) = (0, 0)");
-    }
+        // Mix single and tuple inequalities
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c, d) > (?, ?, ?) AND b < ?", 0, 1, 0, 1);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b, c, d) > (?, ?, ?) AND c < ?", 0, 1, 0, 1);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND b > ? AND (b, c, d) < (?, ?, ?)", 1, 1, 1, 0);
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND c > ? AND (b, c, d) < (?, ?, ?)", 1, 1, 1, 0);
 
-    @Test(expected=InvalidRequestException.class)
-    public void testSkipClusteringColumnInInequality() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (c, d) > (0, 0)");
+        assertInvalid("SELECT * FROM %s WHERE (a, b, c, d) IN ((?, ?, ?, ?))", 0, 1, 2, 3);
+        assertInvalid("SELECT * FROM %s WHERE (c, d) IN ((?, ?))", 0, 1);
+
+        assertInvalid("SELECT * FROM %s WHERE a = ? AND (b, c) in ((?, ?), (?, ?)) AND d > ?", 0, 0, 0, 0, 0, 0);
+
     }
 
     @Test
-    public void testSingleClusteringColumnEquality() throws Throwable
+    public void testSinglePartitionInvalidQueries() throws Throwable
     {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 2, 0)");
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = (1)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0);
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int)");
 
-        results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = (3)");
-        assertEquals(0, results.size());
+        assertInvalid("SELECT * FROM %s WHERE (a) > (?)", 0);
+        assertInvalid("SELECT * FROM %s WHERE (a) = (?)", 0);
+        assertInvalid("SELECT * FROM %s WHERE (b) = (?)", 0);
     }
 
     @Test
-    public void testMultipleClusteringColumnEquality() throws Throwable
+    public void testSingleClustering() throws Throwable
     {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 1)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 2, 0, 0)");
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) = (1)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(2, results, 0, 1, 1, 1);
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) = (1, 1)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 1, 0);
-        checkRow(1, results, 0, 1, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 2, 0);
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) = (1, 1, 1)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 1, 1);
-        execute("DELETE FROM %s.multiple_clustering WHERE a=0 AND b=2 and c=0 and d=0");
-    }
+        // Equalities
 
-    @Test(expected=InvalidRequestException.class)
-    public void testPartitionAndClusteringColumnEquality() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE (a, b) = (0, 0)");
-    }
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) = (?)", 0, 1),
+            row(0, 1, 0)
+        );
 
-    @Test(expected=InvalidRequestException.class)
-    public void testClusteringColumnsOutOfOrderInEquality() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (d, c, b) = (3, 2, 1)");
-    }
+        // Same but check the whole tuple can be prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) = ?", 0, tuple(1)),
+            row(0, 1, 0)
+        );
 
-    @Test(expected=InvalidRequestException.class)
-    public void testBadType() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) = (1, 2, 'foobar')");
-    }
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND (b) = (?)", 0, 3));
 
-    @Test(expected=SyntaxException.class)
-    public void testSingleColumnTupleRelation() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND b = (1, 2, 3)");
+        // Inequalities
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) > (?)", 0, 0),
+            row(0, 1, 0),
+            row(0, 2, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) >= (?)", 0, 1),
+            row(0, 1, 0),
+            row(0, 2, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) < (?)", 0, 2),
+            row(0, 0, 0),
+            row(0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) <= (?)", 0, 1),
+            row(0, 0, 0),
+            row(0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) > (?) AND (b) < (?)", 0, 0, 2),
+            row(0, 1, 0)
+        );
     }
 
     @Test
-    public void testMixSingleAndTupleInequalities() throws Throwable
+    public void testNonEqualsRelation() throws Throwable
     {
-        String[] queries = new String[]{
-            "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 0) AND b < 1",
-            "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 0) AND c < 1",
-            "SELECT * FROM %s.multiple_clustering WHERE a=0 AND b > 1 AND (b, c, d) < (1, 1, 0)",
-            "SELECT * FROM %s.multiple_clustering WHERE a=0 AND c > 1 AND (b, c, d) < (1, 1, 0)",
-        };
-
-        for (String query : queries)
-        {
-            try
-            {
-                execute(query);
-                fail(String.format("Expected query \"%s\" to throw an InvalidRequestException", query));
-            }
-            catch (InvalidRequestException e) {}
-        }
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int)");
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND (b) != (0)");
     }
 
     @Test
-    public void testSingleClusteringColumnInequality() throws Throwable
+    public void testMultipleClustering() throws Throwable
     {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 2, 0)");
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c, d))");
 
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (0)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 1);
 
-        results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) >= (1)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 1);
 
-        results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) < (2)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
+        // Empty query
+        assertEmpty(execute("SELECT * FROM %s WHERE a = 0 AND (b, c, d) IN ()"));
 
-        results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) <= (1)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
+        // Equalities
 
-        results = execute("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (0) AND (b) < (2)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0);
-    }
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) = (?)", 0, 1),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-    @Test
-    public void testMultipleClusteringColumnInequality() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
+        // Same with whole tuple prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) = ?", 0, tuple(1)),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 1)");
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) = (?, ?)", 0, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > (0)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(2, results, 0, 1, 1, 1);
+        // Same with whole tuple prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) = ?", 0, tuple(1, 1)),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) >= (0)");
-        assertEquals(6, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-        checkRow(3, results, 0, 1, 0, 0);
-        checkRow(4, results, 0, 1, 1, 0);
-        checkRow(5, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) = (?, ?, ?)", 0, 1, 1, 1),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) > (1, 0)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 1, 0);
-        checkRow(1, results, 0, 1, 1, 1);
+        // Same with whole tuple prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) = ?", 0, tuple(1, 1, 1)),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) >= (1, 0)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(2, results, 0, 1, 1, 1);
+        // Inequalities
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (1, 1, 0)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) > (?)", 0, 0),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) >= (1, 1, 0)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 1, 0);
-        checkRow(1, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) >= (?)", 0, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) < (1)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) > (?, ?)", 0, 1, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) <= (1)");
-        assertEquals(6, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-        checkRow(3, results, 0, 1, 0, 0);
-        checkRow(4, results, 0, 1, 1, 0);
-        checkRow(5, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) >= (?, ?)", 0, 1, 0),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) < (0, 1)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?)", 0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) <= (0, 1)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) >= (?, ?, ?)", 0, 1, 1, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) < (0, 1, 1)");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) < (?)", 0, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) <= (0, 1, 1)");
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) <= (?)", 0, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 0) AND (b) < (1)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) < (?, ?)", 0, 0, 1),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 1) AND (b, c) < (1, 1)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) <= (?, ?)", 0, 0, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 1) AND (b, c, d) < (1, 1, 0)");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) < (?, ?, ?)", 0, 0, 1, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) <= (?, ?, ?)", 0, 0, 1, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b) < (?)", 0, 0, 1, 0, 1),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b, c) < (?, ?)", 0, 0, 1, 1, 1, 1),
+            row(0, 1, 0, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b, c, d) < (?, ?, ?)", 0, 0, 1, 1, 1, 1, 0),
+            row(0, 1, 0, 0)
+        );
+
+        // Same with whole tuple prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > ? AND (b, c, d) < ?", 0, tuple(0, 1, 1), tuple(1, 1, 0)),
+            row(0, 1, 0, 0)
+        );
 
         // reversed
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > (0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) > (?) ORDER BY b DESC, c DESC, d DESC", 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 1, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) >= (0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(6, results.size());
-        checkRow(5, results, 0, 0, 0, 0);
-        checkRow(4, results, 0, 0, 1, 0);
-        checkRow(3, results, 0, 0, 1, 1);
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) >= (?) ORDER BY b DESC, c DESC, d DESC", 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 1, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) > (1, 0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(2, results.size());
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) > (?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 1, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) >= (1, 0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) >= (?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 1, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 1, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (1, 1, 0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 1, 1, 0),
+            row(0, 1, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) >= (1, 1, 0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(2, results.size());
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) >= (?, ?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 1, 1, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) < (1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(0, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) < (?) ORDER BY b DESC, c DESC, d DESC", 0, 1),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) <= (1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(6, results.size());
-        checkRow(5, results, 0, 0, 0, 0);
-        checkRow(4, results, 0, 0, 1, 0);
-        checkRow(3, results, 0, 0, 1, 1);
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) <= (?) ORDER BY b DESC, c DESC, d DESC", 0, 1),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 1, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) < (0, 1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) < (?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) <= (0, 1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(0, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) <= (?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) < (0, 1, 1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(2, results.size());
-        checkRow(1, results, 0, 0, 0, 0);
-        checkRow(0, results, 0, 0, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) < (?, ?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) <= (0, 1, 1) ORDER BY b DESC, c DESC, d DESC");
-        checkRow(2, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(0, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) <= (?, ?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1, 1),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 0) AND (b) < (1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b) < (?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1, 0, 1),
+            row(0, 0, 1, 1)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 1) AND (b, c) < (1, 1) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b, c) < (?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1, 1, 1, 1),
+            row(0, 1, 0, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (0, 1, 1) AND (b, c, d) < (1, 1, 0) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) > (?, ?, ?) AND (b, c, d) < (?, ?, ?) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1, 1, 1, 1, 0),
+            row(0, 1, 0, 0)
+        );
+
+        // IN
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) IN ((?, ?, ?), (?, ?, ?))", 0, 0, 1, 0, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        // same query but with whole tuple prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) IN (?, ?)", 0, tuple(0, 1, 0), tuple(0, 1, 1)),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        // same query but with whole IN list prepared
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) IN ?", 0, list(tuple(0, 1, 0), tuple(0, 1, 1))),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        // same query, but reversed order for the IN values
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) IN (?, ?)", 0, tuple(0, 1, 1), tuple(0, 1, 0)),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? and (b, c) IN ((?, ?))", 0, 0, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? and (b) IN ((?))", 0, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) IN ((?, ?)) ORDER BY b DESC, c DESC, d DESC", 0, 0, 1),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
+
+        // IN on both partition key and clustering key
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 1);
+
+        assertRows(execute("SELECT * FROM %s WHERE a IN (?, ?) AND (b, c, d) IN (?, ?)", 0, 1, tuple(0, 1, 0), tuple(0, 1, 1)),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(1, 0, 1, 0),
+            row(1, 0, 1, 1)
+        );
+
+        // same but with whole IN lists prepared
+        assertRows(execute("SELECT * FROM %s WHERE a IN ? AND (b, c, d) IN ?", list(0, 1), list(tuple(0, 1, 0), tuple(0, 1, 1))),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(1, 0, 1, 0),
+            row(1, 0, 1, 1)
+        );
+
+        // same query, but reversed order for the IN values
+        assertRows(execute("SELECT * FROM %s WHERE a IN (?, ?) AND (b, c, d) IN (?, ?)", 1, 0, tuple(0, 1, 1), tuple(0, 1, 0)),
+            row(1, 0, 1, 0),
+            row(1, 0, 1, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a IN (?, ?) and (b, c) IN ((?, ?))", 0, 1, 0, 1),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(1, 0, 1, 0),
+            row(1, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a IN (?, ?) and (b) IN ((?))", 0, 1, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 0),
+            row(0, 0, 1, 1),
+            row(1, 0, 0, 0),
+            row(1, 0, 1, 0),
+            row(1, 0, 1, 1)
+        );
     }
 
     @Test
-    public void testMultipleClusteringColumnInequalityReversedComponents() throws Throwable
+    public void testMultipleClusteringReversedComponents() throws Throwable
     {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c, d)) WITH CLUSTERING ORDER BY (b DESC, c ASC, d DESC)");
+
         // b and d are reversed in the clustering order
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 1, 1, 1)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 1, 1, 0)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 1);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
 
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 1, 1)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 1, 0)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 1);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
 
 
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b) > (0)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 1);
-        checkRow(2, results, 0, 1, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) > (?)", 0, 0),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b) >= (0)");
-        assertEquals(6, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 1);
-        checkRow(2, results, 0, 1, 1, 0);
-        checkRow(3, results, 0, 0, 0, 0);
-        checkRow(4, results, 0, 0, 1, 1);
-        checkRow(5, results, 0, 0, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) >= (?)", 0, 0),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b) < (1)");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-        checkRow(2, results, 0, 0, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) < (?)", 0, 1),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
 
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b) <= (1)");
-        assertEquals(6, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 1);
-        checkRow(2, results, 0, 1, 1, 0);
-        checkRow(3, results, 0, 0, 0, 0);
-        checkRow(4, results, 0, 0, 1, 1);
-        checkRow(5, results, 0, 0, 1, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) <= (?)", 0, 1),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a=? AND (b, c, d) IN ((?, ?, ?), (?, ?, ?))", 0, 1, 1, 1, 0, 1, 1),
+            row(0, 1, 1, 1),
+            row(0, 0, 1, 1)
+        );
+
+        // same query, but reversed order for the IN values
+        assertRows(execute("SELECT * FROM %s WHERE a=? AND (b, c, d) IN ((?, ?, ?), (?, ?, ?))", 0, 0, 1, 1, 1, 1, 1),
+           row(0, 1, 1, 1),
+           row(0, 0, 1, 1)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c, d) IN (?, ?, ?, ?, ?, ?)",
+                           0, tuple(1, 0, 0), tuple(1, 1, 1), tuple(1, 1, 0), tuple(0, 0, 0), tuple(0, 1, 1), tuple(0, 1, 0)),
+            row(0, 1, 0, 0),
+            row(0, 1, 1, 1),
+            row(0, 1, 1, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) IN (?)", 0, tuple(0, 1)),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b, c) IN (?)", 0, tuple(0, 0)),
+            row(0, 0, 0, 0)
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND (b) IN ((?))", 0, 0),
+            row(0, 0, 0, 0),
+            row(0, 0, 1, 1),
+            row(0, 0, 1, 0)
+        );
 
         // preserve pre-6875 behavior (even though the query result is technically incorrect)
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c) > (1, 0)");
-        assertEquals(0, results.size());
-    }
-
-    @Test
-    public void testLiteralIn() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((0, 1, 0), (0, 1, 1))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        // same query, but reversed order for the IN values
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((0, 1, 1), (0, 1, 0))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b, c) IN ((0, 1))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b) IN ((0))");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) IN ((0, 1)) ORDER BY b DESC, c DESC, d DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-        checkRow(1, results, 0, 0, 1, 0);
-    }
-
-
-    @Test
-    public void testLiteralInReversed() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 1, 1)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering_reversed (a, b, c, d) VALUES (0, -1, 0, 0)");
-
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((0, 1, 0), (0, 1, 1))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-        checkRow(1, results, 0, 0, 1, 0);
-
-        // same query, but reversed order for the IN values
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((0, 1, 1), (0, 1, 0))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-        checkRow(1, results, 0, 0, 1, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((1, 0, 0), (0, 0, 0), (0, 1, 1), (0, 1, 0), (-1, 0, 0))");
-        assertEquals(5, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 0, 0, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-        checkRow(3, results, 0, 0, 1, 0);
-        checkRow(4, results, 0, -1, 0, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((0, 0, 0))");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((0, 1, 1))");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 AND (b, c, d) IN ((0, 1, 0))");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 and (b, c) IN ((0, 1))");
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-        checkRow(1, results, 0, 0, 1, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 and (b, c) IN ((0, 0))");
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-
-        results = execute("SELECT * FROM %s.multiple_clustering_reversed WHERE a=0 and (b) IN ((0))");
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-        checkRow(2, results, 0, 0, 1, 0);
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testLiteralInWithShortTuple() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((0, 1))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testLiteralInWithLongTuple() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((0, 1, 2, 3, 4))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testLiteralInWithPartitionKey() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE (a, b, c, d) IN ((0, 1, 2, 3))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testLiteralInSkipsClusteringColumn() throws Throwable
-    {
-        execute("SELECT * FROM %s.multiple_clustering WHERE (c, d) IN ((0, 1))");
-    }
-    @Test
-    public void testPartitionAndClusteringInClauses() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (1, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (1, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (1, 0, 1, 1)");
-
-        UntypedResultSet results = execute("SELECT * FROM %s.multiple_clustering WHERE a IN (0, 1) AND (b, c, d) IN ((0, 1, 0), (0, 1, 1))");
-        assertEquals(4, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-        checkRow(2, results, 1, 0, 1, 0);
-        checkRow(3, results, 1, 0, 1, 1);
-
-        // same query, but reversed order for the IN values
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a IN (1, 0) AND (b, c, d) IN ((0, 1, 1), (0, 1, 0))");
-        assertEquals(4, results.size());
-        checkRow(0, results, 1, 0, 1, 0);
-        checkRow(1, results, 1, 0, 1, 1);
-        checkRow(2, results, 0, 0, 1, 0);
-        checkRow(3, results, 0, 0, 1, 1);
-
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a IN (0, 1) and (b, c) IN ((0, 1))");
-        assertEquals(4, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-        checkRow(2, results, 1, 0, 1, 0);
-        checkRow(3, results, 1, 0, 1, 1);
-
-        results = execute("SELECT * FROM %s.multiple_clustering WHERE a IN (0, 1) and (b) IN ((0))");
-        assertEquals(6, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-        checkRow(3, results, 1, 0, 0, 0);
-        checkRow(4, results, 1, 0, 1, 0);
-        checkRow(5, results, 1, 0, 1, 1);
-    }
-
-    // prepare statement tests
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPreparePartitionAndClusteringColumnEquality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_clustering WHERE (a, b) = (?, ?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareDifferentTupleLengths() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE (b, c) > (?, ?, ?)");
-    }
-
-    @Test
-    public void testPrepareEmptyIN() throws Throwable
-    {
-        MD5Digest id = prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ()");
-        UntypedResultSet results = executePrepared(id, makeIntOptions());
-        assertTrue(results.isEmpty());
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPreparePartitionKeyInequality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_partition WHERE (a) > (?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPreparePartitionKeyEquality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_partition WHERE (a) = (?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareRestrictNonPrimaryKey() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_partition WHERE (b) = (?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareMixEqualityAndInequality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = (?) AND (b) > (?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareMixMultipleInequalitiesOnSameBound() throws Throwable
-    {
-        prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (?) AND (b) > (?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareClusteringColumnsOutOfOrderInInequality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (d, c, b) > (?, ?, ?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareSkipClusteringColumnInEquality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (c, d) = (?, ?)");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareSkipClusteringColumnInInequality() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (c, d) > (?, ?)");
-    }
-
-    @Test
-    public void testPreparedClusteringColumnEquality() throws Throwable
-    {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        MD5Digest id = prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = (?)");
-        UntypedResultSet results = executePrepared(id, makeIntOptions(0));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0);
-    }
-
-    @Test
-    public void testPreparedClusteringColumnEqualitySingleMarker() throws Throwable
-    {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        MD5Digest id = prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) = ?");
-        UntypedResultSet results = executePrepared(id, options(tuple(0)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 0);
-    }
-
-    @Test
-    public void testPreparedSingleClusteringColumnInequality() throws Throwable
-    {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 2, 0)");
-
-        MD5Digest id = prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (?)");
-        UntypedResultSet results = executePrepared(id, makeIntOptions(0));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) >= (?)"), makeIntOptions(1));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) < (?)"), makeIntOptions(2));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) <= (?)"), makeIntOptions(1));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > (?) AND (b) < (?)"), makeIntOptions(0, 2));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0);
-    }
-
-    @Test
-    public void testPreparedSingleClusteringColumnInequalitySingleMarker() throws Throwable
-    {
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 0, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 1, 0)");
-        execute("INSERT INTO %s.single_clustering (a, b, c) VALUES (0, 2, 0)");
-
-        MD5Digest id = prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > ?");
-        UntypedResultSet results = executePrepared(id, options(tuple(0)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) >= ?"), options(tuple(1)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 0);
-        checkRow(1, results, 0, 2, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) < ?"), options(tuple(2)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) <= ?"), options(tuple(1)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 0);
-        checkRow(1, results, 0, 1, 0);
-
-
-        results = executePrepared(prepare("SELECT * FROM %s.single_clustering WHERE a=0 AND (b) > ? AND (b) < ?"),
-                options(tuple(0), tuple(2)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0);
-    }
-
-    @Test
-    public void testPrepareMultipleClusteringColumnInequality() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 1)");
-
-        UntypedResultSet results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > (?)"), makeIntOptions(0));
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(2, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) > (?, ?)"), makeIntOptions(1, 0));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 1, 0);
-        checkRow(1, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare
-                ("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (?, ?, ?)"), makeIntOptions(1, 1, 0));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (?, ?, ?) AND (b) < (?)"),
-                makeIntOptions(0, 1, 0, 1));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare
-                ("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (?, ?, ?) AND (b, c) < (?, ?)"),
-                makeIntOptions(0, 1, 1, 1, 1));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (?, ?, ?) AND (b, c, d) < (?, ?, ?)"),
-                makeIntOptions(0, 1, 1, 1, 1, 0));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-
-        // reversed
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > (?) ORDER BY b DESC, c DESC, d DESC"),
-                makeIntOptions(0));
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > (?, ?, ?) AND (b, c) < (?, ?) ORDER BY b DESC, c DESC, d DESC"),
-                makeIntOptions(0, 1, 1, 1, 1));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-    }
-
-    @Test
-    public void testPrepareMultipleClusteringColumnInequalitySingleMarker() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 1, 1, 1)");
-
-        UntypedResultSet results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > ?"), options(tuple(0)));
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(2, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c) > ?"), options(tuple(1, 0)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 1, 1, 0);
-        checkRow(1, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare
-                ("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > ?"), options(tuple(1, 1, 0)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > ? AND (b) < ?"),
-                options(tuple(0, 1, 0), tuple(1)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare
-                ("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > ? AND (b, c) < ?"),
-                options(tuple(0, 1, 1), tuple(1, 1)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > ? AND (b, c, d) < ?"),
-                options(tuple(0, 1, 1), tuple(1, 1, 0)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-
-        // reversed
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b) > ? ORDER BY b DESC, c DESC, d DESC"),
-                options(tuple(0)));
-        assertEquals(3, results.size());
-        checkRow(2, results, 0, 1, 0, 0);
-        checkRow(1, results, 0, 1, 1, 0);
-        checkRow(0, results, 0, 1, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) > ? AND (b, c) < ? ORDER BY b DESC, c DESC, d DESC"),
-                options(tuple(0, 1, 1), tuple(1, 1)));
-        assertEquals(1, results.size());
-        checkRow(0, results, 0, 1, 0, 0);
-    }
-
-    @Test
-    public void testPrepareLiteralIn() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        UntypedResultSet results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((?, ?, ?), (?, ?, ?))"),
-                makeIntOptions(0, 1, 0, 0, 1, 1));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        // same query, but reversed order for the IN values
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((?, ?, ?), (?, ?, ?))"),
-                makeIntOptions(0, 1, 1, 0, 1, 0));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b, c) IN ((?, ?))"),
-                makeIntOptions(0, 1));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b) IN ((?))"),
-                makeIntOptions(0));
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-    }
-
-    @Test
-    public void testPrepareInOneMarkerPerTuple() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        UntypedResultSet results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN (?, ?)"),
-                options(tuple(0, 1, 0), tuple(0, 1, 1)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        // same query, but reversed order for the IN values
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN (?, ?)"),
-                options(tuple(0, 1, 1), tuple(0, 1, 0)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b, c) IN (?)"),
-                options(tuple(0, 1)));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b) IN (?)"),
-                options(tuple(0)));
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-    }
-
-    @Test
-    public void testPrepareInOneMarker() throws Throwable
-    {
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 0, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 0)");
-        execute("INSERT INTO %s.multiple_clustering (a, b, c, d) VALUES (0, 0, 1, 1)");
-
-        UntypedResultSet results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ?"),
-                options(list(tuple(0, 1, 0), tuple(0, 1, 1))));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        // same query, but reversed order for the IN values
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ?"),
-                options(list(tuple(0, 1, 1), tuple(0, 1, 0))));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare(
-                "SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ?"),
-                options(list()));
-        assertTrue(results.isEmpty());
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b, c) IN ?"),
-                options(list(tuple(0, 1))));
-        assertEquals(2, results.size());
-        checkRow(0, results, 0, 0, 1, 0);
-        checkRow(1, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b) IN ?"),
-                options(list(tuple(0))));
-        assertEquals(3, results.size());
-        checkRow(0, results, 0, 0, 0, 0);
-        checkRow(1, results, 0, 0, 1, 0);
-        checkRow(2, results, 0, 0, 1, 1);
-
-        results = executePrepared(prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 and (b) IN ?"),
-                options(list()));
-        assertTrue(results.isEmpty());
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareLiteralInWithShortTuple() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((?, ?))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareLiteralInWithLongTuple() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE a=0 AND (b, c, d) IN ((?, ?, ?, ?, ?))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareLiteralInWithPartitionKey() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE (a, b, c, d) IN ((?, ?, ?, ?))");
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void testPrepareLiteralInSkipsClusteringColumn() throws Throwable
-    {
-        prepare("SELECT * FROM %s.multiple_clustering WHERE (c, d) IN ((?, ?))");
-    }
-
-    private static QueryOptions makeIntOptions(Integer... values)
-    {
-        List<ByteBuffer> buffers = new ArrayList<>(values.length);
-        for (int value : values)
-            buffers.add(ByteBufferUtil.bytes(value));
-        return new QueryOptions(ConsistencyLevel.ONE, buffers);
-    }
-
-    private static ByteBuffer tuple(Integer... values)
-    {
-        List<AbstractType<?>> types = new ArrayList<>(values.length);
-        ByteBuffer[] buffers = new ByteBuffer[values.length];
-        for (int i = 0; i < values.length; i++)
-        {
-            types.add(Int32Type.instance);
-            buffers[i] = ByteBufferUtil.bytes(values[i]);
-        }
-
-        TupleType type = new TupleType(types);
-        return type.buildValue(buffers);
-    }
-
-    private static ByteBuffer list(ByteBuffer... values)
-    {
-        return CollectionType.pack(Arrays.asList(values), values.length);
-    }
-
-    private static QueryOptions options(ByteBuffer... buffers)
-    {
-        return new QueryOptions(ConsistencyLevel.ONE, Arrays.asList(buffers));
-    }
-
-    private static void checkRow(int rowIndex, UntypedResultSet results, Integer... expectedValues)
-    {
-        List<UntypedResultSet.Row> rows = newArrayList(results.iterator());
-        UntypedResultSet.Row row = rows.get(rowIndex);
-        Iterator<ColumnSpecification> columns = row.getColumns().iterator();
-        for (Integer expected : expectedValues)
-        {
-            String columnName = columns.next().name.toString();
-            int actual = row.getInt(columnName);
-            assertEquals(String.format("Expected value %d for column %s in row %d, but got %s", actual, columnName, rowIndex, expected),
-                         (long) expected, actual);
-        }
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND (b, c) > (?, ?)", 0, 1, 0));
     }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
new file mode 100644
index 0000000..f65ec18
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.Session;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+
+import static junit.framework.Assert.assertEquals;
+
+public class PreparedStatementsTest extends SchemaLoader
+{
+    private static Cluster cluster;
+    private static Session session;
+
+    private static final String KEYSPACE = "prepared_stmt_cleanup";
+    private static final String createKsStatement = "CREATE KEYSPACE " + KEYSPACE +
+                                                    " WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };";
+    private static final String dropKsStatement = "DROP KEYSPACE IF EXISTS " + KEYSPACE;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        Schema.instance.clear();
+
+        EmbeddedCassandraService cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        // Currently the native server start method returns before the server is fully bound to the socket,
+        // so we need to wait briefly before trying to connect. This should be fixed, but in the meantime a short sleep works around it.
+        Thread.sleep(500);
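+        // A more robust alternative (just a sketch, not used here) would be to poll the native
+        // transport port until it accepts connections instead of sleeping for a fixed amount;
+        // the 10 second deadline below is an arbitrary assumption:
+        //
+        //     long deadline = System.currentTimeMillis() + 10000;
+        //     while (System.currentTimeMillis() < deadline)
+        //     {
+        //         try (java.net.Socket socket = new java.net.Socket("127.0.0.1", DatabaseDescriptor.getNativeTransportPort()))
+        //         {
+        //             break; // the port is accepting connections, safe to connect the driver
+        //         }
+        //         catch (java.io.IOException e)
+        //         {
+        //             Thread.sleep(50); // not ready yet, retry shortly
+        //         }
+        //     }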
+
+        cluster = Cluster.builder().addContactPoint("127.0.0.1")
+                                   .withPort(DatabaseDescriptor.getNativeTransportPort())
+                                   .build();
+        session = cluster.connect();
+
+        session.execute(dropKsStatement);
+        session.execute(createKsStatement);
+    }
+
+    @AfterClass
+    public static void tearDown() throws Exception
+    {
+        cluster.close();
+    }
+
+    @Test
+    public void testInvalidatePreparedStatementsOnDrop()
+    {
+        String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (id int PRIMARY KEY, cid int, val text);";
+        String dropTableStatement = "DROP TABLE IF EXISTS " + KEYSPACE + ".qp_cleanup;";
+
+        session.execute(createTableStatement);
+        PreparedStatement prepared = session.prepare("INSERT INTO " + KEYSPACE + ".qp_cleanup (id, cid, val) VALUES (?, ?, ?)");
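+        // Drop and recreate the table, then re-execute the statement that was prepared
+        // against the original table definition.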
+        session.execute(dropTableStatement);
+        session.execute(createTableStatement);
+        session.execute(prepared.bind(1, 1, "value"));
+
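+        // Repeat the cycle at the keyspace level: drop and recreate the keyspace and the
+        // table, then re-execute the same prepared statement.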
+        session.execute(dropKsStatement);
+        session.execute(createKsStatement);
+        session.execute(createTableStatement);
+        session.execute(prepared.bind(1, 1, "value"));
+        session.execute(dropKsStatement);
+
+        // Invalidation is not asserted directly here; the re-executions above only verify
+        // that the prepared statement remains usable after its table and keyspace are
+        // dropped and recreated.
+    }
+
+    @Test
+    public void testStatementRePreparationOnReconnect()
+    {
+        session.execute(dropKsStatement);
+        session.execute(createKsStatement);
+
+        session.execute("CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_test (id int PRIMARY KEY, cid int, val text);");
+
+        String insertCQL = "INSERT INTO " + KEYSPACE + ".qp_test (id, cid, val) VALUES (?, ?, ?)";
+        String selectCQL = "SELECT * FROM " + KEYSPACE + ".qp_test WHERE id = ?";
+
+        PreparedStatement preparedInsert = session.prepare(insertCQL);
+        PreparedStatement preparedSelect = session.prepare(selectCQL);
+
+        session.execute(preparedInsert.bind(1, 1, "value"));
+        assertEquals(1, session.execute(preparedSelect.bind(1)).all().size());
+
+        cluster.close();
+
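+        // Reconnect with a brand new Cluster/Session; the statements must be prepared again on the new connection.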
+        cluster = Cluster.builder().addContactPoint("127.0.0.1")
+                                   .withPort(DatabaseDescriptor.getNativeTransportPort())
+                                   .build();
+        session = cluster.connect();
+
+        preparedInsert = session.prepare(insertCQL);
+        preparedSelect = session.prepare(selectCQL);
+        session.execute(preparedInsert.bind(1, 1, "value"));
+
+        assertEquals(1, session.execute(preparedSelect.bind(1)).all().size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/SSTableMetadataTrackingTest.java b/test/unit/org/apache/cassandra/cql3/SSTableMetadataTrackingTest.java
index 9104269..7c3965f 100644
--- a/test/unit/org/apache/cassandra/cql3/SSTableMetadataTrackingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/SSTableMetadataTrackingTest.java
@@ -17,44 +17,23 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.service.ClientState;
-import static org.apache.cassandra.cql3.QueryProcessor.process;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import static org.junit.Assert.assertEquals;
 
-public class SSTableMetadataTrackingTest
+public class SSTableMetadataTrackingTest extends CQLTester
 {
-    private static String keyspace = "sstable_metadata_tracking_test";
-    private static ClientState clientState;
-
-    @BeforeClass
-    public static void setup() throws Throwable
-    {
-        SchemaLoader.loadSchema();
-        createKeyspace("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
-        clientState = ClientState.forInternalCalls();
-    }
-
-    
     @Test
     public void baseCheck() throws Throwable
     {
-        createTable("CREATE TABLE %s.basecheck (a int, b int, c text, PRIMARY KEY (a, b))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("basecheck");
-        execute("INSERT INTO %s.basecheck (a,b,c) VALUES (1,1,'1') using timestamp 9999");
+        createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 9999");
         cfs.forceBlockingFlush();
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
         cfs.forceMajorCompaction();
@@ -66,12 +45,12 @@
     @Test
     public void testMinMaxtimestampRange() throws Throwable
     {
-        createTable("CREATE TABLE %s.minmaxtsrange (a int, b int, c text, PRIMARY KEY (a, b))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("minmaxtsrange");
-        execute("INSERT INTO %s.minmaxtsrange (a,b,c) VALUES (1,1,'1') using timestamp 10000");
-        execute("DELETE FROM %s.minmaxtsrange USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
+        createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
+        execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
         cfs.forceBlockingFlush();
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
@@ -85,12 +64,12 @@
     @Test
     public void testMinMaxtimestampRow() throws Throwable
     {
-        createTable("CREATE TABLE %s.minmaxtsrow (a int, b int, c text, PRIMARY KEY (a, b))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("minmaxtsrow");
-        execute("INSERT INTO %s.minmaxtsrow (a,b,c) VALUES (1,1,'1') using timestamp 10000");
-        execute("DELETE FROM %s.minmaxtsrow USING TIMESTAMP 9999 WHERE a = 1");
+        createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
+        execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
         cfs.forceBlockingFlush();
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
@@ -105,17 +84,17 @@
     @Test
     public void testTrackMetadata_rangeTombstone() throws Throwable
     {
-        createTable("CREATE TABLE %s.rangetombstone (a int, b int, c text, PRIMARY KEY (a, b)) WITH gc_grace_seconds = 10000");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("rangetombstone");
-        execute("DELETE FROM %s.rangetombstone USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
+        createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b)) WITH gc_grace_seconds = 10000");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
         cfs.forceBlockingFlush();
         assertEquals(1, cfs.getSSTables().size());
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        SSTableMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
@@ -124,18 +103,18 @@
     @Test
     public void testTrackMetadata_rowTombstone() throws Throwable
     {
-        createTable("CREATE TABLE %s.rowtombstone (a int, b int, c text, PRIMARY KEY (a, b))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("rowtombstone");
-        execute("DELETE FROM %s.rowtombstone USING TIMESTAMP 9999 WHERE a = 1");
+        createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
 
         cfs.forceBlockingFlush();
         assertEquals(1, cfs.getSSTables().size());
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        SSTableMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
@@ -144,18 +123,18 @@
     @Test
     public void testTrackMetadata_rowMarker() throws Throwable
     {
-        createTable("CREATE TABLE %s.rowmarker (a int, PRIMARY KEY (a))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("rowmarker");
-        execute("INSERT INTO %s.rowmarker (a) VALUES (1) USING TIMESTAMP 9999");
+        createTable("CREATE TABLE %s (a int, PRIMARY KEY (a))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("INSERT INTO %s (a) VALUES (1) USING TIMESTAMP 9999");
 
         cfs.forceBlockingFlush();
         assertEquals(1, cfs.getSSTables().size());
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
         cfs.forceMajorCompaction();
-        SSTableMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
@@ -163,61 +142,19 @@
     @Test
     public void testTrackMetadata_rowMarkerDelete() throws Throwable
     {
-        createTable("CREATE TABLE %s.rowmarkerdel (a int, PRIMARY KEY (a))");
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore("rowmarkerdel");
-        execute("DELETE FROM %s.rowmarkerdel USING TIMESTAMP 9999 WHERE a=1");
+        createTable("CREATE TABLE %s (a int, PRIMARY KEY (a))");
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
+        execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a=1");
         cfs.forceBlockingFlush();
         assertEquals(1, cfs.getSSTables().size());
-        SSTableMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        SSTableMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
     }
-
-    @AfterClass
-    public static void stopGossiper()
-    {
-        Gossiper.instance.stop();
-    }
-
-    private static void createKeyspace(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        } catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-
-    private void createTable(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        } catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-    private UntypedResultSet execute(String query) throws Throwable
-    {
-        try
-        {
-            return processInternal(String.format(query, keyspace));
-        } catch (RuntimeException exc)
-        {
-            if (exc.getCause() != null)
-                throw exc.getCause();
-            throw exc;
-        }
-    }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/SelectWithTokenFunctionTest.java b/test/unit/org/apache/cassandra/cql3/SelectWithTokenFunctionTest.java
index 9199862..6f9f5e2 100644
--- a/test/unit/org/apache/cassandra/cql3/SelectWithTokenFunctionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/SelectWithTokenFunctionTest.java
@@ -17,155 +17,50 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.service.ClientState;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import static org.apache.cassandra.cql3.QueryProcessor.process;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
-import static org.junit.Assert.assertEquals;
-
-public class SelectWithTokenFunctionTest
+public class SelectWithTokenFunctionTest extends CQLTester
 {
-    private static final Logger logger = LoggerFactory.getLogger(SelectWithTokenFunctionTest.class);
-    static ClientState clientState;
-    static String keyspace = "token_function_test";
-
-    @BeforeClass
-    public static void setUpClass() throws Throwable
+    @Test
+    public void testTokenFunctionWithSingleColumnPartitionKey() throws Throwable
     {
-        SchemaLoader.loadSchema();
-        executeSchemaChange("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_partition (a int PRIMARY KEY, b text)");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.compound_partition (a int, b text, PRIMARY KEY ((a, b)))");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_clustering (a int, b text, PRIMARY KEY (a, b))");
-        clientState = ClientState.forInternalCalls();
-    }
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int PRIMARY KEY, b text)");
+        execute("INSERT INTO %s (a, b) VALUES (0, 'a')");
 
-    @AfterClass
-    public static void stopGossiper()
-    {
-        Gossiper.instance.stop();
-    }
-
-    private static void executeSchemaChange(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        }
-        catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-    private static UntypedResultSet execute(String query) throws Throwable
-    {
-        try
-        {
-            return processInternal(String.format(query, keyspace));
-        }
-        catch (RuntimeException exc)
-        {
-            if (exc.getCause() != null)
-                throw exc.getCause();
-            throw exc;
-        }
+        assertRows(execute("SELECT * FROM %s WHERE token(a) >= token(?)", 0), row(0, "a"));
+        assertRows(execute("SELECT * FROM %s WHERE token(a) >= token(?) and token(a) < token(?)", 0, 1), row(0, "a"));
+        assertInvalid("SELECT * FROM %s WHERE token(a) > token(?)", "a");
+        assertInvalid("SELECT * FROM %s WHERE token(a, b) >= token(?, ?)", "b", 0);
+        assertInvalid("SELECT * FROM %s WHERE token(a) >= token(?) and token(a) >= token(?)", 0, 1);
+        assertInvalid("SELECT * FROM %s WHERE token(a) >= token(?) and token(a) = token(?)", 0, 1);
+        assertInvalidSyntax("SELECT * FROM %s WHERE token(a) = token(?) and token(a) IN (token(?))", 0, 1);
     }
 
     @Test
-    public void testTokenFunctionWithSinglePartitionArgument() throws Throwable
-    {
-        execute("INSERT INTO %s.single_partition (a, b) VALUES (0, 'a')");
-
-        try
-        {
-            UntypedResultSet results = execute("SELECT * FROM %s.single_partition WHERE token(a) >= token(0)");
-            assertEquals(1, results.size());
-            results = execute("SELECT * FROM %s.single_partition WHERE token(a) >= token(0) and token(a) < token(1)");
-            assertEquals(1, results.size());
-        }
-        finally
-        {
-            execute("DELETE FROM %s.single_partition WHERE a = 0");
-        }
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionWithWrongLiteralArgument() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_partition WHERE token(a) > token('a')");
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionWithTwoGreaterThan() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE token(a) >= token(0) and token(a) >= token(1)");
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionWithGreaterThanAndEquals() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE token(a) >= token(0) and token(a) = token(1)");
-    }
-
-    @Test(expected = SyntaxException.class)
-    public void testTokenFunctionWithGreaterThanAndIn() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_clustering WHERE token(a) >= token(0) and token(a) in (token(1))");
-    }
-
-    @Test(expected = InvalidRequestException.class)
     public void testTokenFunctionWithPartitionKeyAndClusteringKeyArguments() throws Throwable
     {
-        execute("SELECT * FROM %s.single_clustering WHERE token(a, b) > token(0, 'c')");
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionWithCompoundPartitionKeyAndWrongLiteralArgument() throws Throwable
-    {
-        execute("SELECT * FROM %s.single_partition WHERE token(a, b) >= token('c', 0)");
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b text, PRIMARY KEY (a, b))");
+        assertInvalid("SELECT * FROM %s WHERE token(a, b) > token(0, 'c')");
     }
 
     @Test
-    public void testTokenFunctionWithCompoundPartition() throws Throwable
+    public void testTokenFunctionWithMultiColumnPartitionKey() throws Throwable
     {
-        execute("INSERT INTO %s.compound_partition (a, b) VALUES (0, 'a')");
-        execute("INSERT INTO %s.compound_partition (a, b) VALUES (0, 'b')");
-        execute("INSERT INTO %s.compound_partition (a, b) VALUES (0, 'c')");
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b text, PRIMARY KEY ((a, b)))");
+        execute("INSERT INTO %s (a, b) VALUES (0, 'a')");
+        execute("INSERT INTO %s (a, b) VALUES (0, 'b')");
+        execute("INSERT INTO %s (a, b) VALUES (0, 'c')");
 
-        try
-        {
-            UntypedResultSet results = execute("SELECT * FROM %s.compound_partition WHERE token(a, b) > token(0, 'a')");
-            assertEquals(2, results.size());
-            results = execute("SELECT * FROM %s.compound_partition WHERE token(a, b) > token(0, 'a') "
-                    + "and token(a, b) < token(0, 'd')");
-            assertEquals(2, results.size());
-        }
-        finally
-        {
-            execute("DELETE FROM %s.compound_partition WHERE a = 0 and b in ('a', 'b', 'c')");
-        }
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionWithCompoundPartitionKeyAndColumnIdentifierInWrongOrder() throws Throwable
-    {
-        execute("SELECT * FROM %s.compound_partition WHERE token(b, a) > token(0, 'c')");
-    }
-
-    @Test(expected = InvalidRequestException.class)
-    public void testTokenFunctionOnEachPartitionKeyColumns() throws Throwable
-    {
-        execute("SELECT * FROM %s.compound_partition WHERE token(a) > token(0) and token(b) > token('c')");
+        assertRows(execute("SELECT * FROM %s WHERE token(a, b) > token(?, ?)", 0, "a"),
+                   row(0, "b"),
+                   row(0, "c"));
+        assertRows(execute("SELECT * FROM %s WHERE token(a, b) > token(?, ?) and token(a, b) < token(?, ?)",
+                           0, "a",
+                           0, "d"),
+                   row(0, "b"),
+                   row(0, "c"));
+        assertInvalid("SELECT * FROM %s WHERE token(a) > token(?) and token(b) > token(?)", 0, "a");
+        assertInvalid("SELECT * FROM %s WHERE token(a) > token(?, ?) and token(a) < token(?, ?) and token(b) > token(?, ?) ", 0, "a", 0, "d", 0, "a");
+        assertInvalid("SELECT * FROM %s WHERE token(b, a) > token(0, 'c')");
     }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/SingleColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/SingleColumnRelationTest.java
new file mode 100644
index 0000000..120c780
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/SingleColumnRelationTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class SingleColumnRelationTest extends CQLTester
+{
+    @Test
+    public void testInvalidCollectionEqualityRelation() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b set<int>, c list<int>, d map<int, int>)");
+        createIndex("CREATE INDEX ON %s (b)");
+        createIndex("CREATE INDEX ON %s (c)");
+        createIndex("CREATE INDEX ON %s (d)");
+
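+        // Equality restrictions on these (non-frozen) collection columns are expected to be rejected, even though they are indexed.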
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND b=?", set(0));
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND c=?", list(0));
+        assertInvalid("SELECT * FROM %s WHERE a = 0 AND d=?", map(0, 0));
+    }
+
+    @Test
+    public void testInvalidCollectionNonEQRelation() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b set<int>, c int)");
+        createIndex("CREATE INDEX ON %s (c)");
+        execute("INSERT INTO %s (a, b, c) VALUES (0, {0}, 0)");
+
+        // non-EQ operators
+        assertInvalid("SELECT * FROM %s WHERE c = 0 AND b > ?", set(0));
+        assertInvalid("SELECT * FROM %s WHERE c = 0 AND b >= ?", set(0));
+        assertInvalid("SELECT * FROM %s WHERE c = 0 AND b < ?", set(0));
+        assertInvalid("SELECT * FROM %s WHERE c = 0 AND b <= ?", set(0));
+        assertInvalid("SELECT * FROM %s WHERE c = 0 AND b IN (?)", set(0));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/SliceQueryFilterWithStaticColumnsPresentTest.java b/test/unit/org/apache/cassandra/cql3/SliceQueryFilterWithStaticColumnsPresentTest.java
deleted file mode 100644
index 75d1a1d..0000000
--- a/test/unit/org/apache/cassandra/cql3/SliceQueryFilterWithStaticColumnsPresentTest.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.MD5Digest;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import static org.apache.cassandra.cql3.QueryProcessor.process;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertEquals;
-import static com.google.common.collect.Lists.newArrayList;
-import static org.junit.Assert.fail;
-
-/**
- * Test column ranges and ordering with static column in table
- */
-public class SliceQueryFilterWithStaticColumnsPresentTest
-{
-    static ClientState clientState;
-    static String keyspace = "static_column_slice_test";
-
-    @BeforeClass
-    public static void setUpClass() throws Throwable
-    {
-        SchemaLoader.loadSchema();
-        executeSchemaChange("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_clustering (p text, c text, v text, s text static, PRIMARY KEY (p, c));");
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.single_clustering_reversed (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH CLUSTERING ORDER BY (c DESC);");
-        execute("INSERT INTO %s.single_clustering (p, c, v, s) values ('p1', 'k1', 'v1', 'sv1')");
-        execute("INSERT INTO %s.single_clustering (p, c, v) values ('p1', 'k2', 'v2')");
-        execute("INSERT INTO %s.single_clustering (p, s) values ('p2', 'sv2')");
-        execute("INSERT INTO %s.single_clustering_reversed (p, c, v, s) values ('p1', 'k1', 'v1', 'sv1')");
-        execute("INSERT INTO %s.single_clustering_reversed (p, c, v) values ('p1', 'k2', 'v2')");
-        execute("INSERT INTO %s.single_clustering_reversed (p, s) values ('p2', 'sv2')");
-        clientState = ClientState.forInternalCalls();
-    }
-
-    @AfterClass
-    public static void stopGossiper()
-    {
-        Gossiper.instance.stop();
-    }
-
-    private static void executeSchemaChange(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        } catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-    private static UntypedResultSet execute(String query) throws Throwable
-    {
-        try
-        {
-            return processInternal(String.format(query, keyspace));
-        } catch (RuntimeException exc)
-        {
-            if (exc.getCause() != null)
-                throw exc.getCause();
-            throw exc;
-        }
-    }
-
-    @Test
-    public void testNoClusteringColumnDefaultOrdering() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1'");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p2'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1'");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p2'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-    }
-
-    @Test
-    public void testNoClusteringColumnAscending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p2' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p2' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-    }
-
-    @Test
-    public void testNoClusteringColumnDescending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p2' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p2' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p2", null, "sv2", null);
-    }
-
-    @Test
-    public void testSingleRelationDefaultOrdering() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k1'");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k2'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k3'");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c ='k1'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k1'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k0'");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k1'");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k2'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k3'");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c='k1'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k1'");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k0'");
-        assertEquals(0, results.size());
-    }
-
-    @Test
-    public void testSingleRelationAscending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k1' ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k2' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k3' ORDER BY c ASC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c ='k1' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k1' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k0' ORDER BY c ASC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k1' ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k2' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k3' ORDER BY c ASC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c='k1' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k1' ORDER BY c ASC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k0' ORDER BY c ASC");
-        assertEquals(0, results.size());
-    }
-
-    @Test
-    public void testSingleRelationDescending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k1' ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k2' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c>='k3' ORDER BY c DESC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c ='k1' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k1' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c<='k0' ORDER BY c DESC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k1' ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k2' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c>='k3' ORDER BY c DESC");
-        assertEquals(0, results.size());
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c='k1' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k1' ORDER BY c DESC");
-        assertEquals(1, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c<='k0' ORDER BY c DESC");
-        assertEquals(0, results.size());
-    }
-
-    @Test
-    public void testInDefaultOrdering() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c IN ('k1', 'k2')");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c IN ('k1', 'k2')");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-    }
-
-    @Test
-    public void testInAscending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c IN ('k1', 'k2') ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c IN ('k1', 'k2') ORDER BY c ASC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k1", "sv1", "v1");
-        checkRow(1, results, "p1", "k2", "sv1", "v2");
-    }
-
-    @Test
-    public void testInDescending() throws Throwable
-    {
-        UntypedResultSet results = execute("SELECT * FROM %s.single_clustering WHERE p='p1' AND c IN ('k1', 'k2') ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-
-        results = execute("SELECT * FROM %s.single_clustering_reversed WHERE p='p1' AND c IN ('k1', 'k2') ORDER BY c DESC");
-        assertEquals(2, results.size());
-        checkRow(0, results, "p1", "k2", "sv1", "v2");
-        checkRow(1, results, "p1", "k1", "sv1", "v1");
-    }
-
-    private static void checkRow(int rowIndex, UntypedResultSet results, String... expectedValues)
-    {
-        List<UntypedResultSet.Row> rows = newArrayList(results.iterator());
-        UntypedResultSet.Row row = rows.get(rowIndex);
-        Iterator<ColumnSpecification> columns = row.getColumns().iterator();
-        for (String expected : expectedValues)
-        {
-            String columnName = columns.next().name.toString();
-            String actual = row.has(columnName) ? row.getString(columnName) : null;
-            assertEquals(String.format("Expected value %s for column %s in row %d, but got %s", actual, columnName, rowIndex, expected),
-                    expected, actual);
-        }
-    }
-}
diff --git a/test/unit/org/apache/cassandra/cql3/StaticColumnsQueryTest.java b/test/unit/org/apache/cassandra/cql3/StaticColumnsQueryTest.java
new file mode 100644
index 0000000..e27f968
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/StaticColumnsQueryTest.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+/**
+ * Test column ranges and ordering with static column in table
+ */
+public class StaticColumnsQueryTest extends CQLTester
+{
+    @Test
+    public void testSingleClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c))");
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
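+        // The static column s is shared by the whole partition: both rows of p1 report the value set by the first insert.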
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // Ascending order
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c ASC", "p1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c ASC", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // Descending order
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c DESC", "p1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c DESC", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // No order with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c =?", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=?", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=?", "p1", "k0"));
+
+        // Ascending with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c =? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c ASC", "p1", "k0"));
+
+        // Descending with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c =? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c DESC", "p1", "k0"));
+
+        // IN
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?)", "p1", "k1", "k2"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?) ORDER BY c ASC", "p1", "k1", "k2"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?) ORDER BY c DESC", "p1", "k1", "k2"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+    }
+
+    @Test
+    public void testSingleClusteringReversed() throws Throwable
+    {
+        createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH CLUSTERING ORDER BY (c DESC)");
+
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=?", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // Ascending order
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c ASC", "p1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c ASC", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // Descending order
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c DESC", "p1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? ORDER BY c DESC", "p2"),
+            row("p2", null, "sv2", null)
+        );
+
+        // No order with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=?", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c=?", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=?", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=?", "p1", "k0"));
+
+        // Ascending with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c ASC", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c=? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c ASC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c ASC", "p1", "k0"));
+
+        // Descending with one relation
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k2"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c>=? ORDER BY c DESC", "p1", "k3"));
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c=? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c DESC", "p1", "k1"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertEmpty(execute("SELECT * FROM %s WHERE p=? AND c<=? ORDER BY c DESC", "p1", "k0"));
+
+        // IN
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?)", "p1", "k1", "k2"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?) ORDER BY c ASC", "p1", "k1", "k2"),
+            row("p1", "k1", "sv1", "v1"),
+            row("p1", "k2", "sv1", "v2")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE p=? AND c IN (?, ?) ORDER BY c DESC", "p1", "k1", "k2"),
+            row("p1", "k2", "sv1", "v2"),
+            row("p1", "k1", "sv1", "v1")
+        );
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/TupleTypeTest.java b/test/unit/org/apache/cassandra/cql3/TupleTypeTest.java
new file mode 100644
index 0000000..53e7e71
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/TupleTypeTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class TupleTypeTest extends CQLTester
+{
+    @Test
+    public void testTuplePutAndGet() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, t frozen<tuple<int, text, double>>)");
+
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(3, "foo", 3.4));
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 1, tuple(8, "bar", 0.2));
+        assertAllRows(
+            row(0, tuple(3, "foo", 3.4)),
+            row(1, tuple(8, "bar", 0.2))
+        );
+
+        // nulls
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 2, tuple(5, null, 3.4));
+        assertRows(execute("SELECT * FROM %s WHERE k=?", 2),
+            row(2, tuple(5, null, 3.4))
+        );
+
+        // incomplete tuple
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 3, tuple(5, "bar"));
+        assertRows(execute("SELECT * FROM %s WHERE k=?", 3),
+            row(3, tuple(5, "bar"))
+        );
+    }
+
+    @Test
+    public void testNestedTuple() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, t frozen<tuple<int, tuple<text, double>>>)");
+
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(3, tuple("foo", 3.4)));
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 1, tuple(8, tuple("bar", 0.2)));
+        assertAllRows(
+            row(0, tuple(3, tuple("foo", 3.4))),
+            row(1, tuple(8, tuple("bar", 0.2)))
+        );
+    }
+
+    @Test
+    public void testTupleInPartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (t frozen<tuple<int, text>> PRIMARY KEY)");
+
+        execute("INSERT INTO %s (t) VALUES (?)", tuple(3, "foo"));
+        assertAllRows(row(tuple(3, "foo")));
+    }
+
+    @Test
+    public void testTupleInClusteringKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, t frozen<tuple<int, text>>, PRIMARY KEY (k, t))");
+
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(5, "bar"));
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(3, "foo"));
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(6, "bar"));
+        execute("INSERT INTO %s (k, t) VALUES (?, ?)", 0, tuple(5, "foo"));
+
+        assertAllRows(
+            row(0, tuple(3, "foo")),
+            row(0, tuple(5, "bar")),
+            row(0, tuple(5, "foo")),
+            row(0, tuple(6, "bar"))
+        );
+    }
+
+    @Test
+    public void testInvalidQueries() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, t frozen<tuple<int, text, double>>)");
+
+        assertInvalidSyntax("INSERT INTO %s (k, t) VALUES (0, ())");
+        assertInvalid("INSERT INTO %s (k, t) VALUES (0, (2, 'foo', 3.1, 'bar'))");
+    }
+
+    @Test
+    public void testNonFrozenTuple() throws Throwable
+    {
+        assertInvalid("CREATE TABLE wrong (k int PRIMARY KEY, v tuple<int, text>)");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/TypeTest.java b/test/unit/org/apache/cassandra/cql3/TypeTest.java
index b08ca2c..ec82d41 100644
--- a/test/unit/org/apache/cassandra/cql3/TypeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/TypeTest.java
@@ -17,162 +17,70 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.utils.MD5Digest;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import static org.apache.cassandra.cql3.QueryProcessor.process;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-public class TypeTest
+public class TypeTest extends CQLTester
 {
-    private static final Logger logger = LoggerFactory.getLogger(TypeTest.class);
-    static ClientState clientState;
-    static String keyspace = "cql3_type_test";
-
-    @BeforeClass
-    public static void setUpClass() throws Throwable
-    {
-        SchemaLoader.loadSchema();
-        executeSchemaChange("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
-        clientState = ClientState.forInternalCalls();
-    }
-
-    @AfterClass
-    public static void stopGossiper()
-    {
-        Gossiper.instance.stop();
-    }
-
-    private static void executeSchemaChange(String query) throws Throwable
-    {
-        try
-        {
-            process(String.format(query, keyspace), ConsistencyLevel.ONE);
-        } catch (RuntimeException exc)
-        {
-            throw exc.getCause();
-        }
-    }
-
-    private static UntypedResultSet execute(String query) throws Throwable
-    {
-        try
-        {
-            return processInternal(String.format(query, keyspace));
-        } catch (RuntimeException exc)
-        {
-            if (exc.getCause() != null)
-                throw exc.getCause();
-            throw exc;
-        }
-    }
-
-    private MD5Digest prepare(String query) throws RequestValidationException
-    {
-        ResultMessage.Prepared prepared = QueryProcessor.prepare(String.format(query, keyspace), clientState, false);
-        return prepared.statementId;
-    }
-
-    private UntypedResultSet executePrepared(MD5Digest statementId, QueryOptions options) throws RequestValidationException, RequestExecutionException
-    {
-        CQLStatement statement = QueryProcessor.instance.getPrepared(statementId);
-        ResultMessage message = statement.executeInternal(QueryState.forInternalCalls(), options);
-
-        if (message instanceof ResultMessage.Rows)
-            return new UntypedResultSet(((ResultMessage.Rows)message).result);
-        else
-            return null;
-    }
-
     @Test
     public void testNowToUUIDCompatibility() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.uuid_now (a int, b uuid, PRIMARY KEY (a, b))");
-        String insert = "INSERT INTO %s.uuid_now (a, b) VALUES (0, now())";
-        String select = "SELECT * FROM %s.uuid_now WHERE a=0 AND b < now()";
-        execute(insert);
-        UntypedResultSet results = execute(select);
+        createTable("CREATE TABLE %s (a int, b uuid, PRIMARY KEY (a, b))");
+        execute("INSERT INTO %s (a, b) VALUES (0, now())");
+        UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b < now()");
         assertEquals(1, results.size());
-
-        executePrepared(prepare(insert), QueryOptions.DEFAULT);
-        results = executePrepared(prepare(select), QueryOptions.DEFAULT);
-        assertEquals(2, results.size());
     }
 
     @Test
     public void testDateCompatibility() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.date_compatibility (a int, b timestamp, c bigint, d varint, PRIMARY KEY (a, b, c, d))");
-        String insert = "INSERT INTO %s.date_compatibility (a, b, c, d) VALUES (0, unixTimestampOf(now()), dateOf(now()), dateOf(now()))";
-        String select = "SELECT * FROM %s.date_compatibility WHERE a=0 AND b < unixTimestampOf(now())";
-        execute(insert);
-        UntypedResultSet results = execute(select);
+        createTable("CREATE TABLE %s (a int, b timestamp, c bigint, d varint, PRIMARY KEY (a, b, c, d))");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (0, unixTimestampOf(now()), dateOf(now()), dateOf(now()))");
+        UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b < unixTimestampOf(now())");
         assertEquals(1, results.size());
-
-        executePrepared(prepare(insert), QueryOptions.DEFAULT);
-        results = executePrepared(prepare(select), QueryOptions.DEFAULT);
-        assertEquals(2, results.size());
     }
 
     @Test
     public void testReversedTypeCompatibility() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.uuid_now_reversed (a int, b timeuuid, PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
-        String insert = "INSERT INTO %s.uuid_now_reversed (a, b) VALUES (0, now())";
-        String select = "SELECT * FROM %s.uuid_now_reversed WHERE a=0 AND b < now()";
-        execute(insert);
-        UntypedResultSet results = execute(select);
+        createTable("CREATE TABLE %s (a int, b timeuuid, PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
+        execute("INSERT INTO %s (a, b) VALUES (0, now())");
+        UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b < now()");
         assertEquals(1, results.size());
-
-        executePrepared(prepare(insert), QueryOptions.DEFAULT);
-        results = executePrepared(prepare(select), QueryOptions.DEFAULT);
-        assertEquals(2, results.size());
     }
 
     @Test
     // tests CASSANDRA-7797
     public void testAlterReversedColumn() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.test_alter_reversed (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
-        executeSchemaChange("ALTER TABLE %s.test_alter_reversed ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimestampType)'");
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
+        alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimestampType)'");
     }
 
     @Test
     public void testIncompatibleReversedTypes() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.test_incompatible_reversed (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
         try
         {
-            executeSchemaChange("ALTER TABLE %s.test_incompatible_reversed ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimeUUIDType)'");
+            alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimeUUIDType)'");
             fail("Expected error for ALTER statement");
         }
-        catch (ConfigurationException e) { }
+        catch (RuntimeException e) { }
     }
 
     @Test
     public void testReversedAndNonReversed() throws Throwable
     {
-        executeSchemaChange("CREATE TABLE IF NOT EXISTS %s.test_reversed_and_non_reversed (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b))");
+        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b))");
         try
         {
-            executeSchemaChange("ALTER TABLE %s.test_reversed_and_non_reversed ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.DateType)'");
+            alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.DateType)'");
             fail("Expected error for ALTER statement");
         }
-        catch (ConfigurationException e) { }
+        catch (RuntimeException e) { }
     }
-}
\ No newline at end of file
+}
diff --git a/test/unit/org/apache/cassandra/cql3/UserTypesTest.java b/test/unit/org/apache/cassandra/cql3/UserTypesTest.java
new file mode 100644
index 0000000..184de19
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/UserTypesTest.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class UserTypesTest extends CQLTester
+{
+    @Test
+    public void testInvalidField() throws Throwable
+    {
+        String myType = createType("CREATE TYPE %s (f int)");
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<" + myType + ">)");
+
+        // 's' is not a field of myType
+        assertInvalid("INSERT INTO %s (k, v) VALUES (?, {s : ?})", 0, 1);
+    }
+
+    @Test
+    public void testCassandra8105() throws Throwable
+    {
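+        // Each insert below leaves the nested frozen UDT field of the collection element unset (null); the statement should still be accepted.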
+        String ut1 = createType("CREATE TYPE %s (a int, b int)");
+        String ut2 = createType("CREATE TYPE %s (j frozen<" + KEYSPACE + "." + ut1 + ">, k int)");
+        createTable("CREATE TABLE %s (x int PRIMARY KEY, y set<frozen<" + KEYSPACE + "." + ut2 + ">>)");
+        execute("INSERT INTO %s (x, y) VALUES (1, { { k: 1 } })");
+
+        String ut3 = createType("CREATE TYPE %s (a int, b int)");
+        String ut4 = createType("CREATE TYPE %s (j frozen<" + KEYSPACE + "." + ut3 + ">, k int)");
+        createTable("CREATE TABLE %s (x int PRIMARY KEY, y list<frozen<" + KEYSPACE + "." + ut4 + ">>)");
+        execute("INSERT INTO %s (x, y) VALUES (1, [ { k: 1 } ])");
+
+        String ut5 = createType("CREATE TYPE %s (a int, b int)");
+        String ut6 = createType("CREATE TYPE %s (i int, j frozen<" + KEYSPACE + "." + ut5 + ">)");
+        createTable("CREATE TABLE %s (x int PRIMARY KEY, y set<frozen<" + KEYSPACE + "." + ut6 + ">>)");
+        execute("INSERT INTO %s (x, y) VALUES (1, { { i: 1 } })");
+    }
+
+    @Test
+    public void testFor7684() throws Throwable
+    {
+        String myType = createType("CREATE TYPE %s (x double)");
+        createTable("CREATE TABLE %s (k int, v frozen<" + myType + ">, b boolean static, PRIMARY KEY (k, v))");
+
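+        // The same restricted query is run before and after flush() to cover both the memtable and sstable read paths.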
+        execute("INSERT INTO %s(k, v) VALUES (?, {x:?})", 1, -104.99251);
+        execute("UPDATE %s SET b = ? WHERE k = ?", true, 1);
+
+        assertRows(execute("SELECT v.x FROM %s WHERE k = ? AND v = {x:?}", 1, -104.99251),
+            row(-104.99251)
+        );
+
+        flush();
+
+        assertRows(execute("SELECT v.x FROM %s WHERE k = ? AND v = {x:?}", 1, -104.99251),
+            row(-104.99251)
+        );
+    }
+
+    @Test
+    public void testNonFrozenUDT() throws Throwable
+    {
+        // Using a UDT without frozen shouldn't work
+        String myType = createType("CREATE TYPE %s (f int)");
+        assertInvalid("CREATE TABLE wrong (k int PRIMARY KEY, v " + myType + ")");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java b/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
index 06e2e75..83a58e4a 100644
--- a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
+++ b/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
@@ -1,4 +1,3 @@
-package org.apache.cassandra.db;
 /*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -19,22 +18,20 @@
  * under the License.
  *
  */
-
+package org.apache.cassandra.db;
 
 import java.util.*;
-
 import org.junit.Test;
 
 import static org.junit.Assert.*;
 
-import com.google.common.base.Functions;
-
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.db.marshal.Int32Type;
 
 public class ArrayBackedSortedColumnsTest extends SchemaLoader
 {
@@ -52,16 +49,86 @@
 
     private void testAddInternal(boolean reversed)
     {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
         ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
         int[] values = new int[]{ 1, 2, 2, 3 };
 
         for (int i = 0; i < values.length; ++i)
-            map.addColumn(new Column(ByteBufferUtil.bytes(values[reversed ? values.length - 1 - i : i])), HeapAllocator.instance);
+            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
 
-        Iterator<Column> iter = map.iterator();
-        assertEquals("1st column", 1, iter.next().name().getInt(0));
-        assertEquals("2nd column", 2, iter.next().name().getInt(0));
-        assertEquals("3rd column", 3, iter.next().name().getInt(0));
+        Iterator<Cell> iter = map.iterator();
+        assertEquals("1st column", 1, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("2nd column", 2, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("3rd column", 3, iter.next().name().toByteBuffer().getInt(0));
+    }
+
+    @Test
+    public void testOutOfOrder()
+    {
+        testAddOutOfOrder(false);
+        testAddOutOfOrder(true);
+    }
+
+    private void testAddOutOfOrder(boolean reversed)
+    {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
+
+        int[] values = new int[]{ 1, 2, 1, 3, 4, 4, 5, 5, 1, 2, 6, 6, 6, 1, 2, 3 };
+        for (int i = 0; i < values.length; ++i)
+            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
+
+        assertEquals(6, cells.getColumnCount());
+
+        Iterator<Cell> iter = cells.iterator();
+        assertEquals(1, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(2, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(3, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(4, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(5, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(6, iter.next().name().toByteBuffer().getInt(0));
+
+        // Add more values
+        values = new int[]{ 11, 15, 12, 12, 12, 16, 10, 8, 8, 7, 4, 4, 5 };
+        for (int i = 0; i < values.length; ++i)
+            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
+
+        assertEquals(13, cells.getColumnCount());
+
+        iter = cells.reverseIterator();
+        assertEquals(16, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(15, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(12, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(11, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(10, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(8,  iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(7, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(6, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(5, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(4, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(3, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(2, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals(1, iter.next().name().toByteBuffer().getInt(0));
+    }
+
+    @Test
+    public void testGetColumn()
+    {
+        testGetColumnInternal(true);
+        testGetColumnInternal(false);
+    }
+
+    private void testGetColumnInternal(boolean reversed)
+    {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
+
+        int[] values = new int[]{ -1, 20, 44, 55, 27, 27, 17, 1, 9, 89, 33, 44, 0, 9 };
+        for (int i = 0; i < values.length; ++i)
+            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
+
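+        // Every inserted name (duplicates included) must be retrievable by an exact cell-name lookup.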
+        for (int i : values)
+            assertEquals(i, cells.getColumn(type.makeCellName(i)).name().toByteBuffer().getInt(0));
     }
 
     @Test
@@ -73,6 +140,7 @@
 
     private void testAddAllInternal(boolean reversed)
     {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
         ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
         ColumnFamily map2 = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
 
@@ -80,20 +148,20 @@
         int[] values2 = new int[]{ 2, 4, 5, 6 };
 
         for (int i = 0; i < values1.length; ++i)
-            map.addColumn(new Column(ByteBufferUtil.bytes(values1[reversed ? values1.length - 1 - i : i])), HeapAllocator.instance);
+            map.addColumn(new BufferCell(type.makeCellName(values1[reversed ? values1.length - 1 - i : i])));
 
         for (int i = 0; i < values2.length; ++i)
-            map2.addColumn(new Column(ByteBufferUtil.bytes(values2[reversed ? values2.length - 1 - i : i])), HeapAllocator.instance);
+            map2.addColumn(new BufferCell(type.makeCellName(values2[reversed ? values2.length - 1 - i : i])));
 
-        map2.addAll(map, HeapAllocator.instance, Functions.<Column>identity());
+        map2.addAll(map);
 
-        Iterator<Column> iter = map2.iterator();
-        assertEquals("1st column", 1, iter.next().name().getInt(0));
-        assertEquals("2nd column", 2, iter.next().name().getInt(0));
-        assertEquals("3rd column", 3, iter.next().name().getInt(0));
-        assertEquals("4st column", 4, iter.next().name().getInt(0));
-        assertEquals("5st column", 5, iter.next().name().getInt(0));
-        assertEquals("6st column", 6, iter.next().name().getInt(0));
+        Iterator<Cell> iter = map2.iterator();
+        assertEquals("1st column", 1, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("2nd column", 2, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("3rd column", 3, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("4st column", 4, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("5st column", 5, iter.next().name().toByteBuffer().getInt(0));
+        assertEquals("6st column", 6, iter.next().name().toByteBuffer().getInt(0));
     }
 
     @Test
@@ -105,17 +173,18 @@
 
     private void testGetCollectionInternal(boolean reversed)
     {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
         ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
         int[] values = new int[]{ 1, 2, 3, 5, 9 };
 
-        List<Column> sorted = new ArrayList<Column>();
+        List<Cell> sorted = new ArrayList<>();
         for (int v : values)
-            sorted.add(new Column(ByteBufferUtil.bytes(v)));
-        List<Column> reverseSorted = new ArrayList<Column>(sorted);
+            sorted.add(new BufferCell(type.makeCellName(v)));
+        List<Cell> reverseSorted = new ArrayList<>(sorted);
         Collections.reverse(reverseSorted);
 
         for (int i = 0; i < values.length; ++i)
-            map.addColumn(new Column(ByteBufferUtil.bytes(values[reversed ? values.length - 1 - i : i])), HeapAllocator.instance);
+            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
 
         assertSame(sorted, map.getSortedColumns());
         assertSame(reverseSorted, map.getReverseSortedColumns());
@@ -130,15 +199,16 @@
 
     private void testIteratorInternal(boolean reversed)
     {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
         ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
 
         int[] values = new int[]{ 1, 2, 3, 5, 9 };
 
         for (int i = 0; i < values.length; ++i)
-            map.addColumn(new Column(ByteBufferUtil.bytes(values[reversed ? values.length - 1 - i : i])), HeapAllocator.instance);
+            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
 
-        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(ByteBufferUtil.bytes(3), ByteBufferUtil.EMPTY_BYTE_BUFFER) }));
-        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(ByteBufferUtil.bytes(4), ByteBufferUtil.EMPTY_BYTE_BUFFER) }));
+        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(type.make(3), Composites.EMPTY) }));
+        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(type.make(4), Composites.EMPTY) }));
 
         assertSame(map.iterator(), map.iterator(ColumnSlice.ALL_COLUMNS_ARRAY));
     }
@@ -156,12 +226,12 @@
             fail("The collection don't have the same size");
     }
 
-    private void assertSame(int[] names, Iterator<Column> iter)
+    private void assertSame(int[] names, Iterator<Cell> iter)
     {
         for (int name : names)
         {
             assert iter.hasNext() : "Expected " + name + " but no more result";
-            int value = ByteBufferUtil.toInt(iter.next().name());
+            int value = ByteBufferUtil.toInt(iter.next().name().toByteBuffer());
             assert name == value : "Expected " + name + " but got " + value;
         }
     }
@@ -175,13 +245,15 @@
 
     private void testRemoveInternal(boolean reversed)
     {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
         ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
+
         int[] values = new int[]{ 1, 2, 2, 3 };
 
         for (int i = 0; i < values.length; ++i)
-            map.addColumn(new Column(ByteBufferUtil.bytes(values[reversed ? values.length - 1 - i : i])), HeapAllocator.instance);
+            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
 
-        Iterator<Column> iter = map.getReverseSortedColumns().iterator();
+        Iterator<Cell> iter = map.getReverseSortedColumns().iterator();
         assertTrue(iter.hasNext());
         iter.next();
         iter.remove();
diff --git a/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java b/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java
index 846b008..13b7150 100644
--- a/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java
@@ -31,8 +31,10 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.UUIDGen;
 
@@ -60,15 +62,21 @@
 
         // Generate 1000 mutations and put them all into the batchlog.
         // Half (500) ready to be replayed, half not.
+        CellNameType comparator = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1").metadata.comparator;
         for (int i = 0; i < 1000; i++)
         {
-            RowMutation mutation = new RowMutation("Keyspace1", bytes(i));
-            mutation.add("Standard1", bytes(i), bytes(i), System.currentTimeMillis());
+            Mutation mutation = new Mutation("Keyspace1", bytes(i));
+            mutation.add("Standard1", comparator.makeCellName(bytes(i)), bytes(i), System.currentTimeMillis());
 
             long timestamp = i < 500
                            ? (System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2) * 1000
                            : Long.MAX_VALUE;
-            BatchlogManager.getBatchlogMutationFor(Collections.singleton(mutation), UUIDGen.getTimeUUID(), timestamp).apply();
+
+            BatchlogManager.getBatchlogMutationFor(Collections.singleton(mutation),
+                                                   UUIDGen.getTimeUUID(),
+                                                   MessagingService.current_version,
+                                                   timestamp)
+                           .apply();
         }
 
         // Flush the batchlog to disk (see CASSANDRA-6822).
@@ -86,7 +94,7 @@
 
         for (int i = 0; i < 1000; i++)
         {
-            UntypedResultSet result = QueryProcessor.processInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard1\" WHERE key = intAsBlob(%d)", i));
+            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard1\" WHERE key = intAsBlob(%d)", i));
             if (i < 500)
             {
                 assertEquals(bytes(i), result.one().getBytes("key"));
@@ -100,23 +108,25 @@
         }
 
         // Ensure that no stray mutations got somehow applied.
-        UntypedResultSet result = QueryProcessor.processInternal(String.format("SELECT count(*) FROM \"Keyspace1\".\"Standard1\""));
+        UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT count(*) FROM \"Keyspace1\".\"Standard1\""));
         assertEquals(500, result.one().getLong("count"));
     }
 
     @Test
     public void testTruncatedReplay() throws InterruptedException, ExecutionException
     {
+        CellNameType comparator2 = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard2").metadata.comparator;
+        CellNameType comparator3 = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard3").metadata.comparator;
         // Generate 2000 mutations (1000 batchlog entries) and put them all into the batchlog.
         // Each batchlog entry with a mutation for Standard2 and Standard3.
         // In the middle of the process, 'truncate' Standard2.
         for (int i = 0; i < 1000; i++)
         {
-            RowMutation mutation1 = new RowMutation("Keyspace1", bytes(i));
-            mutation1.add("Standard2", bytes(i), bytes(i), 0);
-            RowMutation mutation2 = new RowMutation("Keyspace1", bytes(i));
-            mutation2.add("Standard3", bytes(i), bytes(i), 0);
-            List<RowMutation> mutations = Lists.newArrayList(mutation1, mutation2);
+            Mutation mutation1 = new Mutation("Keyspace1", bytes(i));
+            mutation1.add("Standard2", comparator2.makeCellName(bytes(i)), bytes(i), 0);
+            Mutation mutation2 = new Mutation("Keyspace1", bytes(i));
+            mutation2.add("Standard3", comparator3.makeCellName(bytes(i)), bytes(i), 0);
+            List<Mutation> mutations = Lists.newArrayList(mutation1, mutation2);
 
             // Make sure it's ready to be replayed, so adjust the timestamp.
             long timestamp = System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2;
@@ -132,7 +142,11 @@
             else
                 timestamp--;
 
-            BatchlogManager.getBatchlogMutationFor(mutations, UUIDGen.getTimeUUID(), timestamp * 1000).apply();
+            BatchlogManager.getBatchlogMutationFor(mutations,
+                                                   UUIDGen.getTimeUUID(),
+                                                   MessagingService.current_version,
+                                                   timestamp * 1000)
+                           .apply();
         }
 
         // Flush the batchlog to disk (see CASSANDRA-6822).
@@ -144,7 +158,7 @@
         // We should see half of Standard2-targeted mutations written after the replay and all of Standard3 mutations applied.
         for (int i = 0; i < 1000; i++)
         {
-            UntypedResultSet result = QueryProcessor.processInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard2\" WHERE key = intAsBlob(%d)", i));
+            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard2\" WHERE key = intAsBlob(%d)", i));
             if (i >= 500)
             {
                 assertEquals(bytes(i), result.one().getBytes("key"));
@@ -159,7 +173,7 @@
 
         for (int i = 0; i < 1000; i++)
         {
-            UntypedResultSet result = QueryProcessor.processInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard3\" WHERE key = intAsBlob(%d)", i));
+            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"Keyspace1\".\"Standard3\" WHERE key = intAsBlob(%d)", i));
             assertEquals(bytes(i), result.one().getBytes("key"));
             assertEquals(bytes(i), result.one().getBytes("column1"));
             assertEquals(bytes(i), result.one().getBytes("value"));
diff --git a/test/unit/org/apache/cassandra/db/CellTest.java b/test/unit/org/apache/cassandra/db/CellTest.java
new file mode 100644
index 0000000..63d6f4c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/CellTest.java
@@ -0,0 +1,97 @@
+package org.apache.cassandra.db;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+import org.apache.cassandra.utils.memory.NativePool;
+
+public class CellTest
+{
+
+    private static final OpOrder order = new OpOrder();
+    private static NativeAllocator allocator = new NativePool(Integer.MAX_VALUE, Integer.MAX_VALUE, 1f, null).newAllocator();
+
+    @Test
+    public void testExpiringCellReconcile()
+    {
+        // equal
+        Assert.assertEquals(0, testExpiring("a", "a", 1, 1, null, null, null, null));
+
+        // newer timestamp
+        Assert.assertEquals(-1, testExpiring("a", "a", 2, 1, null, null, 1L, null));
+        Assert.assertEquals(-1, testExpiring("a", "a", 2, 1, null, "b", 1L, 2));
+
+        // newer TTL
+        Assert.assertEquals(-1, testExpiring("a", "a", 1, 2, null, null, null, 1));
+        Assert.assertEquals(1, testExpiring("a", "a", 1, 2, null, "b", null, 1));
+
+        // newer value
+        Assert.assertEquals(-1, testExpiring("a", "b", 2, 1, null, "a", null, null));
+        Assert.assertEquals(-1, testExpiring("a", "b", 2, 1, null, "a", null, 2));
+    }
+
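+    // Convenience overload: null arguments for the second cell default to the first cell's values,
+    // and the reconciliation result is checked for every buffer/native combination of the two cells.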
+    private int testExpiring(String n1, String v1, long t1, int et1, String n2, String v2, Long t2, Integer et2)
+    {
+        if (n2 == null)
+            n2 = n1;
+        if (v2 == null)
+            v2 = v1;
+        if (t2 == null)
+            t2 = t1;
+        if (et2 == null)
+            et2 = et1;
+        int result = testExpiring(n1, v1, t1, et1, false, n2, v2, t2, et2, false);
+        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, false, n2, v2, t2, et2, true));
+        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, true, n2, v2, t2, et2, false));
+        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, true, n2, v2, t2, et2, true));
+        return result;
+    }
+
+    private int testExpiring(String n1, String v1, long t1, int et1, boolean native1, String n2, String v2, long t2, int et2, boolean native2)
+    {
+        Cell c1 = expiring(n1, v1, t1, et1, native1);
+        Cell c2 = expiring(n2, v2, t2, et2, native2);
+        return reconcile(c1, c2);
+    }
+
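+    // Returns -1 if c1 wins reconciliation in both directions, 1 if c2 wins in both, 0 otherwise.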
+    int reconcile(Cell c1, Cell c2)
+    {
+        if (c1.reconcile(c2) == c1)
+            return c2.reconcile(c1) == c1 ? -1 : 0;
+        return c2.reconcile(c1) == c2 ? 1 : 0;
+    }
+
+    private Cell expiring(String name, String value, long timestamp, int expirationTime, boolean nativeCell)
+    {
+        ExpiringCell cell = new BufferExpiringCell(Util.cellname(name), ByteBufferUtil.bytes(value), timestamp, 1, expirationTime);
+        if (nativeCell)
+            cell = new NativeExpiringCell(allocator, order.getCurrent(), cell);
+        return cell;
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/db/CleanupTest.java b/test/unit/org/apache/cassandra/db/CleanupTest.java
index 9914c90..79a8470 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTest.java
@@ -21,6 +21,7 @@
 import static org.junit.Assert.assertEquals;
 import java.io.IOException;
 import java.net.InetAddress;
+import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.LinkedList;
@@ -31,20 +32,15 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.dht.BytesToken;
-import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CounterId;
 import org.junit.Test;
 
 public class CleanupTest extends SchemaLoader
@@ -62,7 +58,7 @@
     }
 
     @Test
-    public void testCleanup() throws IOException, ExecutionException, InterruptedException, ConfigurationException
+    public void testCleanup() throws ExecutionException, InterruptedException
     {
         StorageService.instance.getTokenMetadata().clearUnsafe();
 
@@ -81,7 +77,7 @@
         assertEquals(LOOPS, rows.size());
 
         // with one token in the ring, owned by the local node, cleanup should be a no-op
-        CompactionManager.instance.performCleanup(cfs, new CounterId.OneShotRenewer());
+        CompactionManager.instance.performCleanup(cfs);
 
         // ensure max timestamp of the sstables are retained post-cleanup
         assert expectedMaxTimestamps.equals(getMaxTimestampList(cfs));
@@ -110,10 +106,9 @@
             Thread.sleep(10);
 
         // verify we get it back w/ index query too
-        IndexExpression expr = new IndexExpression(COLUMN, IndexOperator.EQ, VALUE);
+        IndexExpression expr = new IndexExpression(COLUMN, IndexExpression.Operator.EQ, VALUE);
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
         rows = keyspace.getColumnFamilyStore(CF1).search(range, clause, filter, Integer.MAX_VALUE);
         assertEquals(LOOPS, rows.size());
@@ -127,7 +122,7 @@
         tmd.updateNormalToken(new BytesToken(tk1), InetAddress.getByName("127.0.0.1"));
         tmd.updateNormalToken(new BytesToken(tk2), InetAddress.getByName("127.0.0.2"));
 
-        CompactionManager.instance.performCleanup(cfs, new CounterId.OneShotRenewer());
+        CompactionManager.instance.performCleanup(cfs);
 
         // row data should be gone
         rows = Util.getRangeSlice(cfs);
@@ -141,7 +136,36 @@
         assertEquals(0, rows.size());
     }
 
-    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable) throws ExecutionException, InterruptedException, IOException
+    @Test
+    public void testCleanupWithNewToken() throws ExecutionException, InterruptedException, UnknownHostException
+    {
+        StorageService.instance.getTokenMetadata().clearUnsafe();
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
+
+        List<Row> rows;
+
+        // insert data and verify we get it back w/ range query
+        fillCF(cfs, LOOPS);
+
+        rows = Util.getRangeSlice(cfs);
+
+        assertEquals(LOOPS, rows.size());
+        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+
+        byte[] tk1 = new byte[1], tk2 = new byte[1];
+        tk1[0] = 2;
+        tk2[0] = 1;
+        tmd.updateNormalToken(new BytesToken(tk1), InetAddress.getByName("127.0.0.1"));
+        tmd.updateNormalToken(new BytesToken(tk2), InetAddress.getByName("127.0.0.2"));
+        CompactionManager.instance.performCleanup(cfs);
+
+        rows = Util.getRangeSlice(cfs);
+        assertEquals(0, rows.size());
+    }
+
+    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable)
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -149,9 +173,9 @@
         {
             String key = String.valueOf(i);
             // create a row and update the birthdate value, test that the index query fetches the new version
-            RowMutation rm;
-            rm = new RowMutation(KEYSPACE1, ByteBufferUtil.bytes(key));
-            rm.add(cfs.name, COLUMN, VALUE, System.currentTimeMillis());
+            Mutation rm;
+            rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(key));
+            rm.add(cfs.name, Util.cellname(COLUMN), VALUE, System.currentTimeMillis());
             rm.applyUnsafe();
         }
 
diff --git a/test/unit/org/apache/cassandra/db/CollationControllerTest.java b/test/unit/org/apache/cassandra/db/CollationControllerTest.java
index 721dfae..22c60b8 100644
--- a/test/unit/org/apache/cassandra/db/CollationControllerTest.java
+++ b/test/unit/org/apache/cassandra/db/CollationControllerTest.java
@@ -18,62 +18,57 @@
 */
 package org.apache.cassandra.db;
 
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.concurrent.ExecutionException;
+import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
-import org.junit.Test;
 
-import org.apache.cassandra.io.sstable.SSTableReader;
+import static org.junit.Assert.assertEquals;
 
 public class CollationControllerTest extends SchemaLoader
 {
     @Test
     public void getTopLevelColumnsSkipsSSTablesModifiedBeforeRowDelete() 
-    throws IOException, ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
         
         // add data
-        rm = new RowMutation(keyspace.getName(), dk.key);
-        rm.add(cfs.name, ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm = new Mutation(keyspace.getName(), dk.getKey());
+        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
         cfs.forceBlockingFlush();
         
         // remove
-        rm = new RowMutation(keyspace.getName(), dk.key);
+        rm = new Mutation(keyspace.getName(), dk.getKey());
         rm.delete(cfs.name, 10);
         rm.apply();
         
         // add another mutation because sstable maxtimestamp isn't set
         // correctly during flush if the most recent mutation is a row delete
-        rm = new RowMutation(keyspace.getName(), Util.dk("key2").key);
-        rm.add(cfs.name, ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("zxcv"), 20);
+        rm = new Mutation(keyspace.getName(), Util.dk("key2").getKey());
+        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("zxcv"), 20);
         rm.apply();
         
         cfs.forceBlockingFlush();
 
         // add yet one more mutation
-        rm = new RowMutation(keyspace.getName(), dk.key);
-        rm.add(cfs.name, ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("foobar"), 30);
+        rm = new Mutation(keyspace.getName(), dk.getKey());
+        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("foobar"), 30);
         rm.apply();
         cfs.forceBlockingFlush();
 
         // A NamesQueryFilter goes down one code path (through collectTimeOrderedData())
         // It should only iterate the last flushed sstable, since it probably contains the most recent value for Column1
-        QueryFilter filter = QueryFilter.getNamesFilter(dk, cfs.name, FBUtilities.singleton(ByteBufferUtil.bytes("Column1"), cfs.getComparator()), System.currentTimeMillis());
+        QueryFilter filter = Util.namesQueryFilter(cfs, dk, "Column1");
         CollationController controller = new CollationController(cfs, filter, Integer.MIN_VALUE);
-        controller.getTopLevelColumns();
+        controller.getTopLevelColumns(true);
         assertEquals(1, controller.getSstablesIterated());
 
         // SliceQueryFilter goes down another path (through collectAllData())
@@ -81,30 +76,29 @@
         // recent than the maxTimestamp of the very first sstable we flushed, we should only read the 2 first sstables.
         filter = QueryFilter.getIdentityFilter(dk, cfs.name, System.currentTimeMillis());
         controller = new CollationController(cfs, filter, Integer.MIN_VALUE);
-        controller.getTopLevelColumns();
+        controller.getTopLevelColumns(true);
         assertEquals(2, controller.getSstablesIterated());
     }
 
     @Test
     public void ensureTombstonesAppliedAfterGCGS()
-    throws IOException, ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardGCGS0");
         cfs.disableAutoCompaction();
 
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
-        ByteBuffer cellName = ByteBufferUtil.bytes("Column1");
+        CellName cellName = Util.cellname("Column1");
 
         // add data
-        rm = new RowMutation(keyspace.getName(), dk.key);
+        rm = new Mutation(keyspace.getName(), dk.getKey());
         rm.add(cfs.name, cellName, ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
         // remove
-        rm = new RowMutation(keyspace.getName(), dk.key);
+        rm = new Mutation(keyspace.getName(), dk.getKey());
         rm.delete(cfs.name, cellName, 0);
         rm.apply();
         cfs.forceBlockingFlush();
@@ -116,10 +110,10 @@
 
         filter = QueryFilter.getNamesFilter(dk, cfs.name, FBUtilities.singleton(cellName, cfs.getComparator()), queryAt);
         CollationController controller = new CollationController(cfs, filter, gcBefore);
-        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(), gcBefore) == null;
+        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(true), gcBefore) == null;
 
         filter = QueryFilter.getIdentityFilter(dk, cfs.name, queryAt);
         controller = new CollationController(cfs, filter, gcBefore);
-        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(), gcBefore) == null;
+        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(true), gcBefore) == null;
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
index 611d6af..dda9b65 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
@@ -22,7 +22,20 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.UUID;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
@@ -30,8 +43,6 @@
 import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Test;
@@ -42,27 +53,55 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.IndexType;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.composites.Composites;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.LexicalUUIDType;
 import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.ExcludingBounds;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.IncludingExcludingBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableDeletingTask;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.SSTableSimpleWriter;
+import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.thrift.SlicePredicate;
+import org.apache.cassandra.thrift.SliceRange;
+import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.WrappedRunnable;
 
-import static org.junit.Assert.*;
-import static org.apache.cassandra.Util.*;
+import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.Util.column;
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.Util.rp;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-import static org.apache.commons.lang3.ArrayUtils.EMPTY_BYTE_ARRAY;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class ColumnFamilyStoreTest extends SchemaLoader
@@ -80,40 +119,39 @@
 
     @Test
     // create two sstables, and verify that we only deserialize data from the most recent one
-    public void testTimeSortedQuery() throws IOException, ExecutionException, InterruptedException
+    public void testTimeSortedQuery()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
         cfs.truncateBlocking();
 
-        RowMutation rm;
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key1"));
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        Mutation rm;
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"));
+        rm.add("Standard1", cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key1"));
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 1);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"));
+        rm.add("Standard1", cellname("Column1"), ByteBufferUtil.bytes("asdf"), 1);
         rm.apply();
         cfs.forceBlockingFlush();
 
         cfs.getRecentSSTablesPerReadHistogram(); // resets counts
-        cfs.getColumnFamily(QueryFilter.getNamesFilter(Util.dk("key1"), "Standard1", FBUtilities.singleton(ByteBufferUtil.bytes("Column1"), cfs.getComparator()), System.currentTimeMillis()));
+        cfs.getColumnFamily(Util.namesQueryFilter(cfs, Util.dk("key1"), "Column1"));
         assertEquals(1, cfs.getRecentSSTablesPerReadHistogram()[0]);
     }
 
     @Test
-    public void testGetColumnWithWrongBF() throws IOException, ExecutionException, InterruptedException
+    public void testGetColumnWithWrongBF()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
         cfs.truncateBlocking();
 
-        List<IMutation> rms = new LinkedList<IMutation>();
-        RowMutation rm;
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key1"));
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column2"), ByteBufferUtil.bytes("asdf"), 0);
+        List<Mutation> rms = new LinkedList<>();
+        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"));
+        rm.add("Standard1", cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm.add("Standard1", cellname("Column2"), ByteBufferUtil.bytes("asdf"), 0);
         rms.add(rm);
         Util.writeColumnFamily(rms);
 
@@ -129,9 +167,9 @@
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         final ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-        RowMutation rm;
+        Mutation rm;
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key1"));
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"));
         rm.delete("Standard2", System.currentTimeMillis());
         rm.apply();
 
@@ -139,24 +177,15 @@
         {
             public void runMayThrow() throws IOException
             {
-                QueryFilter sliceFilter = QueryFilter.getSliceFilter(Util.dk("key1"),
-                                                                     "Standard2",
-                                                                     ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                     ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                     false,
-                                                                     1,
-                                                                     System.currentTimeMillis());
+                QueryFilter sliceFilter = QueryFilter.getSliceFilter(Util.dk("key1"), "Standard2", Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
                 ColumnFamily cf = store.getColumnFamily(sliceFilter);
-                assert cf.isMarkedForDelete();
-                assert cf.getColumnCount() == 0;
+                assertTrue(cf.isMarkedForDelete());
+                assertFalse(cf.hasColumns());
 
-                QueryFilter namesFilter = QueryFilter.getNamesFilter(Util.dk("key1"),
-                                                                     "Standard2",
-                                                                     FBUtilities.singleton(ByteBufferUtil.bytes("a"), store.getComparator()),
-                                                                     System.currentTimeMillis());
+                QueryFilter namesFilter = Util.namesQueryFilter(store, Util.dk("key1"), "a");
                 cf = store.getColumnFamily(namesFilter);
-                assert cf.isMarkedForDelete();
-                assert cf.getColumnCount() == 0;
+                assertTrue(cf.isMarkedForDelete());
+                assertFalse(cf.hasColumns());
             }
         };
 
@@ -164,122 +193,122 @@
     }
 
     @Test
-    public void testSkipStartKey() throws IOException, ExecutionException, InterruptedException
+    public void testSkipStartKey()
     {
         ColumnFamilyStore cfs = insertKey1Key2();
 
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner<?> p = StorageService.getPartitioner();
         List<Row> result = cfs.getRangeSlice(Util.range(p, "key1", "key2"),
                                              null,
-                                             new NamesQueryFilter(FBUtilities.singleton(ByteBufferUtil.bytes("asdf"), cfs.getComparator())),
+                                             Util.namesFilter(cfs, "asdf"),
                                              10);
         assertEquals(1, result.size());
-        assert result.get(0).key.key.equals(ByteBufferUtil.bytes("key2"));
+        assert result.get(0).key.getKey().equals(ByteBufferUtil.bytes("key2"));
     }
 
     @Test
-    public void testIndexScan() throws IOException
+    public void testIndexScan()
     {
-        RowMutation rm;
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1");
+        Mutation rm;
+        CellName nobirthdate = cellname("notbirthdate");
+        CellName birthdate = cellname("birthdate");
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(1L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k2"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(2L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k2"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(2L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k3"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k3"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k4aaaa"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(3L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k4aaaa"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(3L), 0);
         rm.apply();
 
-        ColumnFamilyStore store = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1");
-
         // basic single-expression query
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = store.search(range, clause, filter, 100);
+        List<Row> rows = cfs.search(range, clause, filter, 100);
 
         assert rows != null;
         assert rows.size() == 2 : StringUtils.join(rows, ",");
 
-        String key = new String(rows.get(0).key.key.array(),rows.get(0).key.key.position(),rows.get(0).key.key.remaining());
+        String key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
         assert "k1".equals( key ) : key;
 
-        key = new String(rows.get(1).key.key.array(),rows.get(1).key.key.position(),rows.get(1).key.key.remaining());
+        key = new String(rows.get(1).key.getKey().array(), rows.get(1).key.getKey().position(), rows.get(1).key.getKey().remaining());
         assert "k3".equals(key) : key;
 
-        assert ByteBufferUtil.bytes(1L).equals( rows.get(0).cf.getColumn(ByteBufferUtil.bytes("birthdate")).value());
-        assert ByteBufferUtil.bytes(1L).equals( rows.get(1).cf.getColumn(ByteBufferUtil.bytes("birthdate")).value());
+        assert ByteBufferUtil.bytes(1L).equals( rows.get(0).cf.getColumn(birthdate).value());
+        assert ByteBufferUtil.bytes(1L).equals( rows.get(1).cf.getColumn(birthdate).value());
 
         // add a second expression
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexOperator.GTE, ByteBufferUtil.bytes(2L));
+        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexExpression.Operator.GTE, ByteBufferUtil.bytes(2L));
         clause = Arrays.asList(expr, expr2);
-        rows = store.search(range, clause, filter, 100);
+        rows = cfs.search(range, clause, filter, 100);
 
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.key.array(),rows.get(0).key.key.position(),rows.get(0).key.key.remaining());
+        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
         assert "k3".equals( key );
 
         // same query again, but with resultset not including the subordinate expression
-        rows = store.search(range, clause, new NamesQueryFilter(FBUtilities.singleton(ByteBufferUtil.bytes("birthdate"), store.getComparator())), 100);
+        rows = cfs.search(range, clause, Util.namesFilter(cfs, "birthdate"), 100);
 
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.key.array(),rows.get(0).key.key.position(),rows.get(0).key.key.remaining());
+        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
         assert "k3".equals( key );
 
         assert rows.get(0).cf.getColumnCount() == 1 : rows.get(0).cf;
 
         // once more, this time with a slice rowset that needs to be expanded
-        SliceQueryFilter emptyFilter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 0);
-        rows = store.search(range, clause, emptyFilter, 100);
+        SliceQueryFilter emptyFilter = new SliceQueryFilter(Composites.EMPTY, Composites.EMPTY, false, 0);
+        rows = cfs.search(range, clause, emptyFilter, 100);
 
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.key.array(),rows.get(0).key.key.position(),rows.get(0).key.key.remaining());
+        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
         assert "k3".equals( key );
 
-        assert rows.get(0).cf.getColumnCount() == 0;
+        assertFalse(rows.get(0).cf.hasColumns());
 
         // query with index hit but rejected by secondary clause, with a small enough count that just checking count
         // doesn't tell the scan loop that it's done
-        IndexExpression expr3 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(-1L));
+        IndexExpression expr3 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(-1L));
         clause = Arrays.asList(expr, expr3);
-        rows = store.search(range, clause, filter, 100);
+        rows = cfs.search(range, clause, filter, 100);
 
         assert rows.isEmpty();
     }
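The index-scan pattern this test migrates to comes down to three renamed pieces: RowMutation becomes Mutation, cell names are built with cellname() instead of raw ByteBuffers, and the operator enum moves to IndexExpression.Operator. A short sketch of the round trip, assuming the "Keyspace1"/"Indexed1" fixtures and helpers available in this test class:

    // Sketch: write one indexed cell, then query it back through the 2.1 API.
    ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1");
    Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k1"));
    rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
    rm.apply();

    IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"),
                                               IndexExpression.Operator.EQ,
                                               ByteBufferUtil.bytes(1L));
    List<Row> rows = cfs.search(Util.range("", ""), Arrays.asList(expr),
                                new IdentityQueryFilter(), 100);
    String key = ByteBufferUtil.string(rows.get(0).key.getKey()); // DecoratedKey.key is now getKey()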
 
     @Test
-    public void testLargeScan() throws IOException
+    public void testLargeScan()
     {
-        RowMutation rm;
+        Mutation rm;
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1");
         for (int i = 0; i < 100; i++)
         {
-            rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("key" + i));
-            rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(34L), 0);
-            rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes((long) (i % 2)), 0);
+            rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key" + i));
+            rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(34L), 0);
+            rm.add("Indexed1", cellname("notbirthdate"), ByteBufferUtil.bytes((long) (i % 2)), 0);
             rm.applyUnsafe();
         }
 
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(34L));
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(34L));
+        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr, expr2);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
+        List<Row> rows = cfs.search(range, clause, filter, 100);
 
         assert rows != null;
         assert rows.size() == 50 : rows.size();
@@ -294,94 +323,93 @@
     public void testIndexDeletions() throws IOException
     {
         ColumnFamilyStore cfs = Keyspace.open("Keyspace3").getColumnFamilyStore("Indexed1");
-        RowMutation rm;
+        Mutation rm;
 
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
         List<Row> rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        String key = ByteBufferUtil.string(rows.get(0).key.key);
+        String key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
         // delete the column directly
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.delete("Indexed1", ByteBufferUtil.bytes("birthdate"), 1);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.delete("Indexed1", cellname("birthdate"), 1);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.isEmpty();
 
         // verify that it's not being indexed under the deletion column value either
-        Column deletion = rm.getColumnFamilies().iterator().next().iterator().next();
+        Cell deletion = rm.getColumnFamilies().iterator().next().iterator().next();
         ByteBuffer deletionLong = ByteBufferUtil.bytes((long) ByteBufferUtil.toInt(deletion.value()));
-        IndexExpression expr0 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, deletionLong);
+        IndexExpression expr0 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, deletionLong);
         List<IndexExpression> clause0 = Arrays.asList(expr0);
         rows = cfs.search(range, clause0, filter, 100);
         assert rows.isEmpty();
 
         // resurrect w/ a newer timestamp
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 2);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 2);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.key);
+        key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
         // verify that row and delete w/ older timestamp does nothing
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
         rm.delete("Indexed1", 1);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.key);
+        key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
         // similarly, column delete w/ older timestamp should do nothing
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.delete("Indexed1", ByteBufferUtil.bytes("birthdate"), 1);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.delete("Indexed1", cellname("birthdate"), 1);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.key);
+        key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
         // delete the entire row (w/ newer timestamp this time)
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
         rm.delete("Indexed1", 3);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.isEmpty() : StringUtils.join(rows, ",");
 
         // make sure obsolete mutations don't generate an index entry
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 3);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 3);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.isEmpty() : StringUtils.join(rows, ",");
 
         // try insert followed by row delete in the same mutation
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 1);
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 1);
         rm.delete("Indexed1", 2);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.isEmpty() : StringUtils.join(rows, ",");
 
         // try row delete followed by insert in the same mutation
-        rm = new RowMutation("Keyspace3", ByteBufferUtil.bytes("k1"));
+        rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
         rm.delete("Indexed1", 3);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 4);
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 4);
         rm.apply();
         rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.key);
+        key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
     }
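Deletions follow the same shape: the cell being tombstoned is addressed by its CellName, and the index is expected to stop returning the row. A minimal sketch of the delete/requery cycle exercised above, assuming cfs is the "Keyspace3"/"Indexed1" store opened at the top of the test:

    // Sketch: cell-level delete, then confirm the index no longer matches.
    Mutation rm = new Mutation("Keyspace3", ByteBufferUtil.bytes("k1"));
    rm.delete("Indexed1", cellname("birthdate"), 1);
    rm.apply();

    List<IndexExpression> clause = Arrays.asList(
        new IndexExpression(ByteBufferUtil.bytes("birthdate"),
                            IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L)));
    List<Row> rows = cfs.search(Util.range("", ""), clause, new IdentityQueryFilter(), 100);
    assert rows.isEmpty();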
 
@@ -389,37 +417,38 @@
     public void testIndexUpdate() throws IOException
     {
         Keyspace keyspace = Keyspace.open("Keyspace2");
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Indexed1");
+        CellName birthdate = cellname("birthdate");
 
         // create a row and update the birthdate value, test that the index query fetches the new version
-        RowMutation rm;
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 1);
+        Mutation rm;
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 1);
         rm.apply();
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(2L), 2);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(2L), 2);
         rm.apply();
 
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
+        List<Row> rows = cfs.search(range, clause, filter, 100);
         assert rows.size() == 0;
 
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(2L));
+        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(2L));
         clause = Arrays.asList(expr);
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        String key = ByteBufferUtil.string(rows.get(0).key.key);
+        String key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
         // update the birthdate value with an OLDER timestamp, and test that the index ignores this
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(3L), 0);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(3L), 0);
         rm.apply();
 
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        key = ByteBufferUtil.string(rows.get(0).key.key);
+        key = ByteBufferUtil.string(rows.get(0).key.getKey());
         assert "k1".equals( key );
 
     }
@@ -431,12 +460,12 @@
         Keyspace keyspace = Keyspace.open("Keyspace2");
 
         // create a row and update the birthdate value with an expiring column
-        RowMutation rm;
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k100"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
+        Mutation rm;
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k100"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
         rm.apply();
 
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(100L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(100L));
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
         Range<RowPosition> range = Util.range("", "");
@@ -447,32 +476,32 @@
         TimeUnit.SECONDS.sleep(1);
 
         // now overwrite with the same name/value/ttl, but the local expiry time will be different
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k100"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k100"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
         rm.apply();
 
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
         assertEquals(1, rows.size());
 
         // check that modifying the indexed value using the same timestamp behaves as expected
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k101"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(101L), 1, 1000);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k101"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(101L), 1, 1000);
         rm.apply();
 
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(101L));
+        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(101L));
         clause = Arrays.asList(expr);
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
         assertEquals(1, rows.size());
 
         TimeUnit.SECONDS.sleep(1);
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("k101"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(102L), 1, 1000);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("k101"));
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(102L), 1, 1000);
         rm.apply();
         // search for the old value
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
         assertEquals(0, rows.size());
         // and for the new
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(102L));
+        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(102L));
         clause = Arrays.asList(expr);
         rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
         assertEquals(1, rows.size());
@@ -489,16 +518,16 @@
         cfs.truncateBlocking();
 
         ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
-        ByteBuffer colName = ByteBufferUtil.bytes("birthdate");
+        CellName colName = cellname("birthdate");
         ByteBuffer val1 = ByteBufferUtil.bytes(1L);
         ByteBuffer val2 = ByteBufferUtil.bytes(2L);
 
         // create a row and update the "birthdate" value, test that the index query fetches this version
-        RowMutation rm;
-        rm = new RowMutation(keySpace, rowKey);
+        Mutation rm;
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, colName, val1, 0);
         rm.apply();
-        IndexExpression expr = new IndexExpression(colName, IndexOperator.EQ, val1);
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, val1);
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
         Range<RowPosition> range = Util.range("", "");
@@ -509,7 +538,7 @@
         keyspace.getColumnFamilyStore(cfName).forceBlockingFlush();
 
         // now apply another update, but force the index update to be skipped
-        rm = new RowMutation(keySpace, rowKey);
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, colName, val2, 1);
         keyspace.apply(rm, true, false);
 
@@ -520,7 +549,7 @@
         rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
         assertEquals(0, rows.size());
         // now check for the updated value
-        expr = new IndexExpression(colName, IndexOperator.EQ, val2);
+        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, val2);
         clause = Arrays.asList(expr);
         filter = new IdentityQueryFilter();
         range = Util.range("", "");
@@ -529,11 +558,11 @@
 
         // now, reset back to the original value, still skipping the index update, to
         // make sure the value was expunged from the index when it was discovered to be inconsistent
-        rm = new RowMutation(keySpace, rowKey);
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, colName, ByteBufferUtil.bytes(1L), 3);
         keyspace.apply(rm, true, false);
 
-        expr = new IndexExpression(colName, IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         clause = Arrays.asList(expr);
         filter = new IdentityQueryFilter();
         range = Util.range("", "");
@@ -553,24 +582,22 @@
 
         ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
         ByteBuffer clusterKey = ByteBufferUtil.bytes("ck1");
-        ByteBuffer colName = ByteBufferUtil.bytes("col1");
-        CompositeType baseComparator = (CompositeType)cfs.getComparator();
-        CompositeType.Builder builder = baseComparator.builder();
-        builder.add(clusterKey);
-        builder.add(colName);
-        ByteBuffer compositeName = builder.build();
+        ByteBuffer colName = ByteBufferUtil.bytes("col1");
+
+        CellNameType baseComparator = cfs.getComparator();
+        CellName compositeName = baseComparator.makeCellName(clusterKey, colName);
 
         ByteBuffer val1 = ByteBufferUtil.bytes("v1");
         ByteBuffer val2 = ByteBufferUtil.bytes("v2");
 
         // create a row and update the author value
-        RowMutation rm;
-        rm = new RowMutation(keySpace, rowKey);
+        Mutation rm;
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, compositeName, val1, 0);
         rm.apply();
 
         // test that the index query fetches this version
-        IndexExpression expr = new IndexExpression(colName, IndexOperator.EQ, val1);
+        IndexExpression expr = new IndexExpression(colName, IndexExpression.Operator.EQ, val1);
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
         Range<RowPosition> range = Util.range("", "");
@@ -583,7 +610,7 @@
         assertEquals(1, rows.size());
 
         // now apply another update, but force the index update to be skipped
-        rm = new RowMutation(keySpace, rowKey);
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, compositeName, val2, 1);
         keyspace.apply(rm, true, false);
 
@@ -594,7 +621,7 @@
         rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
         assertEquals(0, rows.size());
         // now check for the updated value
-        expr = new IndexExpression(colName, IndexOperator.EQ, val2);
+        expr = new IndexExpression(colName, IndexExpression.Operator.EQ, val2);
         clause = Arrays.asList(expr);
         filter = new IdentityQueryFilter();
         range = Util.range("", "");
@@ -603,11 +630,11 @@
 
         // now, reset back to the original value, still skipping the index update, to
         // make sure the value was expunged from the index when it was discovered to be inconsistent
-        rm = new RowMutation(keySpace, rowKey);
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, compositeName, val1, 2);
         keyspace.apply(rm, true, false);
 
-        expr = new IndexExpression(colName, IndexOperator.EQ, val1);
+        expr = new IndexExpression(colName, IndexExpression.Operator.EQ, val1);
         clause = Arrays.asList(expr);
         filter = new IdentityQueryFilter();
         range = Util.range("", "");
@@ -629,22 +656,20 @@
         ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
         ByteBuffer clusterKey = ByteBufferUtil.bytes("ck1");
         ByteBuffer colName = ByteBufferUtil.bytes("col1");
-        CompositeType baseComparator = (CompositeType)cfs.getComparator();
-        CompositeType.Builder builder = baseComparator.builder();
-        builder.add(clusterKey);
-        builder.add(colName);
-        ByteBuffer compositeName = builder.build();
+
+        CellNameType baseComparator = cfs.getComparator();
+        CellName compositeName = baseComparator.makeCellName(clusterKey, colName);
 
         ByteBuffer val1 = ByteBufferUtil.bytes("v2");
 
         // Insert indexed value.
-        RowMutation rm;
-        rm = new RowMutation(keySpace, rowKey);
+        Mutation rm;
+        rm = new Mutation(keySpace, rowKey);
         rm.add(cfName, compositeName, val1, 0);
         rm.apply();
 
         // Now delete the value and flush too.
-        rm = new RowMutation(keySpace, rowKey);
+        rm = new Mutation(keySpace, rowKey);
         rm.delete(cfName, 1);
         rm.apply();
 
@@ -655,7 +680,7 @@
         // Read the index and we check we do get no value (and no NPE)
         // Note: the index will return the entry because it hasn't been deleted (we
         // haven't read yet nor compacted) but the data read itself will return null
-        IndexExpression expr = new IndexExpression(colName, IndexOperator.EQ, val1);
+        IndexExpression expr = new IndexExpression(colName, IndexExpression.Operator.EQ, val1);
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
         Range<RowPosition> range = Util.range("", "");
@@ -665,61 +690,64 @@
 
     // See CASSANDRA-2628
     @Test
-    public void testIndexScanWithLimitOne() throws IOException
+    public void testIndexScanWithLimitOne()
     {
-        RowMutation rm;
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1");
+        Mutation rm;
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("kk1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        CellName nobirthdate = cellname("notbirthdate");
+        CellName birthdate = cellname("birthdate");
+
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("kk1"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(1L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("kk2"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("kk2"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("kk3"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("kk3"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("kk4"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("notbirthdate"), ByteBufferUtil.bytes(2L), 0);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 0);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("kk4"));
+        rm.add("Indexed1", nobirthdate, ByteBufferUtil.bytes(2L), 0);
+        rm.add("Indexed1", birthdate, ByteBufferUtil.bytes(1L), 0);
         rm.apply();
 
         // basic single-expression query
-        IndexExpression expr1 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexOperator.GT, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(new IndexExpression[]{ expr1, expr2 });
+        IndexExpression expr1 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), IndexExpression.Operator.GT, ByteBufferUtil.bytes(1L));
+        List<IndexExpression> clause = Arrays.asList(expr1, expr2);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = Keyspace.open("Keyspace1").getColumnFamilyStore("Indexed1").search(range, clause, filter, 1);
+        List<Row> rows = cfs.search(range, clause, filter, 1);
 
         assert rows != null;
         assert rows.size() == 1 : StringUtils.join(rows, ",");
     }
 
     @Test
-    public void testIndexCreate() throws IOException, ConfigurationException, InterruptedException, ExecutionException
+    public void testIndexCreate() throws IOException, InterruptedException, ExecutionException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Indexed2");
 
         // create a row and update the birthdate value, test that the index query fetches the new version
-        RowMutation rm;
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed2", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), 1);
+        Mutation rm;
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed2", cellname("birthdate"), ByteBufferUtil.bytes(1L), 1);
         rm.apply();
 
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Indexed2");
         ColumnDefinition old = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("birthdate"));
-        ColumnDefinition cd = ColumnDefinition.regularDef(old.name, old.getValidator(), null).setIndex("birthdate_index", IndexType.KEYS, null);
+        ColumnDefinition cd = ColumnDefinition.regularDef(cfs.metadata, old.name.bytes, old.type, null).setIndex("birthdate_index", IndexType.KEYS, null);
         Future<?> future = cfs.indexManager.addIndexedColumn(cd);
         future.get();
         // we had a bug (CASSANDRA-2244) where index would get created but not flushed -- check for that
-        assert cfs.indexManager.getIndexForColumn(cd.name).getIndexCfs().getSSTables().size() > 0;
+        assert cfs.indexManager.getIndexForColumn(cd.name.bytes).getIndexCfs().getSSTables().size() > 0;
 
         queryBirthdate(keyspace);
 
@@ -736,13 +764,12 @@
 
     private void queryBirthdate(Keyspace keyspace) throws CharacterCodingException
     {
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr);
         IDiskAtomFilter filter = new IdentityQueryFilter();
-        IPartitioner p = StorageService.getPartitioner();
         List<Row> rows = keyspace.getColumnFamilyStore("Indexed2").search(Util.range("", ""), clause, filter, 100);
         assert rows.size() == 1 : StringUtils.join(rows, ",");
-        assertEquals("k1", ByteBufferUtil.string(rows.get(0).key.key));
+        assertEquals("k1", ByteBufferUtil.string(rows.get(0).key.getKey()));
     }
 
     @Test
@@ -754,53 +781,52 @@
 
         // insert two columns that represent the same integer but have different binary forms (the
         // second one is padded with extra zeros)
-        RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k1"));
-        byte[] column1 = {1};
-        rm.add(cfname, ByteBuffer.wrap(column1), ByteBufferUtil.bytes("data1"), 1);
+        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k1"));
+        CellName column1 = cellname(ByteBuffer.wrap(new byte[]{1}));
+        rm.add(cfname, column1, ByteBufferUtil.bytes("data1"), 1);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("k1"));
-        byte[] column2 = {0, 0, 1};
-        rm.add(cfname, ByteBuffer.wrap(column2), ByteBufferUtil.bytes("data2"), 2);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k1"));
+        CellName column2 = cellname(ByteBuffer.wrap(new byte[]{0, 0, 1}));
+        rm.add(cfname, column2, ByteBufferUtil.bytes("data2"), 2);
         rm.apply();
         cfs.forceBlockingFlush();
 
         // fetch by the first column name; we should get the second version of the column value
         SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(
             "Keyspace1", ByteBufferUtil.bytes("k1"), cfname, System.currentTimeMillis(),
-            new NamesQueryFilter(FBUtilities.singleton(ByteBuffer.wrap(column1), cfs.getComparator())));
+            new NamesQueryFilter(FBUtilities.singleton(column1, cfs.getComparator())));
 
         ColumnFamily cf = cmd.getRow(keyspace).cf;
         assertEquals(1, cf.getColumnCount());
-        Column column = cf.getColumn(ByteBuffer.wrap(column1));
-        assertEquals("data2", ByteBufferUtil.string(column.value()));
-        assertEquals(ByteBuffer.wrap(column2), column.name());
+        Cell cell = cf.getColumn(column1);
+        assertEquals("data2", ByteBufferUtil.string(cell.value()));
+        assertEquals(column2, cell.name());
 
         // fetch by the second column name; we should get the second version of the column value
         cmd = new SliceByNamesReadCommand(
             "Keyspace1", ByteBufferUtil.bytes("k1"), cfname, System.currentTimeMillis(),
-            new NamesQueryFilter(FBUtilities.singleton(ByteBuffer.wrap(column2), cfs.getComparator())));
+            new NamesQueryFilter(FBUtilities.singleton(column2, cfs.getComparator())));
 
         cf = cmd.getRow(keyspace).cf;
         assertEquals(1, cf.getColumnCount());
-        column = cf.getColumn(ByteBuffer.wrap(column2));
-        assertEquals("data2", ByteBufferUtil.string(column.value()));
-        assertEquals(ByteBuffer.wrap(column2), column.name());
+        cell = cf.getColumn(column2);
+        assertEquals("data2", ByteBufferUtil.string(cell.value()));
+        assertEquals(column2, cell.name());
     }
 
     @Test
-    public void testInclusiveBounds() throws IOException, ExecutionException, InterruptedException
+    public void testInclusiveBounds()
     {
         ColumnFamilyStore cfs = insertKey1Key2();
 
-        IPartitioner p = StorageService.getPartitioner();
         List<Row> result = cfs.getRangeSlice(Util.bounds("key1", "key2"),
                                              null,
-                                             new NamesQueryFilter(FBUtilities.singleton(ByteBufferUtil.bytes("asdf"), cfs.getComparator())),
+                                             Util.namesFilter(cfs, "asdf"),
                                              10);
         assertEquals(2, result.size());
-        assert result.get(0).key.key.equals(ByteBufferUtil.bytes("key1"));
+        assert result.get(0).key.getKey().equals(ByteBufferUtil.bytes("key1"));
     }
 
     @Test
@@ -815,16 +841,16 @@
 
         // create an isolated sstable.
         putColsSuper(cfs, key, scfName,
-                new Column(getBytes(1L), ByteBufferUtil.bytes("val1"), 1),
-                new Column(getBytes(2L), ByteBufferUtil.bytes("val2"), 1),
-                new Column(getBytes(3L), ByteBufferUtil.bytes("val3"), 1));
+                new BufferCell(cellname(1L), ByteBufferUtil.bytes("val1"), 1),
+                new BufferCell(cellname(2L), ByteBufferUtil.bytes("val2"), 1),
+                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 1));
         cfs.forceBlockingFlush();
 
         // insert, don't flush.
         putColsSuper(cfs, key, scfName,
-                new Column(getBytes(4L), ByteBufferUtil.bytes("val4"), 1),
-                new Column(getBytes(5L), ByteBufferUtil.bytes("val5"), 1),
-                new Column(getBytes(6L), ByteBufferUtil.bytes("val6"), 1));
+                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1),
+                new BufferCell(cellname(5L), ByteBufferUtil.bytes("val5"), 1),
+                new BufferCell(cellname(6L), ByteBufferUtil.bytes("val6"), 1));
 
         // verify insert.
         final SlicePredicate sp = new SlicePredicate();
@@ -833,39 +859,39 @@
         sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
         sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
 
-        assertRowAndColCount(1, 6, scfName, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+        assertRowAndColCount(1, 6, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
 
         // delete
-        RowMutation rm = new RowMutation(keyspace.getName(), key.key);
+        Mutation rm = new Mutation(keyspace.getName(), key.getKey());
         rm.deleteRange(cfName, SuperColumns.startOf(scfName), SuperColumns.endOf(scfName), 2);
         rm.apply();
 
         // verify delete.
-        assertRowAndColCount(1, 0, scfName, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
 
         // flush
         cfs.forceBlockingFlush();
 
         // re-verify delete.
-        assertRowAndColCount(1, 0, scfName, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
 
         // late insert.
         putColsSuper(cfs, key, scfName,
-                new Column(getBytes(4L), ByteBufferUtil.bytes("val4"), 1L),
-                new Column(getBytes(7L), ByteBufferUtil.bytes("val7"), 1L));
+                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1L),
+                new BufferCell(cellname(7L), ByteBufferUtil.bytes("val7"), 1L));
 
         // re-verify delete.
-        assertRowAndColCount(1, 0, scfName, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
 
         // make sure new writes are recognized.
         putColsSuper(cfs, key, scfName,
-                new Column(getBytes(3L), ByteBufferUtil.bytes("val3"), 3),
-                new Column(getBytes(8L), ByteBufferUtil.bytes("val8"), 3),
-                new Column(getBytes(9L), ByteBufferUtil.bytes("val9"), 3));
-        assertRowAndColCount(1, 3, scfName, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 3),
+                new BufferCell(cellname(8L), ByteBufferUtil.bytes("val8"), 3),
+                new BufferCell(cellname(9L), ByteBufferUtil.bytes("val9"), 3));
+        assertRowAndColCount(1, 3, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
     }
 
-    private static void assertRowAndColCount(int rowCount, int colCount, ByteBuffer sc, boolean isDeleted, Collection<Row> rows) throws CharacterCodingException
+    private static void assertRowAndColCount(int rowCount, int colCount, boolean isDeleted, Collection<Row> rows) throws CharacterCodingException
     {
         assert rows.size() == rowCount : "rowcount " + rows.size();
         for (Row row : rows)
@@ -880,26 +906,26 @@
     private static String str(ColumnFamily cf) throws CharacterCodingException
     {
         StringBuilder sb = new StringBuilder();
-        for (Column col : cf.getSortedColumns())
-            sb.append(String.format("(%s,%s,%d),", ByteBufferUtil.string(col.name()), ByteBufferUtil.string(col.value()), col.timestamp()));
+        for (Cell col : cf.getSortedColumns())
+            sb.append(String.format("(%s,%s,%d),", ByteBufferUtil.string(col.name().toByteBuffer()), ByteBufferUtil.string(col.value()), col.timestamp()));
         return sb.toString();
     }
 
-    private static void putColsSuper(ColumnFamilyStore cfs, DecoratedKey key, ByteBuffer scfName, Column... cols) throws Throwable
+    private static void putColsSuper(ColumnFamilyStore cfs, DecoratedKey key, ByteBuffer scfName, Cell... cols) throws Throwable
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
-        for (Column col : cols)
-            cf.addColumn(col.withUpdatedName(CompositeType.build(scfName, col.name())));
-        RowMutation rm = new RowMutation(cfs.keyspace.getName(), key.key, cf);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
+        for (Cell col : cols)
+            cf.addColumn(col.withUpdatedName(CellNames.compositeDense(scfName, col.name().toByteBuffer())));
+        Mutation rm = new Mutation(cfs.keyspace.getName(), key.getKey(), cf);
         rm.apply();
     }
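Where the old helpers built composite names with CompositeType.Builder, the 2.1 code either asks the table's CellNameType for a cell name or, for the dense super-column layout these helpers emulate, composes one directly. A sketch of the two replacements seen in these hunks, assuming clusterKey, colName and scfName are ByteBuffers as in the tests above:

    // Sketch: 2.1 replacements for CompositeType.Builder.
    CellNameType type = cfs.getComparator();
    CellName regular = type.makeCellName(clusterKey, colName);       // clustered (CQL3-style) cell
    CellName dense   = CellNames.compositeDense(scfName, colName);   // dense/super-column cell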
 
-    private static void putColsStandard(ColumnFamilyStore cfs, DecoratedKey key, Column... cols) throws Throwable
+    private static void putColsStandard(ColumnFamilyStore cfs, DecoratedKey key, Cell... cols) throws Throwable
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
-        for (Column col : cols)
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
+        for (Cell col : cols)
             cf.addColumn(col);
-        RowMutation rm = new RowMutation(cfs.keyspace.getName(), key.key, cf);
+        Mutation rm = new Mutation(cfs.keyspace.getName(), key.getKey(), cf);
         rm.apply();
     }
 
@@ -921,56 +947,57 @@
 
         // insert
         putColsStandard(cfs, key, column("col1", "val1", 1), column("col2", "val2", 1));
-        assertRowAndColCount(1, 2, null, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 2, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // flush.
         cfs.forceBlockingFlush();
 
         // insert, don't flush
         putColsStandard(cfs, key, column("col3", "val3", 1), column("col4", "val4", 1));
-        assertRowAndColCount(1, 4, null, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 4, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // delete (from sstable and memtable)
-        RowMutation rm = new RowMutation(keyspace.getName(), key.key);
+        Mutation rm = new Mutation(keyspace.getName(), key.getKey());
         rm.delete(cfs.name, 2);
         rm.apply();
 
         // verify delete
-        assertRowAndColCount(1, 0, null, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // flush
         cfs.forceBlockingFlush();
 
         // re-verify delete. // first breakage is right here because of CASSANDRA-1837.
-        assertRowAndColCount(1, 0, null, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // simulate a 'late' insertion that gets put in after the deletion. should get inserted, but fail on read.
         putColsStandard(cfs, key, column("col5", "val5", 1), column("col2", "val2", 1));
 
         // should still be nothing there because we deleted this row. 2nd breakage, but was undetected because of 1837.
-        assertRowAndColCount(1, 0, null, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // make sure that new writes are recognized.
         putColsStandard(cfs, key, column("col6", "val6", 3), column("col7", "val7", 3));
-        assertRowAndColCount(1, 2, null, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 2, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
 
         // and it remains so after flush. (this wasn't failing before, but it's good to check.)
         cfs.forceBlockingFlush();
-        assertRowAndColCount(1, 2, null, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRowAndColCount(1, 2, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
     }
 
 
-    private ColumnFamilyStore insertKey1Key2() throws IOException, ExecutionException, InterruptedException
+    private ColumnFamilyStore insertKey1Key2()
     {
-        List<IMutation> rms = new LinkedList<IMutation>();
-        RowMutation rm;
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("key1"));
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace2").getColumnFamilyStore("Standard1");
+        List<Mutation> rms = new LinkedList<>();
+        Mutation rm;
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("key1"));
+        rm.add("Standard1", cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rms.add(rm);
         Util.writeColumnFamily(rms);
 
-        rm = new RowMutation("Keyspace2", ByteBufferUtil.bytes("key2"));
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm = new Mutation("Keyspace2", ByteBufferUtil.bytes("key2"));
+        rm.add("Standard1", cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rms.add(rm);
         return Util.writeColumnFamily(rms);
     }
@@ -982,8 +1009,8 @@
 
         for (int version = 1; version <= 2; ++version)
         {
-            Descriptor existing = new Descriptor(cfs.directories.getDirectoryForNewSSTables(), "Keyspace2", "Standard1", version, false);
-            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), "Keyspace2", "Standard1", version, false);
+            Descriptor existing = new Descriptor(cfs.directories.getDirectoryForNewSSTables(), "Keyspace2", "Standard1", version, Descriptor.Type.FINAL);
+            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), "Keyspace2", "Standard1", version, Descriptor.Type.FINAL);
             for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.STATS })
                 assertTrue("can not find backedup file:" + desc.filenameFor(c), new File(desc.filenameFor(c)).exists());
         }
@@ -1001,39 +1028,39 @@
         DecoratedKey key = Util.dk("slice-get-uuid-type");
 
         // Insert a row with one supercolumn and multiple subcolumns
-        putColsSuper(cfs, key, superColName, new Column(ByteBufferUtil.bytes("a"), ByteBufferUtil.bytes("A"), 1),
-                                             new Column(ByteBufferUtil.bytes("b"), ByteBufferUtil.bytes("B"), 1));
+        putColsSuper(cfs, key, superColName, new BufferCell(cellname("a"), ByteBufferUtil.bytes("A"), 1),
+                                             new BufferCell(cellname("b"), ByteBufferUtil.bytes("B"), 1));
 
         // Get the entire supercolumn like normal
         ColumnFamily cfGet = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertEquals(ByteBufferUtil.bytes("A"), cfGet.getColumn(CompositeType.build(superColName, ByteBufferUtil.bytes("a"))).value());
-        assertEquals(ByteBufferUtil.bytes("B"), cfGet.getColumn(CompositeType.build(superColName, ByteBufferUtil.bytes("b"))).value());
+        assertEquals(ByteBufferUtil.bytes("A"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
+        assertEquals(ByteBufferUtil.bytes("B"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
 
         // Now do the SliceByNamesCommand on the supercolumn, passing both subcolumns in as columns to get
-        SortedSet<ByteBuffer> sliceColNames = new TreeSet<ByteBuffer>(cfs.metadata.comparator);
-        sliceColNames.add(CompositeType.build(superColName, ByteBufferUtil.bytes("a")));
-        sliceColNames.add(CompositeType.build(superColName, ByteBufferUtil.bytes("b")));
-        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.key, cfName, System.currentTimeMillis(), new NamesQueryFilter(sliceColNames));
+        SortedSet<CellName> sliceColNames = new TreeSet<CellName>(cfs.metadata.comparator);
+        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a")));
+        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b")));
+        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName, System.currentTimeMillis(), new NamesQueryFilter(sliceColNames));
         ColumnFamily cfSliced = cmd.getRow(keyspace).cf;
 
         // Make sure the slice returns the same as the straight get
-        assertEquals(ByteBufferUtil.bytes("A"), cfSliced.getColumn(CompositeType.build(superColName, ByteBufferUtil.bytes("a"))).value());
-        assertEquals(ByteBufferUtil.bytes("B"), cfSliced.getColumn(CompositeType.build(superColName, ByteBufferUtil.bytes("b"))).value());
+        assertEquals(ByteBufferUtil.bytes("A"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
+        assertEquals(ByteBufferUtil.bytes("B"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
     }
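By-names reads now take a set of CellNames ordered by the table comparator rather than raw ByteBuffers. A minimal sketch of the command built above, assuming the keyspaceName/cfName/key/superColName fixtures of this test:

    // Sketch: by-names read over composite cell names in 2.1.
    SortedSet<CellName> names = new TreeSet<CellName>(cfs.metadata.comparator);
    names.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a")));
    SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName,
                                                              System.currentTimeMillis(),
                                                              new NamesQueryFilter(names));
    ColumnFamily cf = cmd.getRow(keyspace).cf;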
 
     @Test
-    public void testSliceByNamesCommandOldMetatada() throws Throwable
+    public void testSliceByNamesCommandOldMetadata() throws Throwable
     {
         String keyspaceName = "Keyspace1";
         String cfName= "Standard1";
         DecoratedKey key = Util.dk("slice-name-old-metadata");
-        ByteBuffer cname = ByteBufferUtil.bytes("c1");
+        CellName cname = cellname("c1");
         Keyspace keyspace = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
         cfs.clearUnsafe();
 
-        // Create a column a 'high timestamp'
-        putColsStandard(cfs, key, new Column(cname, ByteBufferUtil.bytes("a"), 2));
+        // Create a cell with a 'high timestamp'
+        putColsStandard(cfs, key, new BufferCell(cname, ByteBufferUtil.bytes("a"), 2));
         cfs.forceBlockingFlush();
 
         // Nuke the metadata and reload that sstable
@@ -1042,20 +1069,22 @@
         cfs.clearUnsafe();
         assertEquals(0, cfs.getSSTables().size());
 
-        new File(ssTables.iterator().next().descriptor.filenameFor(SSTable.COMPONENT_STATS)).delete();
+        new File(ssTables.iterator().next().descriptor.filenameFor(Component.STATS)).delete();
         cfs.loadNewSSTables();
 
-        // Add another column with a lower timestamp
-        putColsStandard(cfs, key, new Column(cname, ByteBufferUtil.bytes("b"), 1));
+        // Add another cell with a lower timestamp
+        putColsStandard(cfs, key, new BufferCell(cname, ByteBufferUtil.bytes("b"), 1));
 
-        // Test fetching the column by name returns the first column
-        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.key, cfName, System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(cname, cfs.getComparator())));
+        // Test that fetching the cell by name returns the first cell
+        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName, System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(cname, cfs.getComparator())));
         ColumnFamily cf = cmd.getRow(keyspace).cf;
-        Column column = (Column) cf.getColumn(cname);
-        assert column.value().equals(ByteBufferUtil.bytes("a")) : "expecting a, got " + ByteBufferUtil.string(column.value());
+        Cell cell = cf.getColumn(cname);
+        assert cell.value().equals(ByteBufferUtil.bytes("a")) : "expecting a, got " + ByteBufferUtil.string(cell.value());
+
+        Keyspace.clear("Keyspace1"); // CASSANDRA-7195
     }
 
-    private static void assertTotalColCount(Collection<Row> rows, int expectedCount) throws CharacterCodingException
+    private static void assertTotalColCount(Collection<Row> rows, int expectedCount)
     {
         int columns = 0;
         for (Row row : rows)
@@ -1075,7 +1104,7 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
         cfs.clearUnsafe();
 
-        Column[] cols = new Column[5];
+        Cell[] cols = new Cell[5];
         for (int i = 0; i < 5; i++)
             cols[i] = column("c" + i, "value", 1);
 
@@ -1191,7 +1220,7 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
         cfs.clearUnsafe();
 
-        Column[] cols = new Column[4];
+        Cell[] cols = new Cell[4];
         for (int i = 0; i < 4; i++)
             cols[i] = column("c" + i, "value", 1);
 
@@ -1249,11 +1278,11 @@
         assertColumnNames(row2, "c0", "c1");
 
         // Paging within bounds
-        SliceQueryFilter sf = new SliceQueryFilter(ByteBufferUtil.bytes("c1"),
-                                                   ByteBufferUtil.bytes("c2"),
+        SliceQueryFilter sf = new SliceQueryFilter(cellname("c1"),
+                                                   cellname("c2"),
                                                    false,
                                                    0);
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(ka, kc), sf, ByteBufferUtil.bytes("c2"), ByteBufferUtil.bytes("c1"), null, 2, true, System.currentTimeMillis()));
+        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(ka, kc), sf, cellname("c2"), cellname("c1"), null, 2, true, System.currentTimeMillis()));
         assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
         iter = rows.iterator();
         row1 = iter.next();
@@ -1261,7 +1290,7 @@
         assertColumnNames(row1, "c2");
         assertColumnNames(row2, "c1");
 
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(kb, kc), sf, ByteBufferUtil.bytes("c1"), ByteBufferUtil.bytes("c1"), null, 10, true, System.currentTimeMillis()));
+        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(kb, kc), sf, cellname("c1"), cellname("c1"), null, 10, true, System.currentTimeMillis()));
         assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
         iter = rows.iterator();
         row1 = iter.next();
@@ -1278,12 +1307,12 @@
             for (Row row : rows)
             {
                 sb.append("{");
-                sb.append(ByteBufferUtil.string(row.key.key));
+                sb.append(ByteBufferUtil.string(row.key.getKey()));
                 sb.append(":");
                 if (row.cf != null && !row.cf.isEmpty())
                 {
-                    for (Column c : row.cf)
-                        sb.append(" ").append(ByteBufferUtil.string(c.name()));
+                    for (Cell c : row.cf)
+                        sb.append(" ").append(row.cf.getComparator().getString(c.name()));
                 }
                 sb.append("} ");
             }
@@ -1300,15 +1329,15 @@
         if (row == null || row.cf == null)
             throw new AssertionError("The row should not be empty");
 
-        Iterator<Column> columns = row.cf.getSortedColumns().iterator();
+        Iterator<Cell> columns = row.cf.getSortedColumns().iterator();
         Iterator<String> names = Arrays.asList(columnNames).iterator();
 
         while (columns.hasNext())
         {
-            Column c = columns.next();
-            assert names.hasNext() : "Got more columns that expected (first unexpected column: " + ByteBufferUtil.string(c.name()) + ")";
+            Cell c = columns.next();
+            assert names.hasNext() : "Got more columns than expected (first unexpected column: " + ByteBufferUtil.string(c.name().toByteBuffer()) + ")";
             String n = names.next();
-            assert c.name().equals(ByteBufferUtil.bytes(n)) : "Expected " + n + ", got " + ByteBufferUtil.string(c.name());
+            assert c.name().toByteBuffer().equals(ByteBufferUtil.bytes(n)) : "Expected " + n + ", got " + ByteBufferUtil.string(c.name().toByteBuffer());
         }
         assert !names.hasNext() : "Missing expected column " + names.next();
     }
@@ -1327,7 +1356,7 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
         cfs.clearUnsafe();
 
-        Column[] cols = new Column[5];
+        Cell[] cols = new Cell[5];
         for (int i = 0; i < 5; i++)
             cols[i] = column("c" + i, "value", 1);
 
@@ -1383,28 +1412,20 @@
         for (int i = 0; i < 10; i++)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k" + i));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), LongType.instance.decompose(1L), System.currentTimeMillis());
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Indexed1", cellname("birthdate"), LongType.instance.decompose(1L), System.currentTimeMillis());
             rm.apply();
         }
 
         store.forceBlockingFlush();
 
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, LongType.instance.decompose(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, LongType.instance.decompose(1L));
         // explicitly tell to the KeysSearcher to use column limiting for rowsPerQuery to trigger bogus columnsRead--; (CASSANDRA-3996)
         List<Row> rows = store.search(store.makeExtendedFilter(Util.range("", ""), new IdentityQueryFilter(), Arrays.asList(expr), 10, true, false, System.currentTimeMillis()));
 
         assert rows.size() == 10;
     }
 
-    private static String keys(List<Row> rows) throws Throwable
-    {
-        String k = "";
-        for (Row r : rows)
-            k += " " + ByteBufferUtil.string(r.key.key);
-        return k;
-    }
-
     @SuppressWarnings("unchecked")
     @Test
     public void testMultiRangeSomeEmptyNoIndex() throws Throwable
@@ -1412,18 +1433,18 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colA")),
-                new ColumnSlice(bytes("colC"), bytes("colE")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colI"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colA")),
+                new ColumnSlice(cellname("colC"), cellname("colE")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
 
         ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colI")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colE"), bytes("colC")),
-                new ColumnSlice(bytes("colA"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colI")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colE"), cellname("colC")),
+                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
 
         String tableName = "Keyspace1";
         String cfName = "Standard1";
@@ -1432,10 +1453,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "i" };
-        Column[] cols = new Column[letters.length];
+        Cell[] cols = new Cell[letters.length];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i].toUpperCase()),
+            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
                     ByteBuffer.wrap(new byte[1]), 1);
         }
 
@@ -1461,18 +1482,18 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colA")),
-                new ColumnSlice(bytes("colC"), bytes("colE")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colI"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colA")),
+                new ColumnSlice(cellname("colC"), cellname("colE")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
 
         ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colI")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colE"), bytes("colC")),
-                new ColumnSlice(bytes("colA"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY,  cellname("colI")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colE"), cellname("colC")),
+                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
 
         String tableName = "Keyspace1";
         String cfName = "Standard1";
@@ -1481,10 +1502,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "i" };
-        Column[] cols = new Column[letters.length];
+        Cell[] cols = new Cell[letters.length];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i].toUpperCase()),
+            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
                     ByteBuffer.wrap(new byte[1366]), 1);
         }
 
@@ -1510,18 +1531,18 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colA")),
-                new ColumnSlice(bytes("colC"), bytes("colE")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colI"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colA")),
+                new ColumnSlice(cellname("colC"), cellname("colE")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
 
         ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colI")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colE"), bytes("colC")),
-                new ColumnSlice(bytes("colA"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colI")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colE"), cellname("colC")),
+                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
 
         String tableName = "Keyspace1";
         String cfName = "Standard1";
@@ -1530,10 +1551,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Column[] cols = new Column[letters.length];
+        Cell[] cols = new Cell[letters.length];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i].toUpperCase()),
+            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
                     ByteBuffer.wrap(new byte[1]), 1);
         }
 
@@ -1560,18 +1581,18 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colA")),
-                new ColumnSlice(bytes("colC"), bytes("colE")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colI"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colA")),
+                new ColumnSlice(cellname("colC"), cellname("colE")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
 
         ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colI")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colF"), bytes("colF")),
-                new ColumnSlice(bytes("colE"), bytes("colC")),
-                new ColumnSlice(bytes("colA"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colI")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colF"), cellname("colF")),
+                new ColumnSlice(cellname("colE"), cellname("colC")),
+                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
 
         String tableName = "Keyspace1";
         String cfName = "Standard1";
@@ -1580,10 +1601,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Column[] cols = new Column[letters.length];
+        Cell[] cols = new Cell[letters.length];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i].toUpperCase()),
+            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
                     ByteBuffer.wrap(new byte[1366]), 1);
         }
 
@@ -1610,16 +1631,16 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colA")),
-                new ColumnSlice(bytes("colC"), bytes("colE")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colI"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colA")),
+                new ColumnSlice(cellname("colC"), cellname("colE")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
 
         ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colI")),
-                new ColumnSlice(bytes("colG"), bytes("colG")),
-                new ColumnSlice(bytes("colE"), bytes("colC")),
-                new ColumnSlice(bytes("colA"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colI")),
+                new ColumnSlice(cellname("colG"), cellname("colG")),
+                new ColumnSlice(cellname("colE"), cellname("colC")),
+                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
 
         String keyspaceName = "Keyspace1";
         String cfName = "Standard1";
@@ -1628,10 +1649,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Column[] cols = new Column[letters.length];
+        Cell[] cols = new Cell[letters.length];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i].toUpperCase()),
+            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
                     // use 1366 so that three cols make an index segment
                     ByteBuffer.wrap(new byte[1366]), 1);
         }
@@ -1692,7 +1713,7 @@
         String cf = "Standard3"; // should be empty
 
         final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = Directories.create(ks, cf);
+        Directories dir = new Directories(cfmeta);
         ByteBuffer key = bytes("key");
 
         // 1st sstable
@@ -1713,10 +1734,11 @@
         {
             protected SSTableWriter getWriter()
             {
-                SSTableMetadata.Collector collector = SSTableMetadata.createCollector(cfmeta.comparator);
+                MetadataCollector collector = new MetadataCollector(cfmeta.comparator);
                 collector.addAncestor(sstable1.descriptor.generation); // add ancestor from previously written sstable
                 return new SSTableWriter(makeFilename(directory, metadata.ksName, metadata.cfName),
                                          0,
+                                         ActiveRepairService.UNREPAIRED_SSTABLE,
                                          metadata,
                                          StorageService.getPartitioner(),
                                          collector);
@@ -1736,7 +1758,7 @@
 
         Map<Integer, UUID> unfinishedCompaction = new HashMap<>();
         unfinishedCompaction.put(sstable1.descriptor.generation, compactionTaskID);
-        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(ks, cf, unfinishedCompaction);
+        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfmeta, unfinishedCompaction);
 
         // 2nd sstable should be removed (only 1st sstable exists in set of size 1)
         sstables = dir.sstableLister().list();
@@ -1757,7 +1779,7 @@
         final String cf = "Standard4"; // should be empty
 
         final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = Directories.create(ks, cf);
+        Directories dir = new Directories(cfmeta);
         ByteBuffer key = bytes("key");
 
         // Write SSTable generation 3 that has ancestors 1 and 2
@@ -1767,12 +1789,13 @@
         {
             protected SSTableWriter getWriter()
             {
-                SSTableMetadata.Collector collector = SSTableMetadata.createCollector(cfmeta.comparator);
+                MetadataCollector collector = new MetadataCollector(cfmeta.comparator);
                 for (int ancestor : ancestors)
                     collector.addAncestor(ancestor);
-                String file = new Descriptor(directory, ks, cf, 3, true).filenameFor(Component.DATA);
+                String file = new Descriptor(directory, ks, cf, 3, Descriptor.Type.TEMP).filenameFor(Component.DATA);
                 return new SSTableWriter(file,
                                          0,
+                                         ActiveRepairService.UNREPAIRED_SSTABLE,
                                          metadata,
                                          StorageService.getPartitioner(),
                                          collector);
@@ -1793,7 +1816,7 @@
         UUID compactionTaskID = UUID.randomUUID();
         for (Integer ancestor : ancestors)
             unfinishedCompactions.put(ancestor, compactionTaskID);
-        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(ks, cf, unfinishedCompactions);
+        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfmeta, unfinishedCompactions);
 
         // SSTable should not be deleted
         sstables = dir.sstableLister().list();
@@ -1811,7 +1834,7 @@
         SSTableDeletingTask.waitForDeletions();
 
         final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = Directories.create(ks, cf);
+        Directories dir = new Directories(cfs.metadata);
 
         // clear old SSTables (probably left by CFS.clearUnsafe() calls in other tests)
         for (Map.Entry<Descriptor, Set<Component>> entry : dir.sstableLister().list().entrySet())
@@ -1829,7 +1852,7 @@
         ByteBuffer key = bytes("key");
 
         SSTableSimpleWriter writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
-                                                              cfmeta, StorageService.getPartitioner());
+                                                             cfmeta, StorageService.getPartitioner());
         writer.newRow(key);
         writer.addColumn(bytes("col"), bytes("val"), 1);
         writer.close();
@@ -1887,10 +1910,10 @@
         cfs.clearUnsafe();
 
         String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l" };
-        Column[] cols = new Column[12];
+        Cell[] cols = new Cell[12];
         for (int i = 0; i < cols.length; i++)
         {
-            cols[i] = new Column(ByteBufferUtil.bytes("col" + letters[i]), ByteBuffer.wrap(new byte[valueSize]), 1);
+            cols[i] = new BufferCell(cellname("col" + letters[i]), ByteBuffer.wrap(new byte[valueSize]), 1);
         }
 
         for (int i = 0; i < 12; i++)
@@ -1916,32 +1939,32 @@
         // in order not to change thrift interfaces at this stage we build SliceQueryFilter
         // directly instead of using QueryFilter to build it for us
         ColumnSlice[] startMiddleAndEndRanges = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colc")),
-                new ColumnSlice(bytes("colf"), bytes("colg")),
-                new ColumnSlice(bytes("colj"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colc")),
+                new ColumnSlice(cellname("colf"), cellname("colg")),
+                new ColumnSlice(cellname("colj"), Composites.EMPTY) };
 
         ColumnSlice[] startMiddleAndEndRangesReversed = new ColumnSlice[] {
-                new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colj")),
-                new ColumnSlice(bytes("colg"), bytes("colf")),
-                new ColumnSlice(bytes("colc"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice(Composites.EMPTY, cellname("colj")),
+                new ColumnSlice(cellname("colg"), cellname("colf")),
+                new ColumnSlice(cellname("colc"), Composites.EMPTY) };
 
         ColumnSlice[] startOnlyRange =
-                new ColumnSlice[] { new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colc")) };
+                new ColumnSlice[] { new ColumnSlice(Composites.EMPTY, cellname("colc")) };
 
         ColumnSlice[] startOnlyRangeReversed =
-                new ColumnSlice[] { new ColumnSlice(bytes("colc"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice[] { new ColumnSlice(cellname("colc"), Composites.EMPTY) };
 
         ColumnSlice[] middleOnlyRanges =
-                new ColumnSlice[] { new ColumnSlice(bytes("colf"), bytes("colg")) };
+                new ColumnSlice[] { new ColumnSlice(cellname("colf"), cellname("colg")) };
 
         ColumnSlice[] middleOnlyRangesReversed =
-                new ColumnSlice[] { new ColumnSlice(bytes("colg"), bytes("colf")) };
+                new ColumnSlice[] { new ColumnSlice(cellname("colg"), cellname("colf")) };
 
         ColumnSlice[] endOnlyRanges =
-                new ColumnSlice[] { new ColumnSlice(bytes("colj"), ByteBuffer.wrap(EMPTY_BYTE_ARRAY)) };
+                new ColumnSlice[] { new ColumnSlice(cellname("colj"), Composites.EMPTY) };
 
         ColumnSlice[] endOnlyRangesReversed =
-                new ColumnSlice[] { new ColumnSlice(ByteBuffer.wrap(EMPTY_BYTE_ARRAY), bytes("colj")) };
+                new ColumnSlice[] { new ColumnSlice(Composites.EMPTY, cellname("colj")) };
 
         SliceQueryFilter startOnlyFilter = new SliceQueryFilter(startOnlyRange, false,
                 Integer.MAX_VALUE);
@@ -2120,13 +2143,13 @@
                                            false);
         assertSame("unexpected number of rows ", 1, rows.size());
         Row row = rows.get(0);
-        Collection<Column> cols = !filter.isReversed() ? row.cf.getSortedColumns() : row.cf.getReverseSortedColumns();
+        Collection<Cell> cols = !filter.isReversed() ? row.cf.getSortedColumns() : row.cf.getReverseSortedColumns();
         // printRow(cfs, new String(row.key.key.array()), cols);
-        String[] returnedColsNames = Iterables.toArray(Iterables.transform(cols, new Function<Column, String>()
+        String[] returnedColsNames = Iterables.toArray(Iterables.transform(cols, new Function<Cell, String>()
         {
-            public String apply(Column arg0)
+            public String apply(Cell arg0)
             {
-                return new String(arg0.name().array());
+                return Util.string(arg0.name().toByteBuffer());
             }
         }), String.class);
 
@@ -2134,31 +2157,31 @@
                 "Columns did not match. Expected: " + Arrays.toString(colNames) + " but got:"
                         + Arrays.toString(returnedColsNames), Arrays.equals(colNames, returnedColsNames));
         int i = 0;
-        for (Column col : cols)
+        for (Cell col : cols)
         {
-            assertEquals(colNames[i++], new String(col.name().array()));
+            assertEquals(colNames[i++], Util.string(col.name().toByteBuffer()));
         }
     }
 
-    private void printRow(ColumnFamilyStore cfs, String rowKey, Collection<Column> cols)
+    private void printRow(ColumnFamilyStore cfs, String rowKey, Collection<Cell> cols)
     {
         DecoratedKey ROW = Util.dk(rowKey);
         System.err.println("Original:");
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(ROW, "Standard1", System.currentTimeMillis()));
         System.err.println("Row key: " + rowKey + " Cols: "
-                + Iterables.transform(cf.getSortedColumns(), new Function<Column, String>()
+                + Iterables.transform(cf.getSortedColumns(), new Function<Cell, String>()
                 {
-                    public String apply(Column arg0)
+                    public String apply(Cell arg0)
                     {
-                        return new String(arg0.name().array());
+                        return Util.string(arg0.name().toByteBuffer());
                     }
                 }));
         System.err.println("Filtered:");
-        Iterable<String> transformed = Iterables.transform(cols, new Function<Column, String>()
+        Iterable<String> transformed = Iterables.transform(cols, new Function<Cell, String>()
         {
-            public String apply(Column arg0)
+            public String apply(Cell arg0)
             {
-                return new String(arg0.name().array());
+                return Util.string(arg0.name().toByteBuffer());
             }
         });
         System.err.println("Row key: " + rowKey + " Cols: " + transformed);
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java
index e13d0d7..69a851e 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java
@@ -22,22 +22,28 @@
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.TreeMap;
 
 import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.SchemaLoader;
 import org.junit.Test;
 
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.io.sstable.ColumnStats;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.net.MessagingService;
-import static org.apache.cassandra.Util.column;
-import static org.junit.Assert.assertEquals;
-
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.HeapAllocator;
+import org.apache.cassandra.utils.CounterId;
+import org.apache.cassandra.utils.FBUtilities;
 
+import static junit.framework.Assert.assertTrue;
+
+import static org.apache.cassandra.Util.column;
+import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.Util.tombstone;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 
 public class ColumnFamilyTest extends SchemaLoader
 {
@@ -50,7 +56,7 @@
     {
         ColumnFamily cf;
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("C", "v", 1));
         DataOutputBuffer bufOut = new DataOutputBuffer();
         ColumnFamily.serializer.serialize(cf, bufOut, version);
@@ -67,14 +73,14 @@
     {
         ColumnFamily cf;
 
-        TreeMap<String, String> map = new TreeMap<String, String>();
+        TreeMap<String, String> map = new TreeMap<>();
         for (int i = 100; i < 1000; ++i)
         {
             map.put(Integer.toString(i), "Avinash Lakshman is a good man: " + i);
         }
 
         // write
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         DataOutputBuffer bufOut = new DataOutputBuffer();
         for (String cName : map.navigableKeySet())
         {
@@ -87,7 +93,7 @@
         cf = ColumnFamily.serializer.deserialize(new DataInputStream(bufIn), version);
         for (String cName : map.navigableKeySet())
         {
-            ByteBuffer val = cf.getColumn(ByteBufferUtil.bytes(cName)).value();
+            ByteBuffer val = cf.getColumn(cellname(cName)).value();
             assert new String(val.array(),val.position(),val.remaining()).equals(map.get(cName));
         }
         assert Iterables.size(cf.getColumnNames()) == map.size();
@@ -96,7 +102,7 @@
     @Test
     public void testGetColumnCount()
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
 
         cf.addColumn(column("col1", "", 1));
         cf.addColumn(column("col2", "", 2));
@@ -107,73 +113,143 @@
     }
 
     @Test
+    public void testDigest()
+    {
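+        // The digest should change whenever a cell, the partition-level deletion, or the set of range tombstones changes, match for identical content, and stay stable otherwise.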
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+
+        ByteBuffer digest = ColumnFamily.digest(cf);
+
+        cf.addColumn(column("col1", "", 1));
+        cf2.addColumn(column("col1", "", 1));
+
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        assert digest.equals(ColumnFamily.digest(cf2));
+
+        cf.addColumn(column("col2", "", 2));
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        cf.addColumn(column("col1", "", 3));
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        cf.delete(new DeletionTime(4, 4));
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        cf.delete(tombstone("col1", "col11", 5, 5));
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        assert digest.equals(ColumnFamily.digest(cf));
+
+        cf.delete(tombstone("col2", "col21", 5, 5));
+        assert !digest.equals(ColumnFamily.digest(cf));
+
+        digest = ColumnFamily.digest(cf);
+        cf.delete(tombstone("col1", "col11", 5, 5)); // this does not change RangeTombstoneLList
+        assert digest.equals(ColumnFamily.digest(cf));
+    }
+
+    @Test
     public void testTimestamp()
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
 
         cf.addColumn(column("col1", "val1", 2));
         cf.addColumn(column("col1", "val2", 2)); // same timestamp, new value
         cf.addColumn(column("col1", "val3", 1)); // older timestamp -- should be ignored
 
-        assert ByteBufferUtil.bytes("val2").equals(cf.getColumn(ByteBufferUtil.bytes("col1")).value());
+        assert ByteBufferUtil.bytes("val2").equals(cf.getColumn(cellname("col1")).value());
     }
 
     @Test
     public void testMergeAndAdd()
     {
-        ColumnFamily cf_new = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        ColumnFamily cf_old = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        ColumnFamily cf_result = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf_new = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf_old = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf_result = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         ByteBuffer val = ByteBufferUtil.bytes("sample value");
         ByteBuffer val2 = ByteBufferUtil.bytes("x value ");
 
-        cf_new.addColumn(ByteBufferUtil.bytes("col1"), val, 3);
-        cf_new.addColumn(ByteBufferUtil.bytes("col2"), val, 4);
+        cf_new.addColumn(cellname("col1"), val, 3);
+        cf_new.addColumn(cellname("col2"), val, 4);
 
-        cf_old.addColumn(ByteBufferUtil.bytes("col2"), val2, 1);
-        cf_old.addColumn(ByteBufferUtil.bytes("col3"), val2, 2);
+        cf_old.addColumn(cellname("col2"), val2, 1);
+        cf_old.addColumn(cellname("col3"), val2, 2);
 
-        cf_result.addAll(cf_new, HeapAllocator.instance);
-        cf_result.addAll(cf_old, HeapAllocator.instance);
+        cf_result.addAll(cf_new);
+        cf_result.addAll(cf_old);
 
         assert 3 == cf_result.getColumnCount() : "Count is " + cf_new.getColumnCount();
         //addcolumns will only add if timestamp >= old timestamp
-        assert val.equals(cf_result.getColumn(ByteBufferUtil.bytes("col2")).value());
+        assert val.equals(cf_result.getColumn(cellname("col2")).value());
 
         // check that tombstone wins timestamp ties
-        cf_result.addTombstone(ByteBufferUtil.bytes("col1"), 0, 3);
-        assert cf_result.getColumn(ByteBufferUtil.bytes("col1")).isMarkedForDelete(System.currentTimeMillis());
-        cf_result.addColumn(ByteBufferUtil.bytes("col1"), val2, 3);
-        assert cf_result.getColumn(ByteBufferUtil.bytes("col1")).isMarkedForDelete(System.currentTimeMillis());
+        cf_result.addTombstone(cellname("col1"), 0, 3);
+        assertFalse(cf_result.getColumn(cellname("col1")).isLive());
+        cf_result.addColumn(cellname("col1"), val2, 3);
+        assertFalse(cf_result.getColumn(cellname("col1")).isLive());
 
         // check that column value wins timestamp ties in absence of tombstone
-        cf_result.addColumn(ByteBufferUtil.bytes("col3"), val, 2);
-        assert cf_result.getColumn(ByteBufferUtil.bytes("col3")).value().equals(val2);
-        cf_result.addColumn(ByteBufferUtil.bytes("col3"), ByteBufferUtil.bytes("z"), 2);
-        assert cf_result.getColumn(ByteBufferUtil.bytes("col3")).value().equals(ByteBufferUtil.bytes("z"));
+        cf_result.addColumn(cellname("col3"), val, 2);
+        assert cf_result.getColumn(cellname("col3")).value().equals(val2);
+        cf_result.addColumn(cellname("col3"), ByteBufferUtil.bytes("z"), 2);
+        assert cf_result.getColumn(cellname("col3")).value().equals(ByteBufferUtil.bytes("z"));
     }
 
     @Test
-    public void testColumnStatsRecordsRowDeletesCorrectly() throws IOException
+    public void testColumnStatsRecordsRowDeletesCorrectly()
     {
         long timestamp = System.currentTimeMillis();
         int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.delete(new DeletionInfo(timestamp, localDeletionTime));
         ColumnStats stats = cf.getColumnStats();
         assertEquals(timestamp, stats.maxTimestamp);
 
-        cf.delete(new RangeTombstone(ByteBufferUtil.bytes("col2"), ByteBufferUtil.bytes("col21"), timestamp, localDeletionTime));
+        cf.delete(new RangeTombstone(cellname("col2"), cellname("col21"), timestamp, localDeletionTime));
 
         stats = cf.getColumnStats();
         assertEquals(ByteBufferUtil.bytes("col2"), stats.minColumnNames.get(0));
         assertEquals(ByteBufferUtil.bytes("col21"), stats.maxColumnNames.get(0));
 
-        cf.delete(new RangeTombstone(ByteBufferUtil.bytes("col6"), ByteBufferUtil.bytes("col61"), timestamp, localDeletionTime));
+        cf.delete(new RangeTombstone(cellname("col6"), cellname("col61"), timestamp, localDeletionTime));
         stats = cf.getColumnStats();
 
         assertEquals(ByteBufferUtil.bytes("col2"), stats.minColumnNames.get(0));
         assertEquals(ByteBufferUtil.bytes("col61"), stats.maxColumnNames.get(0));
     }
+
+    @Test
+    public void testCounterDeletion()
+    {
+        long timestamp = FBUtilities.timestampMicros();
+        CellName name = cellname("counter1");
+
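+        // A counter cell holding a single global shard (CounterId 1, clock 1, count 1), plus a cell tombstone for the same name.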
+        BufferCounterCell counter = new BufferCounterCell(name,
+                                                          CounterContext.instance().createGlobal(CounterId.fromInt(1), 1, 1),
+                                                          timestamp);
+        BufferDeletedCell tombstone = new BufferDeletedCell(name, (int) (System.currentTimeMillis() / 1000), 0L);
+
+        // check that the tombstone won the reconcile despite the counter cell having a higher timestamp
+        assertTrue(counter.reconcile(tombstone) == tombstone);
+
+        // check that a range tombstone overrides the counter cell, even with a lower timestamp than the counter
+        ColumnFamily cf0 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Counter1");
+        cf0.addColumn(counter);
+        cf0.delete(new RangeTombstone(cellname("counter0"), cellname("counter2"), 0L, (int) (System.currentTimeMillis() / 1000)));
+        assertTrue(cf0.deletionInfo().isDeleted(counter));
+        assertTrue(cf0.deletionInfo().inOrderTester(false).isDeleted(counter));
+
+        // check that a top-level deletion info overrides the counter cell, even with a lower timestamp than the counter
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Counter1");
+        cf1.addColumn(counter);
+        cf1.delete(new DeletionInfo(0L, (int) (System.currentTimeMillis() / 1000)));
+        assertTrue(cf1.deletionInfo().isDeleted(counter));
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/CommitLogTest.java b/test/unit/org/apache/cassandra/db/CommitLogTest.java
index 1be29a6..ed9601d 100644
--- a/test/unit/org/apache/cassandra/db/CommitLogTest.java
+++ b/test/unit/org/apache/cassandra/db/CommitLogTest.java
@@ -38,6 +38,9 @@
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogDescriptor;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.commitlog.CommitLogSegment;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageService;
@@ -108,8 +111,8 @@
     {
         CommitLog.instance.resetUnsafe();
         // Roughly 32 MB mutation
-        RowMutation rm = new RowMutation("Keyspace1", bytes("k"));
-        rm.add("Standard1", bytes("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/4), 0);
+        Mutation rm = new Mutation("Keyspace1", bytes("k"));
+        rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/4), 0);
 
         // Adding it 5 times
         CommitLog.instance.add(rm);
@@ -119,14 +122,14 @@
         CommitLog.instance.add(rm);
 
         // Adding new mutation on another CF
-        RowMutation rm2 = new RowMutation("Keyspace1", bytes("k"));
-        rm2.add("Standard2", bytes("c1"), ByteBuffer.allocate(4), 0);
+        Mutation rm2 = new Mutation("Keyspace1", bytes("k"));
+        rm2.add("Standard2", Util.cellname("c1"), ByteBuffer.allocate(4), 0);
         CommitLog.instance.add(rm2);
 
         assert CommitLog.instance.activeSegments() == 2 : "Expecting 2 segments, got " + CommitLog.instance.activeSegments();
 
         UUID cfid2 = rm2.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext().get());
+        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext());
 
         // Assert we still have both our segment
         assert CommitLog.instance.activeSegments() == 2 : "Expecting 2 segments, got " + CommitLog.instance.activeSegments();
@@ -135,10 +138,11 @@
     @Test
     public void testDeleteIfNotDirty() throws Exception
     {
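+        // (assumption) the bare call below only forces DatabaseDescriptor initialization; its return value is unused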
+        DatabaseDescriptor.getCommitLogSegmentSize();
         CommitLog.instance.resetUnsafe();
         // Roughly 32 MB mutation
-        RowMutation rm = new RowMutation("Keyspace1", bytes("k"));
-        rm.add("Standard1", bytes("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/4), 0);
+        Mutation rm = new Mutation("Keyspace1", bytes("k"));
+        rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()/4) - 1), 0);
 
         // Adding it twice (won't change segment)
         CommitLog.instance.add(rm);
@@ -148,15 +152,17 @@
 
         // "Flush": this won't delete anything
         UUID cfid1 = rm.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.discardCompletedSegments(cfid1, CommitLog.instance.getContext().get());
+        CommitLog.instance.sync(true);
+        CommitLog.instance.discardCompletedSegments(cfid1, CommitLog.instance.getContext());
 
         assert CommitLog.instance.activeSegments() == 1 : "Expecting 1 segment, got " + CommitLog.instance.activeSegments();
 
         // Adding new mutation on another CF, large enough (including CL entry overhead) that a new segment is created
-        RowMutation rm2 = new RowMutation("Keyspace1", bytes("k"));
-        rm2.add("Standard2", bytes("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/2), 0);
+        Mutation rm2 = new Mutation("Keyspace1", bytes("k"));
+        rm2.add("Standard2", Util.cellname("c1"), ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()/2) - 100), 0);
         CommitLog.instance.add(rm2);
-        // also forces a new segment, since each entry-with-overhead is just over half the CL size
+        // also forces a new segment, since each entry-with-overhead is just under half the CL size
+        CommitLog.instance.add(rm2);
         CommitLog.instance.add(rm2);
 
         assert CommitLog.instance.activeSegments() == 3 : "Expecting 3 segments, got " + CommitLog.instance.activeSegments();
@@ -166,23 +172,55 @@
         // didn't write anything on cf1 since last flush (and we flush cf2)
 
         UUID cfid2 = rm2.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext().get());
+        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext());
 
         // Assert we still have both our segment
         assert CommitLog.instance.activeSegments() == 1 : "Expecting 1 segment, got " + CommitLog.instance.activeSegments();
     }
 
+    private static int getMaxRecordDataSize(String keyspace, ByteBuffer key, String table, CellName column)
+    {
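+        // Largest value payload that still fits in one commit log record: half a segment, minus the per-entry overhead and the serialized size of the enclosing (otherwise empty) mutation.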
+        Mutation rm = new Mutation("Keyspace1", bytes("k"));
+        rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(0), 0);
+
+        int max = (DatabaseDescriptor.getCommitLogSegmentSize() / 2);
+        max -= CommitLogSegment.ENTRY_OVERHEAD_SIZE; // log entry overhead
+        return max - (int) Mutation.serializer.serializedSize(rm, MessagingService.current_version);
+    }
+
+    private static int getMaxRecordDataSize()
+    {
+        return getMaxRecordDataSize("Keyspace1", bytes("k"), "Standard1", Util.cellname("c1"));
+    }
+
     // CASSANDRA-3615
     @Test
-    public void testExceedSegmentSizeWithOverhead() throws Exception
+    public void testEqualRecordLimit() throws Exception
     {
         CommitLog.instance.resetUnsafe();
 
-        RowMutation rm = new RowMutation("Keyspace1", bytes("k"));
-        rm.add("Standard1", bytes("c1"), ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()) - 83), 0);
+        Mutation rm = new Mutation("Keyspace1", bytes("k"));
+        rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(getMaxRecordDataSize()), 0);
         CommitLog.instance.add(rm);
     }
 
+    @Test
+    public void testExceedRecordLimit() throws Exception
+    {
+        CommitLog.instance.resetUnsafe();
+        try
+        {
+            Mutation rm = new Mutation("Keyspace1", bytes("k"));
+            rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(1 + getMaxRecordDataSize()), 0);
+            CommitLog.instance.add(rm);
+            throw new AssertionError("mutation larger than limit was accepted");
+        }
+        catch (IllegalArgumentException e)
+        {
+            // IAE is thrown on too-large mutations
+        }
+    }
+
     protected void testRecoveryWithBadSizeArgument(int size, int dataSize) throws Exception
     {
         Checksum checksum = new CRC32();
@@ -246,8 +284,8 @@
 
             DatabaseDescriptor.setCommitFailurePolicy(Config.CommitFailurePolicy.stop);
             commitDir.setWritable(false);
-            RowMutation rm = new RowMutation("Keyspace1", bytes("k"));
-            rm.add("Standard1", bytes("c1"), ByteBuffer.allocate(100), 0);
+            Mutation rm = new Mutation("Keyspace1", bytes("k"));
+            rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(100), 0);
 
             // Adding it twice (won't change segment)
             CommitLog.instance.add(rm);
@@ -272,19 +310,19 @@
         ColumnFamilyStore cfs1 = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
         ColumnFamilyStore cfs2 = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard2");
 
-        final RowMutation rm1 = new RowMutation("Keyspace1", bytes("k"));
-        rm1.add("Standard1", bytes("c1"), ByteBuffer.allocate(100), 0);
+        final Mutation rm1 = new Mutation("Keyspace1", bytes("k"));
+        rm1.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(100), 0);
         rm1.apply();
         cfs1.truncateBlocking();
         DatabaseDescriptor.setAutoSnapshot(prev);
-        final RowMutation rm2 = new RowMutation("Keyspace1", bytes("k"));
-        rm2.add("Standard2", bytes("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize() / 4), 0);
+        final Mutation rm2 = new Mutation("Keyspace1", bytes("k"));
+        rm2.add("Standard2", Util.cellname("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize() / 4), 0);
 
         for (int i = 0 ; i < 5 ; i++)
             CommitLog.instance.add(rm2);
 
         Assert.assertEquals(2, CommitLog.instance.activeSegments());
-        ReplayPosition position = CommitLog.instance.getContext().get();
+        ReplayPosition position = CommitLog.instance.getContext();
         for (Keyspace ks : Keyspace.system())
             for (ColumnFamilyStore syscfs : ks.getColumnFamilyStores())
                 CommitLog.instance.discardCompletedSegments(syscfs.metadata.cfId, position);
@@ -301,17 +339,18 @@
         Keyspace notDurableKs = Keyspace.open("NoCommitlogSpace");
         Assert.assertFalse(notDurableKs.metadata.durableWrites);
         ColumnFamilyStore cfs = notDurableKs.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        CellNameType type = notDurableKs.getColumnFamilyStore("Standard1").getComparator();
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("NoCommitlogSpace", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("abcd"), 0);
+        rm = new Mutation("NoCommitlogSpace", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("abcd"), 0);
         rm.apply();
 
-        ReadCommand command = new SliceByNamesReadCommand("Keyspace1", dk.key, "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(ByteBufferUtil.bytes("Column1"), cfs.getComparator())));
+        ReadCommand command = new SliceByNamesReadCommand("NoCommitlogSpace", dk.getKey(), "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(Util.cellname("Column1"), type)));
         Row row = command.getRow(notDurableKs);
-        Column col = row.cf.getColumn(ByteBufferUtil.bytes("Column1"));
+        Cell col = row.cf.getColumn(Util.cellname("Column1"));
         Assert.assertEquals(col.value(), ByteBuffer.wrap("abcd".getBytes()));
         cfs.truncateBlocking();
         DatabaseDescriptor.setAutoSnapshot(prevAutoSnapshot);
diff --git a/test/unit/org/apache/cassandra/db/CounterCacheTest.java b/test/unit/org/apache/cassandra/db/CounterCacheTest.java
new file mode 100644
index 0000000..cb2d97a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/CounterCacheTest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.concurrent.ExecutionException;
+
+import org.junit.AfterClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class CounterCacheTest extends SchemaLoader
+{
+    private static final String KS = "CounterCacheSpace";
+    private static final String CF = "Counter1";
+
+    @AfterClass
+    public static void cleanup()
+    {
+        cleanupSavedCaches();
+    }
+
+    @Test
+    public void testReadWrite()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF);
+        CacheService.instance.invalidateCounterCache();
+
+        assertEquals(0, CacheService.instance.counterCache.size());
+        assertNull(cfs.getCachedCounter(bytes(1), cellname(1)));
+        assertNull(cfs.getCachedCounter(bytes(1), cellname(2)));
+        assertNull(cfs.getCachedCounter(bytes(2), cellname(1)));
+        assertNull(cfs.getCachedCounter(bytes(2), cellname(2)));
+
+        cfs.putCachedCounter(bytes(1), cellname(1), ClockAndCount.create(1L, 1L));
+        cfs.putCachedCounter(bytes(1), cellname(2), ClockAndCount.create(1L, 2L));
+        cfs.putCachedCounter(bytes(2), cellname(1), ClockAndCount.create(2L, 1L));
+        cfs.putCachedCounter(bytes(2), cellname(2), ClockAndCount.create(2L, 2L));
+
+        assertEquals(4, CacheService.instance.counterCache.size());
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), cellname(1)));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), cellname(2)));
+        assertEquals(ClockAndCount.create(2L, 1L), cfs.getCachedCounter(bytes(2), cellname(1)));
+        assertEquals(ClockAndCount.create(2L, 2L), cfs.getCachedCounter(bytes(2), cellname(2)));
+    }
+
+    @Test
+    public void testSaveLoad() throws ExecutionException, InterruptedException, WriteTimeoutException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF);
+        CacheService.instance.invalidateCounterCache();
+
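+        // Apply two counter cells to each of two partition keys so that four distinct counters end up in the counter cache.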
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addColumn(new BufferCounterUpdateCell(cellname(1), 1L, FBUtilities.timestampMicros()));
+        cells.addColumn(new BufferCounterUpdateCell(cellname(2), 2L, FBUtilities.timestampMicros()));
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new Mutation(KS, bytes(2), cells), ConsistencyLevel.ONE).apply();
+
+        // flush the counter cache and invalidate
+        CacheService.instance.counterCache.submitWrite(Integer.MAX_VALUE).get();
+        CacheService.instance.invalidateCounterCache();
+        assertEquals(0, CacheService.instance.counterCache.size());
+
+        // load from cache and validate
+        CacheService.instance.counterCache.loadSaved(cfs);
+        assertEquals(4, CacheService.instance.counterCache.size());
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), cellname(1)));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), cellname(2)));
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(2), cellname(1)));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(2), cellname(2)));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/CounterCellTest.java b/test/unit/org/apache/cassandra/db/CounterCellTest.java
new file mode 100644
index 0000000..21826d2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/CounterCellTest.java
@@ -0,0 +1,318 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.security.MessageDigest;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.utils.*;
+
+import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.db.context.CounterContext.ContextState;
+
+public class CounterCellTest extends SchemaLoader
+{
+    private static final CounterContext cc = new CounterContext();
+
+    private static final int idLength;
+    private static final int clockLength;
+    private static final int countLength;
+
+    private static final int stepLength;
+
+    static
+    {
+        idLength      = CounterId.LENGTH;
+        clockLength   = 8; // size of long
+        countLength   = 8; // size of long
+
+        stepLength    = idLength + clockLength + countLength;
+    }
+
+    @Test
+    public void testCreate()
+    {
+        long delta = 3L;
+        CounterCell cell = new BufferCounterCell(Util.cellname("x"),
+                                           CounterContext.instance().createLocal(delta),
+                                           1L,
+                                           Long.MIN_VALUE);
+
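+        // Layout checked below (as the assertions show): getShort(0) = header element count (1),
+        // getShort(2) = index of the local shard (0), then the (CounterId, clock, count) shard at offset 4.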
+        Assert.assertEquals(delta, cell.total());
+        Assert.assertEquals(1, cell.value().getShort(0));
+        Assert.assertEquals(0, cell.value().getShort(2));
+        Assert.assertTrue(CounterId.wrap(cell.value(), 4).isLocalId());
+        Assert.assertEquals(1L, cell.value().getLong(4 + idLength));
+        Assert.assertEquals(delta, cell.value().getLong(4 + idLength + clockLength));
+    }
+
+    @Test
+    public void testReconcile()
+    {
+        Cell left;
+        Cell right;
+        Cell reconciled;
+
+        ByteBuffer context;
+
+        // tombstone + tombstone
+        left  = new BufferDeletedCell(cellname("x"), 1, 1L);
+        right = new BufferDeletedCell(cellname("x"), 2, 2L);
+
+        assert left.reconcile(right).timestamp() == right.timestamp();
+        assert right.reconcile(left).timestamp() == right.timestamp();
+
+        // tombstone > live
+        left  = new BufferDeletedCell(cellname("x"), 1, 2L);
+        right = BufferCounterCell.createLocal(cellname("x"), 0L, 1L, Long.MIN_VALUE);
+
+        assert left.reconcile(right) == left;
+
+        // tombstone < live last delete
+        left  = new BufferDeletedCell(cellname("x"), 1, 1L);
+        right = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
+
+        assert left.reconcile(right) == left;
+
+        // tombstone == live last delete
+        left  = new BufferDeletedCell(cellname("x"), 1, 2L);
+        right = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
+
+        assert left.reconcile(right) == left;
+
+        // tombstone > live last delete
+        left  = new BufferDeletedCell(cellname("x"), 1, 4L);
+        right = BufferCounterCell.createLocal(cellname("x"), 0L, 9L, 1L);
+
+        assert left.reconcile(right) == left;
+
+        // live < tombstone
+        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 1L, Long.MIN_VALUE);
+        right = new BufferDeletedCell(cellname("x"), 1, 2L);
+
+        assert left.reconcile(right) == right;
+
+        // live last delete > tombstone
+        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
+        right = new BufferDeletedCell(cellname("x"), 1, 1L);
+
+        assert left.reconcile(right) == right;
+
+        // live last delete == tombstone
+        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
+        right = new BufferDeletedCell(cellname("x"), 1, 2L);
+
+        assert left.reconcile(right) == right;
+
+        // live last delete < tombstone
+        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 9L, 1L);
+        right = new BufferDeletedCell(cellname("x"), 1, 4L);
+
+        assert left.reconcile(right) == right;
+
+        // live < live last delete
+        left  = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 1L, Long.MIN_VALUE);
+        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, 3L);
+
+        assert left.reconcile(right) == right;
+
+        // live last delete > live
+        left  = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 6L, 5L);
+        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, 3L);
+
+        assert left.reconcile(right) == left;
+
+        // live + live
+        left = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, Long.MIN_VALUE);
+        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 1L, Long.MIN_VALUE);
+
+        reconciled = left.reconcile(right);
+        assert reconciled.name().equals(left.name());
+        assert ((CounterCell)reconciled).total() == 3L;
+        assert reconciled.timestamp() == 4L;
+
+        left = reconciled;
+        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(2), 1L, 5L), 2L, Long.MIN_VALUE);
+
+        reconciled = left.reconcile(right);
+        assert reconciled.name().equals(left.name());
+        assert ((CounterCell)reconciled).total() == 8L;
+        assert reconciled.timestamp() == 4L;
+
+        left = reconciled;
+        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(2), 2L, 2L), 6L, Long.MIN_VALUE);
+
+        reconciled = left.reconcile(right);
+        assert reconciled.name().equals(left.name());
+        assert ((CounterCell)reconciled).total() == 5L;
+        assert reconciled.timestamp() == 6L;
+
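+        // Inspect the merged context: after the reconciliations above it should hold exactly two remote shards (ids 1 and 2).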
+        context = reconciled.value();
+        int hd = 2; // 2-byte header: just the element count (no local shards here)
+        assert hd + 2 * stepLength == context.remaining();
+
+        assert Util.equalsCounterId(CounterId.fromInt(1), context, hd);
+        assert 2L == context.getLong(hd + idLength);
+        assert 3L == context.getLong(hd + idLength + clockLength);
+
+        assert Util.equalsCounterId(CounterId.fromInt(2), context, hd + stepLength);
+        assert 2L == context.getLong(hd + stepLength + idLength);
+        assert 2L == context.getLong(hd + stepLength + idLength + clockLength);
+
+        assert ((CounterCell)reconciled).timestampOfLastDelete() == Long.MIN_VALUE;
+    }
+
+    @Test
+    public void testDiff()
+    {
+        ContextState left;
+        ContextState right;
+
+        CounterCell leftCell;
+        CounterCell rightCell;
+
+        // timestamp
+        leftCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, Long.MIN_VALUE);
+        rightCell = BufferCounterCell.createLocal(cellname("x"), 0, 2L, Long.MIN_VALUE);
+
+        assert rightCell == leftCell.diff(rightCell);
+        assert null      == rightCell.diff(leftCell);
+
+        // timestampOfLastDelete
+        leftCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, 1L);
+        rightCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, 2L);
+
+        assert rightCell == leftCell.diff(rightCell);
+        assert null      == rightCell.diff(leftCell);
+
+        // equality: equal nodes, all counts same
+        left = ContextState.allocate(0, 0, 3);
+        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        right = ContextState.wrap(ByteBufferUtil.clone(left.context));
+
+        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
+        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
+        assert leftCell.diff(rightCell) == null;
+
+        // greater than: left has superset of nodes (counts equal)
+        left = ContextState.allocate(0, 0, 4);
+        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        left.writeRemote(CounterId.fromInt(12), 0L, 0L);
+
+        right = ContextState.allocate(0, 0, 3);
+        right.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        right.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
+
+        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
+        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
+        assert leftCell.diff(rightCell) == null;
+
+        // less than: right has subset of nodes (counts equal)
+        assert leftCell == rightCell.diff(leftCell);
+
+        // disjoint: right and left have disjoint node sets
+        left = ContextState.allocate(0, 0, 3);
+        left.writeRemote(CounterId.fromInt(3), 1L, 0L);
+        left.writeRemote(CounterId.fromInt(4), 1L, 0L);
+        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
+
+        right = ContextState.allocate(0, 0, 3);
+        right.writeRemote(CounterId.fromInt(3), 1L, 0L);
+        right.writeRemote(CounterId.fromInt(6), 1L, 0L);
+        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
+
+        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
+        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
+        assert rightCell == leftCell.diff(rightCell);
+        assert leftCell  == rightCell.diff(leftCell);
+    }
+
+    @Test
+    public void testSerializeDeserialize() throws IOException
+    {
+        CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 2, 2);
+        state.writeRemote(CounterId.fromInt(1), 4L, 4L);
+        state.writeLocal(CounterId.fromInt(2), 4L, 4L);
+        state.writeRemote(CounterId.fromInt(3), 4L, 4L);
+        state.writeLocal(CounterId.fromInt(4), 4L, 4L);
+
+        CellNameType type = new SimpleDenseCellNameType(UTF8Type.instance);
+        CounterCell original = new BufferCounterCell(cellname("x"), state.context, 1L);
+        byte[] serialized;
+        try (DataOutputBuffer bufOut = new DataOutputBuffer())
+        {
+            type.columnSerializer().serialize(original, bufOut);
+            serialized = bufOut.getData();
+        }
+
+        ByteArrayInputStream bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
+        CounterCell deserialized = (CounterCell) type.columnSerializer().deserialize(new DataInputStream(bufIn));
+        Assert.assertEquals(original, deserialized);
+
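+        // A FROM_REMOTE deserialization marks local shards as remote, so its value should match clearAllLocal(original).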
+        bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
+        CounterCell deserializedOnRemote = (CounterCell) type.columnSerializer().deserialize(new DataInputStream(bufIn), ColumnSerializer.Flag.FROM_REMOTE);
+        Assert.assertEquals(deserializedOnRemote.name(), original.name());
+        Assert.assertEquals(deserializedOnRemote.total(), original.total());
+        Assert.assertEquals(deserializedOnRemote.value(), cc.clearAllLocal(original.value()));
+        Assert.assertEquals(deserializedOnRemote.timestamp(), deserialized.timestamp());
+        Assert.assertEquals(deserializedOnRemote.timestampOfLastDelete(), deserialized.timestampOfLastDelete());
+    }
+
+    @Test
+    public void testUpdateDigest() throws Exception
+    {
+        MessageDigest digest1 = MessageDigest.getInstance("md5");
+        MessageDigest digest2 = MessageDigest.getInstance("md5");
+
+        CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 2, 2);
+        state.writeRemote(CounterId.fromInt(1), 4L, 4L);
+        state.writeLocal(CounterId.fromInt(2), 4L, 4L);
+        state.writeRemote(CounterId.fromInt(3), 4L, 4L);
+        state.writeLocal(CounterId.fromInt(4), 4L, 4L);
+
+        CounterCell original = new BufferCounterCell(cellname("x"), state.context, 1L);
+        CounterCell cleared = new BufferCounterCell(cellname("x"), cc.clearAllLocal(state.context), 1L);
+
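+        // Digests must not depend on whether shards are local or remote, so both cells should hash identically.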
+        original.updateDigest(digest1);
+        cleared.updateDigest(digest2);
+
+        assert Arrays.equals(digest1.digest(), digest2.digest());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/CounterColumnTest.java b/test/unit/org/apache/cassandra/db/CounterColumnTest.java
deleted file mode 100644
index 1f2c078..0000000
--- a/test/unit/org/apache/cassandra/db/CounterColumnTest.java
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import java.security.MessageDigest;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.utils.*;
-
-import static org.apache.cassandra.db.context.CounterContext.ContextState;
-
-public class CounterColumnTest extends SchemaLoader
-{
-    private static final CounterContext cc = new CounterContext();
-
-    private static final int idLength;
-    private static final int clockLength;
-    private static final int countLength;
-
-    private static final int stepLength;
-
-    static
-    {
-        idLength      = CounterId.LENGTH;
-        clockLength   = 8; // size of long
-        countLength   = 8; // size of long
-
-        stepLength    = idLength + clockLength + countLength;
-    }
-
-    @Test
-    public void testCreate() throws UnknownHostException
-    {
-        long delta = 3L;
-        CounterUpdateColumn cuc = new CounterUpdateColumn(ByteBufferUtil.bytes("x"), delta, 1L);
-        CounterColumn column = cuc.localCopy(Keyspace.open("Keyspace5").getColumnFamilyStore("Counter1"));
-
-        assert delta == column.total();
-        assert 1 == column.value().getShort(0);
-        assert 0 == column.value().getShort(2);
-        assert CounterId.wrap(column.value(), 4).isLocalId();
-        assert 1L == column.value().getLong(4 + idLength);
-        assert delta == column.value().getLong(4 + idLength + clockLength);
-    }
-
-    @Test
-    public void testReconcile() throws UnknownHostException
-    {
-        Column left;
-        Column right;
-        Column reconciled;
-
-        ByteBuffer context;
-
-        Allocator allocator = HeapAllocator.instance;
-
-        // tombstone + tombstone
-        left  = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 1L);
-        right = new DeletedColumn(ByteBufferUtil.bytes("x"), 2, 2L);
-
-        assert left.reconcile(right).getMarkedForDeleteAt() == right.getMarkedForDeleteAt();
-        assert right.reconcile(left).getMarkedForDeleteAt() == right.getMarkedForDeleteAt();
-
-        // tombstone > live
-        left  = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 2L);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 1L);
-
-        assert left.reconcile(right) == left;
-
-        // tombstone < live last delete
-        left  = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 1L);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 4L, 2L);
-
-        assert left.reconcile(right) == right;
-
-        // tombstone == live last delete
-        left  = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 2L);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 4L, 2L);
-
-        assert left.reconcile(right) == right;
-
-        // tombstone > live last delete
-        left  = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 4L);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 9L, 1L);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name() == right.name();
-        assert reconciled.value() == right.value();
-        assert reconciled.timestamp() == right.timestamp();
-        assert ((CounterColumn)reconciled).timestampOfLastDelete() == left.getMarkedForDeleteAt();
-
-        // live < tombstone
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 1L);
-        right = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 2L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete > tombstone
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 4L, 2L);
-        right = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 1L);
-
-        assert left.reconcile(right) == left;
-
-        // live last delete == tombstone
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 4L, 2L);
-        right = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 2L);
-
-        assert left.reconcile(right) == left;
-
-        // live last delete < tombstone
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), 0L, 9L, 1L);
-        right = new DeletedColumn(ByteBufferUtil.bytes("x"), 1, 4L);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name() == left.name();
-        assert reconciled.value() == left.value();
-        assert reconciled.timestamp() == left.timestamp();
-        assert ((CounterColumn)reconciled).timestampOfLastDelete() == right.getMarkedForDeleteAt();
-
-        // live < live last delete
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L, allocator), 1L, Long.MIN_VALUE);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L, allocator), 4L, 3L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete > live
-        left  = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L, allocator), 6L, 5L);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L, allocator), 4L, 3L);
-
-        assert left.reconcile(right) == left;
-
-        // live + live
-        left = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L, allocator), 4L, Long.MIN_VALUE);
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L, allocator), 1L, Long.MIN_VALUE);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterColumn)reconciled).total() == 3L;
-        assert reconciled.timestamp() == 4L;
-
-        left = reconciled;
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(2), 1L, 5L, allocator), 2L, Long.MIN_VALUE);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterColumn)reconciled).total() == 8L;
-        assert reconciled.timestamp() == 4L;
-
-        left = reconciled;
-        right = new CounterColumn(ByteBufferUtil.bytes("x"), cc.createRemote(CounterId.fromInt(2), 2L, 2L, allocator), 6L, Long.MIN_VALUE);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterColumn)reconciled).total() == 5L;
-        assert reconciled.timestamp() == 6L;
-
-        context = reconciled.value();
-        int hd = 2; // header
-        assert hd + 2 * stepLength == context.remaining();
-
-        assert Util.equalsCounterId(CounterId.fromInt(1), context, hd);
-        assert 2L == context.getLong(hd + idLength);
-        assert 3L == context.getLong(hd + idLength + clockLength);
-
-        assert Util.equalsCounterId(CounterId.fromInt(2), context, hd + stepLength);
-        assert 2L == context.getLong(hd + stepLength + idLength);
-        assert 2L == context.getLong(hd + stepLength + idLength + clockLength);
-
-        assert ((CounterColumn)reconciled).timestampOfLastDelete() == Long.MIN_VALUE;
-    }
-
-    @Test
-    public void testDiff() throws UnknownHostException
-    {
-        Allocator allocator = HeapAllocator.instance;
-        ContextState left;
-        ContextState right;
-
-        CounterColumn leftCol;
-        CounterColumn rightCol;
-
-        // timestamp
-        leftCol = new CounterColumn(ByteBufferUtil.bytes("x"), 0, 1L);
-        rightCol = new CounterColumn(ByteBufferUtil.bytes("x"), 0, 2L);
-
-        assert rightCol == leftCol.diff(rightCol);
-        assert null     == rightCol.diff(leftCol);
-
-        // timestampOfLastDelete
-        leftCol = new CounterColumn(ByteBufferUtil.bytes("x"), 0, 1L, 1L);
-        rightCol = new CounterColumn(ByteBufferUtil.bytes("x"), 0, 1L, 2L);
-
-        assert rightCol == leftCol.diff(rightCol);
-        assert null     == rightCol.diff(leftCol);
-
-        // equality: equal nodes, all counts same
-        left = ContextState.allocate(0, 0, 3, allocator);
-        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
-        right = ContextState.wrap(ByteBufferUtil.clone(left.context));
-
-        leftCol  = new CounterColumn(ByteBufferUtil.bytes("x"), left.context,  1L);
-        rightCol = new CounterColumn(ByteBufferUtil.bytes("x"), right.context, 1L);
-        assert leftCol.diff(rightCol) == null;
-
-        // greater than: left has superset of nodes (counts equal)
-        left = ContextState.allocate(0, 0, 4, allocator);
-        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(12), 0L, 0L);
-
-        right = ContextState.allocate(0, 0, 3, allocator);
-        right.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        right.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
-
-        leftCol  = new CounterColumn(ByteBufferUtil.bytes("x"), left.context,  1L);
-        rightCol = new CounterColumn(ByteBufferUtil.bytes("x"), right.context, 1L);
-        assert leftCol.diff(rightCol) == null;
-
-        // less than: right has subset of nodes (counts equal)
-        assert leftCol == rightCol.diff(leftCol);
-
-        // disjoint: right and left have disjoint node sets
-        left = ContextState.allocate(0, 0, 3, allocator);
-        left.writeRemote(CounterId.fromInt(3), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(4), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
-
-        right = ContextState.allocate(0, 0, 3, allocator);
-        right.writeRemote(CounterId.fromInt(3), 1L, 0L);
-        right.writeRemote(CounterId.fromInt(6), 1L, 0L);
-        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
-
-        leftCol  = new CounterColumn(ByteBufferUtil.bytes("x"), left.context,  1L);
-        rightCol = new CounterColumn(ByteBufferUtil.bytes("x"), right.context, 1L);
-        assert rightCol == leftCol.diff(rightCol);
-        assert leftCol  == rightCol.diff(leftCol);
-    }
-
-    @Test
-    public void testSerializeDeserialize() throws IOException
-    {
-        Allocator allocator = HeapAllocator.instance;
-        CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 2, 2, allocator);
-        state.writeRemote(CounterId.fromInt(1), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(2), 4L, 4L);
-        state.writeRemote(CounterId.fromInt(3), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(4), 4L, 4L);
-
-        CounterColumn original = new CounterColumn(ByteBufferUtil.bytes("x"), state.context, 1L);
-        DataOutputBuffer bufOut = new DataOutputBuffer();
-        Column.serializer.serialize(original, bufOut);
-        byte[] serialized = bufOut.getData();
-
-        ByteArrayInputStream bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
-        CounterColumn deserialized = (CounterColumn) Column.serializer.deserialize(new DataInputStream(bufIn));
-        assert original.equals(deserialized);
-
-        bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
-        CounterColumn deserializedOnRemote = (CounterColumn) Column.serializer.deserialize(new DataInputStream(bufIn), ColumnSerializer.Flag.FROM_REMOTE);
-        assert deserializedOnRemote.name().equals(original.name());
-        assert deserializedOnRemote.total() == original.total();
-        assert deserializedOnRemote.value().equals(cc.clearAllLocal(original.value()));
-        assert deserializedOnRemote.timestamp() == deserialized.timestamp();
-        assert deserializedOnRemote.timestampOfLastDelete() == deserialized.timestampOfLastDelete();
-    }
-
-    @Test
-    public void testUpdateDigest() throws Exception
-    {
-        Allocator allocator = HeapAllocator.instance;
-        MessageDigest digest1 = MessageDigest.getInstance("md5");
-        MessageDigest digest2 = MessageDigest.getInstance("md5");
-
-        CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 2, 2, allocator);
-        state.writeRemote(CounterId.fromInt(1), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(2), 4L, 4L);
-        state.writeRemote(CounterId.fromInt(3), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(4), 4L, 4L);
-
-        CounterColumn original = new CounterColumn(ByteBufferUtil.bytes("x"), state.context, 1L);
-        CounterColumn cleared = new CounterColumn(ByteBufferUtil.bytes("x"), cc.clearAllLocal(state.context), 1L);
-
-        original.updateDigest(digest1);
-        cleared.updateDigest(digest2);
-
-        assert Arrays.equals(digest1.digest(), digest2.digest());
-    }
-}
diff --git a/test/unit/org/apache/cassandra/db/CounterMutationTest.java b/test/unit/org/apache/cassandra/db/CounterMutationTest.java
index 389b7b7..431531c 100644
--- a/test/unit/org/apache/cassandra/db/CounterMutationTest.java
+++ b/test/unit/org/apache/cassandra/db/CounterMutationTest.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,29 +17,201 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.IOException;
-import java.util.List;
+import java.nio.ByteBuffer;
 
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
 public class CounterMutationTest extends SchemaLoader
 {
+    private static final String KS = "CounterCacheSpace";
+    private static final String CF1 = "Counter1";
+    private static final String CF2 = "Counter2";
+
     @Test
-    public void testGetOldShardFromSystemKeyspace() throws IOException
+    public void testSingleCell() throws WriteTimeoutException
     {
-        // Renewing a bunch of times and checking we get the same thing from
-        // the system keyspace that what is in memory
-        CounterId.renewLocalId();
-        CounterId.renewLocalId();
-        CounterId.renewLocalId();
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF1);
+        cfs.truncateBlocking();
 
-        List<CounterId.CounterIdRecord> inMem = CounterId.getOldLocalCounterIds();
-        List<CounterId.CounterIdRecord> onDisk = SystemKeyspace.getOldLocalCounterIds();
+        // Do the initial update (+1)
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
 
-        assert inMem.equals(onDisk);
+        // Make another increment (+2)
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 2L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(3L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+
+        // Decrement to 0 (-3)
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), -3L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(1)));
+    }
+
+    @Test
+    public void testTwoCells() throws WriteTimeoutException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF1);
+        cfs.truncateBlocking();
+
+        // Do the initial update (+1, -1)
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        cells.addCounter(cellname(2), -1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        assertEquals(-1L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+
+        // Make another increment (+2, -2)
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 2L);
+        cells.addCounter(cellname(2), -2L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(3L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+
+        // Decrement to 0 (-3, +3)
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), -3L);
+        cells.addCounter(cellname(2), 3L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+
+        // Check the caches, separately
+        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(1)));
+        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(2)));
+    }
+
+    @Test
+    public void testBatch() throws WriteTimeoutException
+    {
+        ColumnFamilyStore cfs1 = Keyspace.open(KS).getColumnFamilyStore(CF1);
+        ColumnFamilyStore cfs2 = Keyspace.open(KS).getColumnFamilyStore(CF2);
+
+        cfs1.truncateBlocking();
+        cfs2.truncateBlocking();
+
+        // Do the update (+1, -1), (+2, -2)
+        ColumnFamily cells1 = ArrayBackedSortedColumns.factory.create(cfs1.metadata);
+        cells1.addCounter(cellname(1), 1L);
+        cells1.addCounter(cellname(2), -1L);
+
+        ColumnFamily cells2 = ArrayBackedSortedColumns.factory.create(cfs2.metadata);
+        cells2.addCounter(cellname(1), 2L);
+        cells2.addCounter(cellname(2), -2L);
+
+        Mutation mutation = new Mutation(KS, bytes(1));
+        mutation.add(cells1);
+        mutation.add(cells2);
+
+        new CounterMutation(mutation, ConsistencyLevel.ONE).apply();
+
+        // Validate all values
+        ColumnFamily current1 = cfs1.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        ColumnFamily current2 = cfs2.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF2, System.currentTimeMillis()));
+
+        assertEquals(1L, CounterContext.instance().total(current1.getColumn(cellname(1)).value()));
+        assertEquals(-1L, CounterContext.instance().total(current1.getColumn(cellname(2)).value()));
+        assertEquals(2L, CounterContext.instance().total(current2.getColumn(cellname(1)).value()));
+        assertEquals(-2L, CounterContext.instance().total(current2.getColumn(cellname(2)).value()));
+
+        // Check the caches, separately
+        assertEquals(ClockAndCount.create(1L, 1L), cfs1.getCachedCounter(bytes(1), cellname(1)));
+        assertEquals(ClockAndCount.create(1L, -1L), cfs1.getCachedCounter(bytes(1), cellname(2)));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs2.getCachedCounter(bytes(1), cellname(1)));
+        assertEquals(ClockAndCount.create(1L, -2L), cfs2.getCachedCounter(bytes(1), cellname(2)));
+    }
+
+    @Test
+    public void testDeletes() throws WriteTimeoutException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF1);
+        cfs.truncateBlocking();
+
+        // Do the initial update (+1, +1)
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        cells.addCounter(cellname(2), 1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+
+        // Remove the first counter, increment the second counter
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addTombstone(cellname(1), (int) (System.currentTimeMillis() / 1000), FBUtilities.timestampMicros());
+        cells.addCounter(cellname(2), 1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertNull(current.getColumn(cellname(1)));
+        assertEquals(2L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+
+        // Increment the first counter, make sure it's still shadowed by the tombstone
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertNull(current.getColumn(cellname(1)));
+
+        // Get rid of the complete partition
+        Mutation mutation = new Mutation(KS, bytes(1));
+        mutation.delete(CF1, FBUtilities.timestampMicros());
+        new CounterMutation(mutation, ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertNull(current.getColumn(cellname(1)));
+        assertNull(current.getColumn(cellname(2)));
+
+        // Increment both counters, ensure that both stay dead
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        cells.addCounter(cellname(2), 1L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        assertNull(current.getColumn(cellname(1)));
+        assertNull(current.getColumn(cellname(2)));
+    }
+
+    @Test
+    public void testDuplicateCells() throws WriteTimeoutException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS).getColumnFamilyStore(CF1);
+        cfs.truncateBlocking();
+
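+        // Four increments of the same cell in a single mutation should merge into one shard: clock 1, count 10.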
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addCounter(cellname(1), 1L);
+        cells.addCounter(cellname(1), 2L);
+        cells.addCounter(cellname(1), 3L);
+        cells.addCounter(cellname(1), 4L);
+        new CounterMutation(new Mutation(KS, bytes(1), cells), ConsistencyLevel.ONE).apply();
+
+        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
+        ByteBuffer context = current.getColumn(cellname(1)).value();
+        assertEquals(10L, CounterContext.instance().total(context));
+        assertEquals(ClockAndCount.create(1L, 10L), CounterContext.instance().getLocalClockAndCount(context));
+        assertEquals(ClockAndCount.create(1L, 10L), cfs.getCachedCounter(bytes(1), cellname(1)));
     }
 }
-
diff --git a/test/unit/org/apache/cassandra/db/DirectoriesTest.java b/test/unit/org/apache/cassandra/db/DirectoriesTest.java
index 681951e..9e6b26b 100644
--- a/test/unit/org/apache/cassandra/db/DirectoriesTest.java
+++ b/test/unit/org/apache/cassandra/db/DirectoriesTest.java
@@ -19,33 +19,50 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 
 import org.junit.AfterClass;
-import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Config.DiskFailurePolicy;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Directories.DataDirectory;
-import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class DirectoriesTest
 {
     private static File tempDataDir;
-    private static String KS = "ks";
-    private static String[] CFS = new String[] { "cf1", "ks" };
+    private static final String KS = "ks";
+    private static final String[] CFS = new String[] { "cf1", "ks" };
 
+    private static final Set<CFMetaData> CFM = new HashSet<>(CFS.length);
     private static Map<String, List<File>> files = new HashMap<String, List<File>>();
 
     @BeforeClass
     public static void beforeClass() throws IOException
     {
+        for (String cf : CFS)
+        {
+            CFM.add(new CFMetaData(KS, cf, ColumnFamilyType.Standard, null));
+        }
+
         tempDataDir = File.createTempFile("cassandra", "unittest");
         tempDataDir.delete(); // hack to create a temp dir
         tempDataDir.mkdir();
@@ -64,31 +81,29 @@
 
     private static void createTestFiles() throws IOException
     {
-        for (String cf : CFS)
+        for (CFMetaData cfm : CFM)
         {
-            List<File> fs = new ArrayList<File>();
-            files.put(cf, fs);
-            File dir = cfDir(cf);
+            List<File> fs = new ArrayList<>();
+            files.put(cfm.cfName, fs);
+            File dir = cfDir(cfm);
             dir.mkdirs();
 
-            createFakeSSTable(dir, cf, 1, false, fs);
-            createFakeSSTable(dir, cf, 2, true, fs);
-            // leveled manifest
-            new File(dir, cf + LeveledManifest.EXTENSION).createNewFile();
+            createFakeSSTable(dir, cfm.cfName, 1, false, fs);
+            createFakeSSTable(dir, cfm.cfName, 2, true, fs);
 
             File backupDir = new File(dir, Directories.BACKUPS_SUBDIR);
             backupDir.mkdir();
-            createFakeSSTable(backupDir, cf, 1, false, fs);
+            createFakeSSTable(backupDir, cfm.cfName, 1, false, fs);
 
             File snapshotDir = new File(dir, Directories.SNAPSHOT_SUBDIR + File.separator + "42");
             snapshotDir.mkdirs();
-            createFakeSSTable(snapshotDir, cf, 1, false, fs);
+            createFakeSSTable(snapshotDir, cfm.cfName, 1, false, fs);
         }
     }
 
     private static void createFakeSSTable(File dir, String cf, int gen, boolean temp, List<File> addTo) throws IOException
     {
-        Descriptor desc = new Descriptor(dir, KS, cf, gen, temp);
+        Descriptor desc = new Descriptor(dir, KS, cf, gen, temp ? Descriptor.Type.TEMP : Descriptor.Type.FINAL);
         for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER })
         {
             File f = new File(desc.filenameFor(c));
@@ -97,41 +112,42 @@
         }
     }
 
-    private static File cfDir(String cf)
+    private static File cfDir(CFMetaData metadata)
     {
-        return new File(tempDataDir, KS + File.separator + cf);
+        String cfId = ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(metadata.cfId));
+        return new File(tempDataDir, metadata.ksName + File.separator + metadata.cfName + "-" + cfId);
     }
 
     @Test
     public void testStandardDirs()
     {
-        for (String cf : CFS)
+        for (CFMetaData cfm : CFM)
         {
-            Directories directories = Directories.create(KS, cf);
-            Assert.assertEquals(cfDir(cf), directories.getDirectoryForNewSSTables());
+            Directories directories = new Directories(cfm);
+            assertEquals(cfDir(cfm), directories.getDirectoryForNewSSTables());
 
-            Descriptor desc = new Descriptor(cfDir(cf), KS, cf, 1, false);
-            File snapshotDir = new File(cfDir(cf),  File.separator + Directories.SNAPSHOT_SUBDIR + File.separator + "42");
-            Assert.assertEquals(snapshotDir, directories.getSnapshotDirectory(desc, "42"));
+            Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1, Descriptor.Type.FINAL);
+            File snapshotDir = new File(cfDir(cfm),  File.separator + Directories.SNAPSHOT_SUBDIR + File.separator + "42");
+            assertEquals(snapshotDir, Directories.getSnapshotDirectory(desc, "42"));
 
-            File backupsDir = new File(cfDir(cf),  File.separator + Directories.BACKUPS_SUBDIR);
-            Assert.assertEquals(backupsDir, directories.getBackupsDirectory(desc));
+            File backupsDir = new File(cfDir(cfm),  File.separator + Directories.BACKUPS_SUBDIR);
+            assertEquals(backupsDir, Directories.getBackupsDirectory(desc));
         }
     }
 
     @Test
     public void testSSTableLister()
     {
-        for (String cf : CFS)
+        for (CFMetaData cfm : CFM)
         {
-            Directories directories = Directories.create(KS, cf);
+            Directories directories = new Directories(cfm);
             Directories.SSTableLister lister;
             Set<File> listed;
 
             // List all but no snapshot, backup
             lister = directories.sstableLister();
-            listed = new HashSet<File>(lister.listFiles());
-            for (File f : files.get(cf))
+            listed = new HashSet<>(lister.listFiles());
+            for (File f : files.get(cfm.cfName))
             {
                 if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
                     assert !listed.contains(f) : f + " should not be listed";
@@ -141,8 +157,8 @@
 
             // List all but including backup (but no snapshot)
             lister = directories.sstableLister().includeBackups(true);
-            listed = new HashSet<File>(lister.listFiles());
-            for (File f : files.get(cf))
+            listed = new HashSet<>(lister.listFiles());
+            for (File f : files.get(cfm.cfName))
             {
                 if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR))
                     assert !listed.contains(f) : f + " should not be listed";
@@ -152,8 +168,8 @@
 
             // Skip temporary and compacted
             lister = directories.sstableLister().skipTemporary(true);
-            listed = new HashSet<File>(lister.listFiles());
-            for (File f : files.get(cf))
+            listed = new HashSet<>(lister.listFiles());
+            for (File f : files.get(cfm.cfName))
             {
                 if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
                     assert !listed.contains(f) : f + " should not be listed";
@@ -165,19 +181,9 @@
         }
     }
 
-    @Test
-    public void testLeveledManifestPath()
-    {
-        for (String cf : CFS)
-        {
-            Directories directories = Directories.create(KS, cf);
-            File manifest = new File(cfDir(cf), cf + LeveledManifest.EXTENSION);
-            Assert.assertEquals(manifest, directories.tryGetLeveledManifest());
-        }
-    }
 
     @Test
-    public void testDiskFailurePolicy_best_effort() throws IOException
+    public void testDiskFailurePolicy_best_effort()
     {
         DiskFailurePolicy origPolicy = DatabaseDescriptor.getDiskFailurePolicy();
         
@@ -185,23 +191,24 @@
         {
             DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.best_effort);
             
-            for (DataDirectory dd : Directories.dataFileLocations)
+            for (DataDirectory dd : Directories.dataDirectories)
             {
                 dd.location.setExecutable(false);
                 dd.location.setWritable(false);
             }
-            
-            Directories.create(KS, "bad");
-            
-            for (DataDirectory dd : Directories.dataFileLocations)
+
+            // nested folders under /tmp are enough to trigger a failure on *nix, but we need to exceed the 255-character path limit to get a failure (and a blacklisted directory) on Windows
+            CFMetaData cfm = new CFMetaData(KS, "badbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbadbad", ColumnFamilyType.Standard, null);
+            Directories dir = new Directories(cfm);
+
+            for (File file : dir.getCFDirectories())
             {
-                File file = new File(dd.location, new File(KS, "bad").getPath());
-                Assert.assertTrue(BlacklistedDirectories.isUnwritable(file));
+                assertTrue(BlacklistedDirectories.isUnwritable(file));
             }
         } 
         finally 
         {
-            for (DataDirectory dd : Directories.dataFileLocations)
+            for (DataDirectory dd : Directories.dataDirectories)
             {
                 dd.location.setExecutable(true);
                 dd.location.setWritable(true);
@@ -214,20 +221,20 @@
     @Test
     public void testMTSnapshots() throws Exception
     {
-        for (final String cf : CFS)
+        for (final CFMetaData cfm : CFM)
         {
-            final Directories directories = Directories.create(KS, cf);
-            Assert.assertEquals(cfDir(cf), directories.getDirectoryForNewSSTables());
+            final Directories directories = new Directories(cfm);
+            assertEquals(cfDir(cfm), directories.getDirectoryForNewSSTables());
             final String n = Long.toString(System.nanoTime());
             Callable<File> directoryGetter = new Callable<File>() {
                 public File call() throws Exception {
-                    Descriptor desc = new Descriptor(cfDir(cf), KS, cf, 1, false);
-                    return directories.getSnapshotDirectory(desc, n);
+                    Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1, Descriptor.Type.FINAL);
+                    return Directories.getSnapshotDirectory(desc, n);
                 }
             };
             List<Future<File>> invoked = Executors.newFixedThreadPool(2).invokeAll(Arrays.asList(directoryGetter, directoryGetter));
             for(Future<File> fut:invoked) {
-                Assert.assertTrue(fut.get().exists());
+                assertTrue(fut.get().exists());
             }
         }
     }
diff --git a/test/unit/org/apache/cassandra/db/HintedHandOffTest.java b/test/unit/org/apache/cassandra/db/HintedHandOffTest.java
index 302a1e7..c29c08e 100644
--- a/test/unit/org/apache/cassandra/db/HintedHandOffTest.java
+++ b/test/unit/org/apache/cassandra/db/HintedHandOffTest.java
@@ -29,6 +29,7 @@
 import com.google.common.collect.Iterators;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
 import org.apache.cassandra.db.marshal.Int32Type;
@@ -36,7 +37,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 
 public class HintedHandOffTest extends SchemaLoader
 {
@@ -58,8 +59,8 @@
         hintStore.disableAutoCompaction();
 
         // insert 1 hint
-        RowMutation rm = new RowMutation(KEYSPACE4, ByteBufferUtil.bytes(1));
-        rm.add(STANDARD1_CF, ByteBufferUtil.bytes(String.valueOf(COLUMN1)), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
+        Mutation rm = new Mutation(KEYSPACE4, ByteBufferUtil.bytes(1));
+        rm.add(STANDARD1_CF, Util.cellname(COLUMN1), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
 
         HintedHandOffManager.instance.hintFor(rm,
                                               System.currentTimeMillis(),
@@ -86,7 +87,7 @@
             HintedHandOffManager.instance.metrics.incrPastWindow(InetAddress.getLocalHost());
         HintedHandOffManager.instance.metrics.log();
 
-        UntypedResultSet rows = processInternal("SELECT hints_dropped FROM system." + SystemKeyspace.PEER_EVENTS_CF);
+        UntypedResultSet rows = executeInternal("SELECT hints_dropped FROM system." + SystemKeyspace.PEER_EVENTS_CF);
         Map<UUID, Integer> returned = rows.one().getMap("hints_dropped", UUIDType.instance, Int32Type.instance);
         assertEquals(Iterators.getLast(returned.values().iterator()).intValue(), 99);
     }
@@ -99,8 +100,8 @@
         hintStore.clearUnsafe();
 
         // insert 1 hint
-        RowMutation rm = new RowMutation(KEYSPACE4, ByteBufferUtil.bytes(1));
-        rm.add(STANDARD1_CF, ByteBufferUtil.bytes(String.valueOf(COLUMN1)), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
+        Mutation rm = new Mutation(KEYSPACE4, ByteBufferUtil.bytes(1));
+        rm.add(STANDARD1_CF, Util.cellname(COLUMN1), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
 
         HintedHandOffManager.instance.hintFor(rm,
                                               System.currentTimeMillis(),
@@ -123,7 +124,7 @@
     private int getNoOfHints()
     {
         String req = "SELECT * FROM system.%s";
-        UntypedResultSet resultSet = processInternal(String.format(req, SystemKeyspace.HINTS_CF));
+        UntypedResultSet resultSet = executeInternal(String.format(req, SystemKeyspace.HINTS_CF));
         return resultSet.size();
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/KeyCacheTest.java b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
index 1f41860..c0560ab 100644
--- a/test/unit/org/apache/cassandra/db/KeyCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
@@ -17,20 +17,26 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.IOException;
+import java.nio.file.Files;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
 
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.AfterClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.cache.KeyCacheKey;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
@@ -113,14 +119,14 @@
 
         DecoratedKey key1 = Util.dk("key1");
         DecoratedKey key2 = Util.dk("key2");
-        RowMutation rm;
+        Mutation rm;
 
         // inserts
-        rm = new RowMutation(KEYSPACE1, key1.key);
-        rm.add(COLUMN_FAMILY1, ByteBufferUtil.bytes("1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+        rm = new Mutation(KEYSPACE1, key1.getKey());
+        rm.add(COLUMN_FAMILY1, Util.cellname("1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         rm.apply();
-        rm = new RowMutation(KEYSPACE1, key2.key);
-        rm.add(COLUMN_FAMILY1, ByteBufferUtil.bytes("2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+        rm = new Mutation(KEYSPACE1, key2.getKey());
+        rm.add(COLUMN_FAMILY1, Util.cellname("2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         rm.apply();
 
         // to make sure we have SSTable
@@ -129,45 +135,59 @@
         // reads to cache key position
         cfs.getColumnFamily(QueryFilter.getSliceFilter(key1,
                                                        COLUMN_FAMILY1,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                                       Composites.EMPTY,
+                                                       Composites.EMPTY,
                                                        false,
                                                        10,
                                                        System.currentTimeMillis()));
 
         cfs.getColumnFamily(QueryFilter.getSliceFilter(key2,
                                                        COLUMN_FAMILY1,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                                       Composites.EMPTY,
+                                                       Composites.EMPTY,
                                                        false,
                                                        10,
                                                        System.currentTimeMillis()));
 
         assertKeyCacheSize(2, KEYSPACE1, COLUMN_FAMILY1);
 
+        Set<SSTableReader> readers = cfs.getDataTracker().getSSTables();
+        for (SSTableReader reader : readers)
+            reader.acquireReference();
+
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
-        // after compaction cache should have entries for
-        // new SSTables, if we had 2 keys in cache previously it should become 4
+        // after compaction cache should have entries for new SSTables,
+        // but since we have kept a reference to the old sstables,
+        // if we had 2 keys in cache previously it should become 4
         assertKeyCacheSize(4, KEYSPACE1, COLUMN_FAMILY1);
 
+        for (SSTableReader reader : readers)
+            reader.releaseReference();
+
+        Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS);
+        while (StorageService.tasks.getActiveCount() + StorageService.tasks.getQueue().size() > 0); // spin until the background cleanup of the released readers has drained
+
+        // after releasing the reference this should drop to 2
+        assertKeyCacheSize(2, KEYSPACE1, COLUMN_FAMILY1);
+
         // re-read same keys to verify that key cache didn't grow further
         cfs.getColumnFamily(QueryFilter.getSliceFilter(key1,
                                                        COLUMN_FAMILY1,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                                       Composites.EMPTY,
+                                                       Composites.EMPTY,
                                                        false,
                                                        10,
                                                        System.currentTimeMillis()));
 
         cfs.getColumnFamily(QueryFilter.getSliceFilter(key2,
                                                        COLUMN_FAMILY1,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                                       Composites.EMPTY,
+                                                       Composites.EMPTY,
                                                        false,
                                                        10,
                                                        System.currentTimeMillis()));
 
-        assertKeyCacheSize(4, KEYSPACE1, COLUMN_FAMILY1);
+        assertKeyCacheSize(2, KEYSPACE1, COLUMN_FAMILY1);
     }
 
     private void assertKeyCacheSize(int expected, String keyspace, String columnFamily)
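
The new assertions above depend on SSTableReader reference counting: key-cache entries
for the pre-compaction sstables stay resident while a reference is held and are evicted
once the reference is released and the background cleanup tasks drain. Condensed to a
sketch (all calls taken from the hunk above; runs inside a test that declares the
checked exceptions thrown by the compaction future):

    Set<SSTableReader> readers = cfs.getDataTracker().getSSTables();
    for (SSTableReader reader : readers)
        reader.acquireReference();                 // pin the current sstables

    Util.compactAll(cfs, Integer.MAX_VALUE).get(); // old and new sstables are now both cached

    for (SSTableReader reader : readers)
        reader.releaseReference();                 // allow the old entries to be dropped

    // spin until the background tasks performing the eviction have drained
    while (StorageService.tasks.getActiveCount() + StorageService.tasks.getQueue().size() > 0);
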
diff --git a/test/unit/org/apache/cassandra/db/KeyCollisionTest.java b/test/unit/org/apache/cassandra/db/KeyCollisionTest.java
index 345febd..1869872 100644
--- a/test/unit/org/apache/cassandra/db/KeyCollisionTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCollisionTest.java
@@ -18,7 +18,6 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.IOException;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -26,6 +25,7 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.IntegerType;
@@ -73,23 +73,23 @@
 
         List<Row> rows = cfs.getRangeSlice(new Bounds<RowPosition>(dk("k2"), dk("key2")), null, new IdentityQueryFilter(), 10000);
         assert rows.size() == 4 : "Expecting 4 keys, got " + rows.size();
-        assert rows.get(0).key.key.equals(ByteBufferUtil.bytes("k2"));
-        assert rows.get(1).key.key.equals(ByteBufferUtil.bytes("k3"));
-        assert rows.get(2).key.key.equals(ByteBufferUtil.bytes("key1"));
-        assert rows.get(3).key.key.equals(ByteBufferUtil.bytes("key2"));
+        assert rows.get(0).key.getKey().equals(ByteBufferUtil.bytes("k2"));
+        assert rows.get(1).key.getKey().equals(ByteBufferUtil.bytes("k3"));
+        assert rows.get(2).key.getKey().equals(ByteBufferUtil.bytes("key1"));
+        assert rows.get(3).key.getKey().equals(ByteBufferUtil.bytes("key2"));
     }
 
-    private void insert(String... keys) throws IOException
+    private void insert(String... keys)
     {
         for (String key : keys)
             insert(key);
     }
 
-    private void insert(String key) throws IOException
+    private void insert(String key)
     {
-        RowMutation rm;
-        rm = new RowMutation(KEYSPACE, ByteBufferUtil.bytes(key));
-        rm.add(CF, ByteBufferUtil.bytes("column"), ByteBufferUtil.bytes("asdf"), 0);
+        Mutation rm;
+        rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key));
+        rm.add(CF, Util.cellname("column"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
     }
 
@@ -102,7 +102,7 @@
 
         public DecoratedKey decorateKey(ByteBuffer key)
         {
-            return new DecoratedKey(getToken(key), key);
+            return new BufferDecoratedKey(getToken(key), key);
         }
 
         public Token midpoint(Token ltoken, Token rtoken)
@@ -166,6 +166,12 @@
             return new BigIntegerToken(BigInteger.valueOf(key.remaining()));
         }
 
+        @Override
+        public long getHeapSizeOf(BigIntegerToken token)
+        {
+            return 0;
+        }
+
         public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
         {
             // allTokens will contain the count and be returned, sorted_ranges is shorthand for token<->token math.
@@ -188,7 +194,7 @@
                     for (Range<Token> r : sortedRanges)
                     {
                         // Looping over every KS:CF:Range, get the splits size and add it to the count
-                        allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1, cfmd).size());
+                        allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1).size());
                     }
                 }
             }
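
The insert helper rewritten above shows the basic 2.1 write path used throughout these
tests: Mutation replaces RowMutation, and cell names are CellName objects rather than
raw ByteBuffers, built here through the Util.cellname helper (KEYSPACE and CF are the
constants defined earlier in the file):

    // 2.0: rm = new RowMutation(KEYSPACE, ByteBufferUtil.bytes(key));
    //      rm.add(CF, ByteBufferUtil.bytes("column"), ByteBufferUtil.bytes("asdf"), 0);

    // 2.1 equivalent
    Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key));
    rm.add(CF, Util.cellname("column"), ByteBufferUtil.bytes("asdf"), 0); // name, value, timestamp
    rm.apply();
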
diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index efa43cf..d610563 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
@@ -32,18 +32,16 @@
 import static org.junit.Assert.*;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.IntegerType;
 import org.apache.cassandra.utils.WrappedRunnable;
 import static org.apache.cassandra.Util.column;
 import static org.apache.cassandra.Util.expiringColumn;
+import static org.apache.cassandra.Util.cellname;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 
 public class KeyspaceTest extends SchemaLoader
@@ -64,9 +62,9 @@
         final Keyspace keyspace = Keyspace.open("Keyspace2");
         final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard3");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace2", "Standard3");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace2", "Standard3");
         cf.addColumn(column("col1","val1", 1L));
-        RowMutation rm = new RowMutation("Keyspace2", TEST_KEY.key, cf);
+        Mutation rm = new Mutation("Keyspace2", TEST_KEY.getKey(), cf);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -75,25 +73,13 @@
             {
                 ColumnFamily cf;
 
-                cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(TEST_KEY,
-                                                                        "Standard3",
-                                                                        new TreeSet<ByteBuffer>(),
-                                                                        System.currentTimeMillis()));
+                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY));
                 assertColumns(cf);
 
-                cf = cfStore.getColumnFamily(QueryFilter.getSliceFilter(TEST_KEY,
-                                                                        "Standard3",
-                                                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                        false,
-                                                                        0,
-                                                                        System.currentTimeMillis()));
+                cf = cfStore.getColumnFamily(QueryFilter.getSliceFilter(TEST_KEY, "Standard3", Composites.EMPTY, Composites.EMPTY, false, 0, System.currentTimeMillis()));
                 assertColumns(cf);
 
-                cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(TEST_KEY,
-                                                                        "Standard3",
-                                                                        FBUtilities.singleton(ByteBufferUtil.bytes("col99"), cfStore.getComparator()),
-                                                                        System.currentTimeMillis()));
+                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col99"));
                 assertColumns(cf);
             }
         };
@@ -106,11 +92,11 @@
         final Keyspace keyspace = Keyspace.open("Keyspace1");
         final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1","val1", 1L));
         cf.addColumn(column("col2","val2", 1L));
         cf.addColumn(column("col3","val3", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", TEST_KEY.key, cf);
+        Mutation rm = new Mutation("Keyspace1", TEST_KEY.getKey(), cf);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -119,16 +105,10 @@
             {
                 ColumnFamily cf;
 
-                cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(TEST_KEY,
-                                                                        "Standard1",
-                                                                        FBUtilities.singleton(ByteBufferUtil.bytes("col1"), cfStore.getComparator()),
-                                                                        System.currentTimeMillis()));
+                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col1"));
                 assertColumns(cf, "col1");
 
-                cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(TEST_KEY,
-                                                                        "Standard1",
-                                                                        FBUtilities.singleton(ByteBufferUtil.bytes("col3"), cfStore.getComparator()),
-                                                                        System.currentTimeMillis()));
+                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col3"));
                 assertColumns(cf, "col3");
             }
         };
@@ -141,21 +121,21 @@
     	DecoratedKey key = TEST_SLICE_KEY;
     	Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         // First write "a", "b", "c"
         cf.addColumn(column("a", "val1", 1L));
         cf.addColumn(column("b", "val2", 1L));
         cf.addColumn(column("c", "val3", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", key.key, cf);
+        Mutation rm = new Mutation("Keyspace1", key.getKey(), cf);
         rm.apply();
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("b"), ByteBufferUtil.bytes("c"), false, 100, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("c"), false, 100, System.currentTimeMillis());
         assertEquals(2, cf.getColumnCount());
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("b"), ByteBufferUtil.bytes("b"), false, 100, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("b"), false, 100, System.currentTimeMillis());
         assertEquals(1, cf.getColumnCount());
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("b"), ByteBufferUtil.bytes("c"), false, 1, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("c"), false, 1, System.currentTimeMillis());
         assertEquals(1, cf.getColumnCount());
     }
 
@@ -163,9 +143,9 @@
     public void testGetSliceNoMatch() throws Throwable
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard2");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard2");
         cf.addColumn(column("col1", "val1", 1));
-        RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("row1000"), cf);
+        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("row1000"), cf);
         rm.apply();
 
         validateGetSliceNoMatch(keyspace);
@@ -187,12 +167,12 @@
         final DecoratedKey ROW = Util.dk("row4");
         final NumberFormat fmt = new DecimalFormat("000");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         // at this rate, we're getting 78-79 cols/block, assuming the blocks are set to be about 4k.
         // so if we go to 300, we'll get at least 4 blocks, which is plenty for testing.
         for (int i = 0; i < 300; i++)
             cf.addColumn(column("col" + fmt.format(i), "omg!thisisthevalue!"+i, 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+        Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -205,30 +185,30 @@
                 assert DatabaseDescriptor.getColumnIndexSize() == 4096 : "Unexpected column index size, block boundaries won't be where tests expect them.";
 
                 // test forward, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col096"), ByteBufferUtil.bytes("col099"), false, 4, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col096"), cellname("col099"), false, 4, System.currentTimeMillis());
                 assertColumns(cf, "col096", "col097", "col098", "col099");
 
                 // test reversed, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col099"), ByteBufferUtil.bytes("col096"), true, 4, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col099"), cellname("col096"), true, 4, System.currentTimeMillis());
                 assertColumns(cf, "col096", "col097", "col098", "col099");
 
                 // test forward, within a segment.
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col100"), ByteBufferUtil.bytes("col103"), false, 4, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col100"), cellname("col103"), false, 4, System.currentTimeMillis());
                 assertColumns(cf, "col100", "col101", "col102", "col103");
 
                 // test reversed, within a segment.
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col103"), ByteBufferUtil.bytes("col100"), true, 4, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col103"), cellname("col100"), true, 4, System.currentTimeMillis());
                 assertColumns(cf, "col100", "col101", "col102", "col103");
 
                 // test forward from beginning, spanning a segment.
                 String[] strCols = new String[100]; // col000-col099
                 for (int i = 0; i < 100; i++)
                     strCols[i] = "col" + fmt.format(i);
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.bytes("col099"), false, 100, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, cellname("col099"), false, 100, System.currentTimeMillis());
                 assertColumns(cf, strCols);
 
                 // test reversed, from end, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.bytes("col288"), true, 12, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, cellname("col288"), true, 12, System.currentTimeMillis());
                 assertColumns(cf, "col288", "col289", "col290", "col291", "col292", "col293", "col294", "col295", "col296", "col297", "col298", "col299");
             }
         };
@@ -245,9 +225,9 @@
 
         for (int i = 0; i < 10; i++)
         {
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "StandardLong1");
-            cf.addColumn(new Column(ByteBufferUtil.bytes((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
-            RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "StandardLong1");
+            cf.addColumn(new BufferCell(cellname((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
+            Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
             rm.apply();
         }
 
@@ -255,14 +235,14 @@
 
         for (int i = 10; i < 20; i++)
         {
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "StandardLong1");
-            cf.addColumn(new Column(ByteBufferUtil.bytes((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
-            RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "StandardLong1");
+            cf.addColumn(new BufferCell(cellname((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
+            Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
             rm.apply();
 
-            cf = cfs.getColumnFamily(ROW, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 1, System.currentTimeMillis());
+            cf = cfs.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, true, 1, System.currentTimeMillis());
             assertEquals(1, Iterables.size(cf.getColumnNames()));
-            assertEquals(i, cf.getColumnNames().iterator().next().getLong());
+            assertEquals(i, cf.getColumnNames().iterator().next().toByteBuffer().getLong());
         }
     }
 
@@ -272,11 +252,11 @@
         ColumnFamily cf;
 
         // key before the rows that exists
-        cf = cfStore.getColumnFamily(Util.dk("a"), ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(Util.dk("a"), Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
         assertColumns(cf);
 
         // key after the rows that exist
-        cf = cfStore.getColumnFamily(Util.dk("z"), ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(Util.dk("z"), Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
         assertColumns(cf);
     }
 
@@ -288,18 +268,18 @@
         final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
         final DecoratedKey ROW = Util.dk("row1");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
         cf.addColumn(column("col3", "val3", 1L));
         cf.addColumn(column("col4", "val4", 1L));
         cf.addColumn(column("col5", "val5", 1L));
         cf.addColumn(column("col7", "val7", 1L));
         cf.addColumn(column("col9", "val9", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+        Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
         rm.apply();
 
-        rm = new RowMutation("Keyspace1", ROW.key);
-        rm.delete("Standard1", ByteBufferUtil.bytes("col4"), 2L);
+        rm = new Mutation("Keyspace1", ROW.getKey());
+        rm.delete("Standard1", cellname("col4"), 2L);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -308,26 +288,26 @@
             {
                 ColumnFamily cf;
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col5"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col5"), Composites.EMPTY, false, 2, System.currentTimeMillis());
                 assertColumns(cf, "col5", "col7");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col4"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col4"), Composites.EMPTY, false, 2, System.currentTimeMillis());
                 assertColumns(cf, "col4", "col5", "col7");
                 assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE), "col5", "col7");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col5"), ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col5"), Composites.EMPTY, true, 2, System.currentTimeMillis());
                 assertColumns(cf, "col3", "col4", "col5");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col6"), ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col6"), Composites.EMPTY, true, 2, System.currentTimeMillis());
                 assertColumns(cf, "col3", "col4", "col5");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, true, 2, System.currentTimeMillis());
                 assertColumns(cf, "col7", "col9");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col95"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col95"), Composites.EMPTY, false, 2, System.currentTimeMillis());
                 assertColumns(cf);
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col0"), Composites.EMPTY, true, 2, System.currentTimeMillis());
                 assertColumns(cf);
             }
         };
@@ -343,11 +323,11 @@
         final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
         final DecoratedKey ROW = Util.dk("row5");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
         cf.addColumn(expiringColumn("col2", "val2", 1L, 60)); // long enough not to be tombstoned
         cf.addColumn(column("col3", "val3", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+        Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -356,11 +336,11 @@
             {
                 ColumnFamily cf;
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 2, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, false, 2, System.currentTimeMillis());
                 assertColumns(cf, "col1", "col2");
                 assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE), "col1");
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col2"), Composites.EMPTY, false, 1, System.currentTimeMillis());
                 assertColumns(cf, "col2");
                 assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE));
             }
@@ -377,22 +357,22 @@
         final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
         final DecoratedKey ROW = Util.dk("row2");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
         cf.addColumn(column("col2", "val2", 1L));
         cf.addColumn(column("col3", "val3", 1L));
         cf.addColumn(column("col4", "val4", 1L));
         cf.addColumn(column("col5", "val5", 1L));
         cf.addColumn(column("col6", "val6", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ROW.key, cf);
+        Mutation rm = new Mutation("Keyspace1", ROW.getKey(), cf);
         rm.apply();
         cfStore.forceBlockingFlush();
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "valx", 2L));
         cf.addColumn(column("col2", "valx", 2L));
         cf.addColumn(column("col3", "valx", 2L));
-        rm = new RowMutation("Keyspace1", ROW.key, cf);
+        rm = new Mutation("Keyspace1", ROW.getKey(), cf);
         rm.apply();
 
         Runnable verify = new WrappedRunnable()
@@ -401,16 +381,16 @@
             {
                 ColumnFamily cf;
 
-                cf = cfStore.getColumnFamily(ROW, ByteBufferUtil.bytes("col2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 3, System.currentTimeMillis());
+                cf = cfStore.getColumnFamily(ROW, cellname("col2"), Composites.EMPTY, false, 3, System.currentTimeMillis());
                 assertColumns(cf, "col2", "col3", "col4");
 
-                ByteBuffer col = cf.getColumn(ByteBufferUtil.bytes("col2")).value();
+                ByteBuffer col = cf.getColumn(cellname("col2")).value();
                 assertEquals(ByteBufferUtil.string(col), "valx");
 
-                col = cf.getColumn(ByteBufferUtil.bytes("col3")).value();
+                col = cf.getColumn(cellname("col3")).value();
                 assertEquals(ByteBufferUtil.string(col), "valx");
 
-                col = cf.getColumn(ByteBufferUtil.bytes("col4")).value();
+                col = cf.getColumn(cellname("col4")).value();
                 assertEquals(ByteBufferUtil.string(col), "val4");
             }
         };
@@ -425,10 +405,10 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
         DecoratedKey key = Util.dk("row3");
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         for (int i = 1000; i < 2000; i++)
             cf.addColumn(column("col" + i, ("v" + i), 1L));
-        RowMutation rm = new RowMutation("Keyspace1", key.key, cf);
+        Mutation rm = new Mutation("Keyspace1", key.getKey(), cf);
         rm.apply();
         cfStore.forceBlockingFlush();
 
@@ -456,42 +436,42 @@
         DecoratedKey key = Util.dk("row_maxmin");
         for (int j = 0; j < 10; j++)
         {
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
             for (int i = 1000 + (j*100); i < 1000 + ((j+1)*100); i++)
             {
                 cf.addColumn(column("col" + i, ("v" + i), i));
             }
-            RowMutation rm = new RowMutation("Keyspace1", key.key, cf);
+            Mutation rm = new Mutation("Keyspace1", key.getKey(), cf);
             rm.apply();
             cfStore.forceBlockingFlush();
         }
         cfStore.metric.sstablesPerReadHistogram.cf.clear();
-        ColumnFamily cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes(""), ByteBufferUtil.bytes("col1499"), false, 1000, System.currentTimeMillis());
+        ColumnFamily cf = cfStore.getColumnFamily(key, Composites.EMPTY, cellname("col1499"), false, 1000, System.currentTimeMillis());
         assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.max(), 5, 0.1);
         int i = 0;
-        for (Column c : cf.getSortedColumns())
+        for (Cell c : cf.getSortedColumns())
         {
-            assertEquals(ByteBufferUtil.string(c.name), "col" + (1000 + i++));
+            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col" + (1000 + i++));
         }
         assertEquals(i, 500);
         cfStore.metric.sstablesPerReadHistogram.cf.clear();
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col1500"), ByteBufferUtil.bytes("col2000"), false, 1000, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col1500"), cellname("col2000"), false, 1000, System.currentTimeMillis());
         assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.max(), 5, 0.1);
 
-        for (Column c : cf.getSortedColumns())
+        for (Cell c : cf.getSortedColumns())
         {
-            assertEquals(ByteBufferUtil.string(c.name), "col"+(1000 + i++));
+            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col"+(1000 + i++));
         }
         assertEquals(i, 1000);
 
         // reverse
         cfStore.metric.sstablesPerReadHistogram.cf.clear();
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col2000"), ByteBufferUtil.bytes("col1500"), true, 1000, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col2000"), cellname("col1500"), true, 1000, System.currentTimeMillis());
         assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.max(), 5, 0.1);
         i = 500;
-        for (Column c : cf.getSortedColumns())
+        for (Cell c : cf.getSortedColumns())
         {
-            assertEquals(ByteBufferUtil.string(c.name), "col"+(1000 + i++));
+            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col"+(1000 + i++));
         }
         assertEquals(i, 1000);
 
@@ -519,25 +499,25 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardComposite2");
         cfs.disableAutoCompaction();
 
-        CompositeType ct = CompositeType.getInstance(BytesType.instance, IntegerType.instance);
+        CellNameType type = cfs.getComparator();
         DecoratedKey key = Util.dk("k");
         for (int j = 0; j < 10; j++)
         {
             for (int i = 0; i < 10; i++)
             {
-                RowMutation rm = new RowMutation("Keyspace1", key.key);
-                ByteBuffer colName = ct.builder().add(ByteBufferUtil.bytes("a" + i)).add(ByteBufferUtil.bytes(j*10 + i)).build();
+                Mutation rm = new Mutation("Keyspace1", key.getKey());
+                CellName colName = type.makeCellName(ByteBufferUtil.bytes("a" + i), ByteBufferUtil.bytes(j*10 + i));
                 rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
                 rm.apply();
             }
             cfs.forceBlockingFlush();
         }
-        ByteBuffer start = ct.builder().add(ByteBufferUtil.bytes("a5")).add(ByteBufferUtil.bytes(85)).build();
-        ByteBuffer finish = ct.builder().add(ByteBufferUtil.bytes("a5")).buildAsEndOfRange();
+        Composite start = type.builder().add(ByteBufferUtil.bytes("a5")).add(ByteBufferUtil.bytes(85)).build();
+        Composite finish = type.builder().add(ByteBufferUtil.bytes("a5")).build().end();
         cfs.metric.sstablesPerReadHistogram.cf.clear();
         ColumnFamily cf = cfs.getColumnFamily(key, start, finish, false, 1000, System.currentTimeMillis());
         int colCount = 0;
-        for (Column c : cf)
+        for (Cell c : cf)
             colCount++;
         assertEquals(2, colCount);
         assertEquals(2, cfs.metric.sstablesPerReadHistogram.cf.max(), 0.1);
@@ -547,77 +527,70 @@
     {
         DecoratedKey key = Util.dk("row3");
         ColumnFamily cf;
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col1000"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col1000"), Composites.EMPTY, false, 3, System.currentTimeMillis());
         assertColumns(cf, "col1000", "col1001", "col1002");
 
         ByteBuffer col;
-        col = cf.getColumn(ByteBufferUtil.bytes("col1000")).value();
+        col = cf.getColumn(cellname("col1000")).value();
         assertEquals(ByteBufferUtil.string(col), "v1000");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1001")).value();
+        col = cf.getColumn(cellname("col1001")).value();
         assertEquals(ByteBufferUtil.string(col), "v1001");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1002")).value();
+        col = cf.getColumn(cellname("col1002")).value();
         assertEquals(ByteBufferUtil.string(col), "v1002");
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col1195"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col1195"), Composites.EMPTY, false, 3, System.currentTimeMillis());
         assertColumns(cf, "col1195", "col1196", "col1197");
 
-        col = cf.getColumn(ByteBufferUtil.bytes("col1195")).value();
+        col = cf.getColumn(cellname("col1195")).value();
         assertEquals(ByteBufferUtil.string(col), "v1195");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1196")).value();
+        col = cf.getColumn(cellname("col1196")).value();
         assertEquals(ByteBufferUtil.string(col), "v1196");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1197")).value();
+        col = cf.getColumn(cellname("col1197")).value();
         assertEquals(ByteBufferUtil.string(col), "v1197");
 
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col1996"), ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 1000, System.currentTimeMillis());
-        Column[] columns = cf.getSortedColumns().toArray(new Column[0]);
+        cf = cfStore.getColumnFamily(key, cellname("col1996"), Composites.EMPTY, true, 1000, System.currentTimeMillis());
+        Cell[] cells = cf.getSortedColumns().toArray(new Cell[0]);
         for (int i = 1000; i < 1996; i++)
         {
             String expectedName = "col" + i;
-            Column column = columns[i - 1000];
-            assertEquals(ByteBufferUtil.string(column.name()), expectedName);
-            assertEquals(ByteBufferUtil.string(column.value()), ("v" + i));
+            Cell cell = cells[i - 1000];
+            assertEquals(ByteBufferUtil.string(cell.name().toByteBuffer()), expectedName);
+            assertEquals(ByteBufferUtil.string(cell.value()), ("v" + i));
         }
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col1990"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col1990"), Composites.EMPTY, false, 3, System.currentTimeMillis());
         assertColumns(cf, "col1990", "col1991", "col1992");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1990")).value();
+        col = cf.getColumn(cellname("col1990")).value();
         assertEquals(ByteBufferUtil.string(col), "v1990");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1991")).value();
+        col = cf.getColumn(cellname("col1991")).value();
         assertEquals(ByteBufferUtil.string(col), "v1991");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1992")).value();
+        col = cf.getColumn(cellname("col1992")).value();
         assertEquals(ByteBufferUtil.string(col), "v1992");
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, true, 3, System.currentTimeMillis());
         assertColumns(cf, "col1997", "col1998", "col1999");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1997")).value();
+        col = cf.getColumn(cellname("col1997")).value();
         assertEquals(ByteBufferUtil.string(col), "v1997");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1998")).value();
+        col = cf.getColumn(cellname("col1998")).value();
         assertEquals(ByteBufferUtil.string(col), "v1998");
-        col = cf.getColumn(ByteBufferUtil.bytes("col1999")).value();
+        col = cf.getColumn(cellname("col1999")).value();
         assertEquals(ByteBufferUtil.string(col), "v1999");
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col9000"), ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col9000"), Composites.EMPTY, true, 3, System.currentTimeMillis());
         assertColumns(cf, "col1997", "col1998", "col1999");
 
-        cf = cfStore.getColumnFamily(key, ByteBufferUtil.bytes("col9000"), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 3, System.currentTimeMillis());
+        cf = cfStore.getColumnFamily(key, cellname("col9000"), Composites.EMPTY, false, 3, System.currentTimeMillis());
         assertColumns(cf);
     }
 
     public static void assertColumns(ColumnFamily container, String... columnNames)
     {
-        Collection<Column> columns = container == null ? new TreeSet<Column>() : container.getSortedColumns();
+        Collection<Cell> cells = container == null ? new TreeSet<Cell>() : container.getSortedColumns();
         List<String> L = new ArrayList<String>();
-        for (Column column : columns)
+        for (Cell cell : cells)
         {
-            try
-            {
-                L.add(ByteBufferUtil.string(column.name()));
-            }
-            catch (CharacterCodingException e)
-            {
-                throw new AssertionError(e);
-            }
+            L.add(Util.string(cell.name().toByteBuffer()));
         }
 
         List<String> names = new ArrayList<String>(columnNames.length);
@@ -625,23 +598,23 @@
         names.addAll(Arrays.asList(columnNames));
 
         String[] columnNames1 = names.toArray(new String[0]);
-        String[] la = L.toArray(new String[columns.size()]);
+        String[] la = L.toArray(new String[cells.size()]);
 
         assert Arrays.equals(la, columnNames1)
                 : String.format("Columns [%s])] is not expected [%s]",
-                                ((container == null) ? "" : container.getComparator().getColumnsString(columns)),
+                                ((container == null) ? "" : CellNames.getColumnsString(container.getComparator(), cells)),
                                 StringUtils.join(columnNames1, ","));
     }
 
     public static void assertColumn(ColumnFamily cf, String name, String value, long timestamp)
     {
-        assertColumn(cf.getColumn(ByteBufferUtil.bytes(name)), value, timestamp);
+        assertColumn(cf.getColumn(cellname(name)), value, timestamp);
     }
 
-    public static void assertColumn(Column column, String value, long timestamp)
+    public static void assertColumn(Cell cell, String value, long timestamp)
     {
-        assertNotNull(column);
-        assertEquals(0, ByteBufferUtil.compareUnsigned(column.value(), ByteBufferUtil.bytes(value)));
-        assertEquals(timestamp, column.timestamp());
+        assertNotNull(cell);
+        assertEquals(0, ByteBufferUtil.compareUnsigned(cell.value(), ByteBufferUtil.bytes(value)));
+        assertEquals(timestamp, cell.timestamp());
     }
 }
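
Slice reads in this file change in the same way: the start/finish bounds passed to
ColumnFamilyStore.getColumnFamily are now Composites (Composites.EMPTY for an open end,
Util.cellname(...) for a concrete bound), and individual cells are looked up by
CellName. A condensed sketch, assuming a ColumnFamilyStore cfStore and DecoratedKey key
as in the tests above:

    // forward slice from "b" to "c", limit 100
    ColumnFamily cf = cfStore.getColumnFamily(key, cellname("b"), cellname("c"), false, 100, System.currentTimeMillis());

    // reversed slice with both ends open: the last two cells of the row
    cf = cfStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, true, 2, System.currentTimeMillis());

    // point lookup of a cell's value (assumes the cell exists)
    ByteBuffer val = cf.getColumn(cellname("b")).value();
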
diff --git a/test/unit/org/apache/cassandra/db/MultitableTest.java b/test/unit/org/apache/cassandra/db/MultitableTest.java
index 3d0c8ac..cc11163 100644
--- a/test/unit/org/apache/cassandra/db/MultitableTest.java
+++ b/test/unit/org/apache/cassandra/db/MultitableTest.java
@@ -20,10 +20,6 @@
  *
  */
 
-
-import java.io.IOException;
-import java.util.concurrent.ExecutionException;
-
 import org.apache.cassandra.Util;
 import org.junit.Test;
 
@@ -34,23 +30,23 @@
 public class MultitableTest extends SchemaLoader
 {
     @Test
-    public void testSameCFs() throws IOException, ExecutionException, InterruptedException
+    public void testSameCFs()
     {
         Keyspace keyspace1 = Keyspace.open("Keyspace1");
         Keyspace keyspace2 = Keyspace.open("Keyspace2");
 
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("keymulti");
         ColumnFamily cf;
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
-        rm = new RowMutation("Keyspace1", dk.key, cf);
+        rm = new Mutation("Keyspace1", dk.getKey(), cf);
         rm.apply();
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace2", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace2", "Standard1");
         cf.addColumn(column("col2", "val2", 1L));
-        rm = new RowMutation("Keyspace2", dk.key, cf);
+        rm = new Mutation("Keyspace2", dk.getKey(), cf);
         rm.apply();
 
         keyspace1.getColumnFamilyStore("Standard1").forceBlockingFlush();
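
MultitableTest picks up the other substitution that recurs throughout this merge: test
ColumnFamily instances are built with ArrayBackedSortedColumns.factory instead of
TreeMapBackedSortedColumns.factory, and row keys are read via DecoratedKey.getKey().
A minimal sketch of the write path in that form (column(...) is the Util.column
name/value/timestamp helper already imported by the test):

    ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
    cf.addColumn(column("col1", "val1", 1L));
    Mutation rm = new Mutation("Keyspace1", dk.getKey(), cf);
    rm.apply();
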
diff --git a/test/unit/org/apache/cassandra/db/NameSortTest.java b/test/unit/org/apache/cassandra/db/NameSortTest.java
index 1a8905e..6bd71c7 100644
--- a/test/unit/org/apache/cassandra/db/NameSortTest.java
+++ b/test/unit/org/apache/cassandra/db/NameSortTest.java
@@ -60,21 +60,21 @@
         for (int i = 0; i < N; ++i)
         {
             ByteBuffer key = ByteBufferUtil.bytes(Integer.toString(i));
-            RowMutation rm;
+            Mutation rm;
 
             // standard
             for (int j = 0; j < 8; ++j)
             {
                 ByteBuffer bytes = j % 2 == 0 ? ByteBufferUtil.bytes("a") : ByteBufferUtil.bytes("b");
-                rm = new RowMutation("Keyspace1", key);
-                rm.add("Standard1", ByteBufferUtil.bytes(("Column-" + j)), bytes, j);
+                rm = new Mutation("Keyspace1", key);
+                rm.add("Standard1", Util.cellname("Cell-" + j), bytes, j);
                 rm.applyUnsafe();
             }
 
             // super
             for (int j = 0; j < 8; ++j)
             {
-                rm = new RowMutation("Keyspace1", key);
+                rm = new Mutation("Keyspace1", key);
                 for (int k = 0; k < 4; ++k)
                 {
                     String value = (j + k) % 2 == 0 ? "a" : "b";
@@ -99,13 +99,13 @@
             ColumnFamily cf;
 
             cf = Util.getColumnFamily(keyspace, key, "Standard1");
-            Collection<Column> columns = cf.getSortedColumns();
-            for (Column column : columns)
+            Collection<Cell> cells = cf.getSortedColumns();
+            for (Cell cell : cells)
             {
-                String name = ByteBufferUtil.string(column.name());
+                String name = ByteBufferUtil.string(cell.name().toByteBuffer());
                 int j = Integer.valueOf(name.substring(name.length() - 1));
                 byte[] bytes = j % 2 == 0 ? "a".getBytes() : "b".getBytes();
-                assertEquals(new String(bytes), ByteBufferUtil.string(column.value()));
+                assertEquals(new String(bytes), ByteBufferUtil.string(cell.value()));
             }
         }
     }
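
The read side of the same rename: Column becomes Cell, and Cell.name() now returns a
CellName rather than a ByteBuffer, so a readable name is recovered through
toByteBuffer(). A small sketch of the updated iteration (ByteBufferUtil.string throws
CharacterCodingException, which the test's throws clause already covers):

    for (Cell cell : cf.getSortedColumns())
    {
        String name  = ByteBufferUtil.string(cell.name().toByteBuffer());
        String value = ByteBufferUtil.string(cell.value());
        // in NameSortTest, even-numbered cells hold "a" and odd-numbered cells hold "b"
    }
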
diff --git a/test/unit/org/apache/cassandra/db/NativeCellTest.java b/test/unit/org/apache/cassandra/db/NativeCellTest.java
new file mode 100644
index 0000000..6a2bf73
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/NativeCellTest.java
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
+import org.apache.cassandra.db.composites.CompoundSparseCellNameType;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.db.composites.SimpleSparseCellNameType;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+import org.apache.cassandra.utils.memory.NativePool;
+
+import static org.apache.cassandra.db.composites.CellNames.compositeDense;
+import static org.apache.cassandra.db.composites.CellNames.compositeSparse;
+import static org.apache.cassandra.db.composites.CellNames.simpleDense;
+import static org.apache.cassandra.db.composites.CellNames.simpleSparse;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class NativeCellTest
+{
+
+    private static final NativeAllocator nativeAllocator = new NativePool(Integer.MAX_VALUE, Integer.MAX_VALUE, 1f, null).newAllocator();
+    private static final OpOrder.Group group = new OpOrder().start();
+
+    static class Name
+    {
+        final CellName name;
+        final CellNameType type;
+        Name(CellName name, CellNameType type)
+        {
+            this.name = name;
+            this.type = type;
+        }
+    }
+
+    static ByteBuffer[] bytess(String ... strings)
+    {
+        ByteBuffer[] r = new ByteBuffer[strings.length];
+        for (int i = 0 ; i < r.length ; i++)
+            r[i] = bytes(strings[i]);
+        return r;
+    }
+
+    final static Name[] TESTS = new Name[]
+                          {
+                              new Name(simpleDense(bytes("a")), new SimpleDenseCellNameType(UTF8Type.instance)),
+                              new Name(simpleSparse(new ColumnIdentifier("a", true)), new SimpleSparseCellNameType(UTF8Type.instance)),
+                              new Name(compositeDense(bytes("a"), bytes("b")), new CompoundDenseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
+                              new Name(compositeSparse(bytess("b", "c"), new ColumnIdentifier("a", true), false), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
+                              new Name(compositeSparse(bytess("b", "c"), new ColumnIdentifier("a", true), true), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance)))
+                          };
+
+    private static final CFMetaData metadata = new CFMetaData("", "", ColumnFamilyType.Standard, null);
+    static
+    {
+        try
+        {
+            metadata.addColumnDefinition(new ColumnDefinition(null, null, new ColumnIdentifier("a", true), UTF8Type.instance, null, null, null, null, null));
+        }
+        catch (ConfigurationException e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    @Test
+    public void testCells() throws IOException
+    {
+        Random rand = ThreadLocalRandom.current();
+        for (Name test : TESTS)
+        {
+            byte[] bytes = new byte[16];
+            rand.nextBytes(bytes);
+
+            // test regular Cell
+            Cell buf, nat;
+            buf = new BufferCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong());
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+            test(test, buf, nat);
+
+            // test DeletedCell
+            buf = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+            test(test, buf, nat);
+
+            // test ExpiringCell
+            buf = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong(),  rand.nextInt(100000));
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+            test(test, buf, nat);
+
+            // test CounterCell
+            buf = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+            test(test, buf, nat);
+        }
+    }
+
+
+    @Test
+    public void testComparator()
+    {
+
+        Random rand = ThreadLocalRandom.current();
+        for (Name test : TESTS)
+        {
+            byte[] bytes = new byte[7];
+            byte[] bytes2 = new byte[7];
+            rand.nextBytes(bytes);
+            rand.nextBytes(bytes2);
+
+            // test regular Cell
+            Cell buf, nat, buf2, nat2;
+            buf = new BufferCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong());
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+
+            buf2 = new BufferCell(test.name, ByteBuffer.wrap(bytes2), rand.nextLong());
+            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
+
+            assert test.type.compare(buf.name(), nat.name()) == 0;
+            assert test.type.compare(buf2.name(), nat2.name()) == 0;
+
+            int val = test.type.compare(buf.name(), buf2.name());
+            assert test.type.compare(nat.name(), nat2.name()) == val;
+            assert test.type.compare(nat.name(), buf2.name()) == val;
+            assert test.type.compare(buf.name(), nat2.name()) == val;
+
+
+            // test DeletedCell
+            buf = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+            buf2 = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
+            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
+
+            assert test.type.compare(buf.name(), nat.name()) == 0;
+            assert test.type.compare(buf2.name(), nat2.name()) == 0;
+
+            val = test.type.compare(buf.name(), buf2.name());
+            assert test.type.compare(nat.name(), nat2.name()) == val;
+            assert test.type.compare(nat.name(), buf2.name()) == val;
+            assert test.type.compare(buf.name(), nat2.name()) == val;
+
+
+
+            // test ExpiringCell
+            buf = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong(),  rand.nextInt(100000));
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+
+            buf2 = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes2), rand.nextLong(),  rand.nextInt(100000));
+            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
+
+            assert test.type.compare(buf.name(), nat.name()) == 0;
+            assert test.type.compare(buf2.name(), nat2.name()) == 0;
+
+            val = test.type.compare(buf.name(), buf2.name());
+            assert test.type.compare(nat.name(), nat2.name()) == val;
+            assert test.type.compare(nat.name(), buf2.name()) == val;
+            assert test.type.compare(buf.name(), nat2.name()) == val;
+
+
+            // test CounterCell
+            buf = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
+            nat = buf.localCopy(metadata, nativeAllocator, group);
+
+            buf2 = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
+            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
+
+            assert test.type.compare(buf.name(), nat.name()) == 0;
+            assert test.type.compare(buf2.name(), nat2.name()) == 0;
+
+            val = test.type.compare(buf.name(), buf2.name());
+            assert test.type.compare(nat.name(), nat2.name()) == val;
+            assert test.type.compare(nat.name(), buf2.name()) == val;
+            assert test.type.compare(buf.name(), nat2.name()) == val;
+
+        }
+    }
+
+    static void test(Name test, Cell buf, Cell nat) throws IOException
+    {
+        Assert.assertTrue(buf.equals(nat));
+        Assert.assertTrue(nat.equals(buf));
+        Assert.assertTrue(buf.equals(buf));
+        Assert.assertTrue(nat.equals(nat));
+
+        try
+        {
+            MessageDigest d1 = MessageDigest.getInstance("MD5");
+            MessageDigest d2 = MessageDigest.getInstance("MD5");
+            buf.updateDigest(d1);
+            nat.updateDigest(d2);
+            Assert.assertArrayEquals(d1.digest(), d2.digest());
+        }
+        catch (NoSuchAlgorithmException e)
+        {
+            throw new IllegalStateException(e);
+        }
+
+        byte[] serialized;
+        try (DataOutputBuffer bufOut = new DataOutputBuffer())
+        {
+            test.type.columnSerializer().serialize(nat, bufOut);
+            serialized = bufOut.getData();
+        }
+
+        ByteArrayInputStream bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
+        Cell deserialized = test.type.columnSerializer().deserialize(new DataInputStream(bufIn));
+        Assert.assertTrue(buf.equals(deserialized));
+
+    }
+
+
+
+}
diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java
index b0065e0..712cfa2 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java
@@ -18,21 +18,193 @@
 */
 package org.apache.cassandra.db;
 
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import org.junit.Test;
 import static org.junit.Assert.*;
 
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.marshal.IntegerType;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class RangeTombstoneListTest
 {
-    private static final Comparator<ByteBuffer> cmp = IntegerType.instance;
+    private static final Comparator<Composite> cmp = new SimpleDenseCellNameType(IntegerType.instance);
     private static final Random rand = new Random();
 
     @Test
+    public void testDiff()
+    {
+        RangeTombstoneList superset;
+        RangeTombstoneList subset;
+        RangeTombstoneList diff;
+        Iterator<RangeTombstone> iter;
+
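+        // Each case builds a "subset" and a "superset" list and checks that subset.diff(superset)
+        // returns exactly the superset ranges the subset does not already cover (or covers with a
+        // different timestamp), and null when the two lists are identical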
+        // no difference
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        assertNull(subset.diff(superset));
+
+        // all items in subset are contained by the first range in the superset
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        subset.add(rt(1, 2, 3));
+        subset.add(rt(3, 4, 4));
+        subset.add(rt(5, 6, 5));
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(1, 10, 10), iter.next());
+        assertRT(rt(20, 30, 10), iter.next());
+        assertRT(rt(40, 50, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // multiple subset RTs are contained by superset RTs
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        subset.add(rt(1, 2, 1));
+        subset.add(rt(3, 4, 2));
+        subset.add(rt(5, 6, 3));
+        superset.add(rt(1, 5, 2));
+        superset.add(rt(5, 6, 3));
+        superset.add(rt(6, 10, 2));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(1, 5, 2), iter.next());
+        assertRT(rt(6, 10, 2), iter.next());
+        assertFalse(iter.hasNext());
+
+        // the superset has one RT that covers the entire subset
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(1, 50, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // the superset has one RT that covers the remainder of the subset
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(20, 50, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // only the timestamp differs on one RT
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 20));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(20, 30, 20), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has a larger range on an RT at the start
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 2, 3));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(1, 10, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has a larger range on an RT in the middle
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 25, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(20, 30, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has a larger range on an RT at the end
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 55, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(40, 55, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has one additional RT in the middle
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(20, 30, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has one additional RT at the start
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(20, 30, 10));
+        subset.add(rt(40, 50, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(1, 10, 10), iter.next());
+        assertFalse(iter.hasNext());
+
+        // superset has one additional RT at the end
+        superset = new RangeTombstoneList(cmp, 10);
+        subset = new RangeTombstoneList(cmp, 10);
+        superset.add(rt(1, 10, 10));
+        superset.add(rt(20, 30, 10));
+        superset.add(rt(40, 50, 10));
+        subset.add(rt(1, 10, 10));
+        subset.add(rt(20, 30, 10));
+        diff = subset.diff(superset);
+        iter = diff.iterator();
+        assertRT(rt(40, 50, 10), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
     public void sortedAdditionTest()
     {
         sortedAdditionTest(0);
@@ -112,7 +284,7 @@
         l2.add(rt(4, 10, 12L));
         l2.add(rt(0, 8, 25L));
 
-        assertEquals(25L, l2.search(b(8)).markedForDeleteAt);
+        assertEquals(25L, l2.searchDeletionTime(b(8)).markedForDeleteAt);
     }
 
     @Test
@@ -159,9 +331,9 @@
         l.add(rt(1, 4, 2));
         l.add(rt(4, 10, 5));
 
-        assertEquals(2, l.search(b(3)).markedForDeleteAt);
-        assertEquals(5, l.search(b(4)).markedForDeleteAt);
-        assertEquals(5, l.search(b(8)).markedForDeleteAt);
+        assertEquals(2, l.searchDeletionTime(b(3)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(b(4)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(b(8)).markedForDeleteAt);
         assertEquals(3, l.size());
     }
 
@@ -175,31 +347,25 @@
         l.add(rt(14, 15, 3));
         l.add(rt(15, 17, 6));
 
-        assertEquals(null, l.search(b(-1)));
+        assertEquals(null, l.searchDeletionTime(b(-1)));
 
-        assertEquals(5, l.search(b(0)).markedForDeleteAt);
-        assertEquals(5, l.search(b(3)).markedForDeleteAt);
-        assertEquals(5, l.search(b(4)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(b(0)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(b(3)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(b(4)).markedForDeleteAt);
 
-        assertEquals(2, l.search(b(5)).markedForDeleteAt);
+        assertEquals(2, l.searchDeletionTime(b(5)).markedForDeleteAt);
 
-        assertEquals(null, l.search(b(7)));
+        assertEquals(null, l.searchDeletionTime(b(7)));
 
-        assertEquals(3, l.search(b(14)).markedForDeleteAt);
+        assertEquals(3, l.searchDeletionTime(b(14)).markedForDeleteAt);
 
-        assertEquals(6, l.search(b(15)).markedForDeleteAt);
-        assertEquals(null, l.search(b(18)));
+        assertEquals(6, l.searchDeletionTime(b(15)).markedForDeleteAt);
+        assertEquals(null, l.searchDeletionTime(b(18)));
     }
 
     @Test
     public void addAllTest()
     {
-        //addAllTest(false);
-        addAllTest(true);
-    }
-
-    private void addAllTest(boolean doMerge)
-    {
         RangeTombstoneList l1 = new RangeTombstoneList(cmp, 0);
         l1.add(rt(0, 4, 5));
         l1.add(rt(6, 10, 2));
@@ -393,14 +559,14 @@
         return sb.append(" }").toString();
     }
 
-    private static ByteBuffer b(int i)
+    private static Composite b(int i)
     {
-        return ByteBufferUtil.bytes(i);
+        return Util.cellname(i);
     }
 
-    private static int i(ByteBuffer bb)
+    private static int i(Composite c)
     {
-        return ByteBufferUtil.toInt(bb);
+        return ByteBufferUtil.toInt(c.toByteBuffer());
     }
 
     private static RangeTombstone rt(int start, int end, long tstamp)
diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
index 4dc7c0b..e266bf6 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
@@ -20,28 +20,43 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
 import java.util.concurrent.ExecutionException;
 
 import com.google.common.collect.ImmutableMap;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
-import org.apache.cassandra.db.index.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.thrift.IndexType;
-
-import org.junit.Ignore;
+import com.google.common.collect.Iterators;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.IndexType;
+import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.composites.Composites;
+import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.SliceQueryFilter;
+import org.apache.cassandra.db.index.PerColumnSecondaryIndex;
+import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.index.SecondaryIndexSearcher;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 import static org.apache.cassandra.Util.dk;
 import static org.junit.Assert.assertEquals;
@@ -60,28 +75,28 @@
 
         // Inserting data
         String key = "k1";
-        RowMutation rm;
+        Mutation rm;
         ColumnFamily cf;
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 0; i < 40; i += 2)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         cf = rm.addOrGet(CFNAME);
         delete(cf, 10, 22, 1);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 1; i < 40; i += 2)
             add(rm, i, 2);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         cf = rm.addOrGet(CFNAME);
         delete(cf, 19, 27, 3);
         rm.apply();
@@ -90,7 +105,7 @@
         // Queries by name
         int[] live = new int[]{ 4, 9, 11, 17, 28 };
         int[] dead = new int[]{ 12, 19, 21, 24, 27 };
-        SortedSet<ByteBuffer> columns = new TreeSet<>(cfs.getComparator());
+        SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
         for (int i : live)
             columns.add(b(i));
         for (int i : dead)
@@ -98,17 +113,118 @@
         cf = cfs.getColumnFamily(QueryFilter.getNamesFilter(dk(key), CFNAME, columns, System.currentTimeMillis()));
 
         for (int i : live)
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i : dead)
-            assert !isLive(cf, cf.getColumn(b(i))) : "Column " + i + " shouldn't be live";
+            assert !isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " shouldn't be live";
 
         // Queries by slices
         cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(7), b(30), false, Integer.MAX_VALUE, System.currentTimeMillis()));
 
         for (int i : new int[]{ 7, 8, 9, 11, 13, 15, 17, 28, 29, 30 })
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i : new int[]{ 10, 12, 14, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 })
-            assert !isLive(cf, cf.getColumn(b(i))) : "Column " + i + " shouldn't be live";
+            assert !isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " shouldn't be live";
+    }
+
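+    // Writes two range tombstones ([5,10]@1 and [15,20]@2) over one row and checks that each
+    // slice query below only sees the range tombstones that actually intersect the queried slice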
+    @Test
+    public void rangeTombstoneFilteringTest() throws Exception
+    {
+        CompactionManager.instance.disableAutoCompaction();
+        Keyspace keyspace = Keyspace.open(KSNAME);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CFNAME);
+
+        // Inserting data
+        String key = "k111";
+        Mutation rm;
+        ColumnFamily cf;
+
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        for (int i = 0; i < 40; i += 2)
+            add(rm, i, 0);
+        rm.apply();
+
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        cf = rm.addOrGet(CFNAME);
+        delete(cf, 5, 10, 1);
+        rm.apply();
+
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        cf = rm.addOrGet(CFNAME);
+        delete(cf, 15, 20, 2);
+        rm.apply();
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(11), b(14), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        Collection<RangeTombstone> rt = rangeTombstones(cf);
+        assertEquals(0, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(11), b(15), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(20), b(25), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(12), b(25), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(25), b(35), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(0, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(40), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(2, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(7), b(17), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(2, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(20), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(2, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(15), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(2, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(2), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(0, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(5), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(10), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(6), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(17), b(20), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(17), b(18), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(1, rt.size());
+
+        ColumnSlice[] slices = new ColumnSlice[]{new ColumnSlice(b(1), b(10)), new ColumnSlice(b(16), b(20))};
+        IDiskAtomFilter sqf = new SliceQueryFilter(slices, false, Integer.MAX_VALUE);
+        cf = cfs.getColumnFamily(new QueryFilter(dk(key), CFNAME, sqf, System.currentTimeMillis()));
+        rt = rangeTombstones(cf);
+        assertEquals(2, rt.size());
+    }
+
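+    // Collects every range tombstone recorded in the ColumnFamily's deletion info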
+    private Collection<RangeTombstone> rangeTombstones(ColumnFamily cf)
+    {
+        List<RangeTombstone> tombstones = new ArrayList<RangeTombstone>();
+        Iterators.addAll(tombstones, cf.deletionInfo().rangeIterator());
+        return tombstones;
     }
 
     @Test
@@ -118,7 +234,7 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        RowMutation rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         long timestamp = System.currentTimeMillis();
         cf.delete(new DeletionInfo(1000, (int)(timestamp/1000)));
@@ -138,11 +254,11 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        RowMutation rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         add(rm, 5, 999);
         rm.apply();
         key = "rt_times2";
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         int timestamp = (int)(System.currentTimeMillis()/1000);
         cf.delete(new DeletionInfo(1000, timestamp));
@@ -161,7 +277,7 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        RowMutation rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         long timestamp = System.currentTimeMillis();
         cf.delete(new DeletionInfo(b(1), b(2), cfs.getComparator(), 1000, (int)(timestamp/1000)));
@@ -181,11 +297,11 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        RowMutation rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         add(rm, 5, 999);
         rm.apply();
         key = "rt_times2";
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         int timestamp = (int)(System.currentTimeMillis()/1000);
         cf.delete(new DeletionInfo(b(1), b(2), cfs.getComparator(), 1000, timestamp));
@@ -198,7 +314,7 @@
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
     }
 
-    private void assertTimes(SSTableMetadata metadata, long min, long max, int localDeletionTime)
+    private void assertTimes(StatsMetadata metadata, long min, long max, int localDeletionTime)
     {
         assertEquals(min, metadata.minTimestamp);
         assertEquals(max, metadata.maxTimestamp);
@@ -208,20 +324,19 @@
     @Test
     public void test7810() throws ExecutionException, InterruptedException, IOException
     {
-        DatabaseDescriptor.setInMemoryCompactionLimit(0);
         Keyspace ks = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7810";
-        RowMutation rm;
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm;
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 10; i < 20; i++)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         cf.delete(new DeletionInfo(b(10),b(11), cfs.getComparator(), 1, 1));
         rm.apply();
@@ -234,19 +349,18 @@
     @Test
     public void test7808_1() throws ExecutionException, InterruptedException
     {
-        DatabaseDescriptor.setInMemoryCompactionLimit(0);
         Keyspace ks = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7808_1";
-        RowMutation rm;
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm;
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 0; i < 40; i += 2)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         cf.delete(new DeletionInfo(1, 1));
         rm.apply();
@@ -258,25 +372,24 @@
     @Test
     public void test7808_2() throws ExecutionException, InterruptedException, IOException
     {
-        DatabaseDescriptor.setInMemoryCompactionLimit(0);
         Keyspace ks = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7808_2";
-        RowMutation rm;
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        Mutation rm;
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 10; i < 20; i++)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         ColumnFamily cf = rm.addOrGet(CFNAME);
         cf.delete(new DeletionInfo(0,0));
         rm.apply();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         add(rm, 5, 1);
         rm.apply();
 
@@ -295,28 +408,28 @@
 
         // Inserting data
         String key = "k2";
-        RowMutation rm;
+        Mutation rm;
         ColumnFamily cf;
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         for (int i = 0; i < 20; i++)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         cf = rm.addOrGet(CFNAME);
         delete(cf, 5, 15, 1);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         cf = rm.addOrGet(CFNAME);
         delete(cf, 5, 10, 1);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         cf = rm.addOrGet(CFNAME);
         delete(cf, 5, 8, 2);
         rm.apply();
@@ -325,22 +438,22 @@
         cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(key), CFNAME, System.currentTimeMillis()));
 
         for (int i = 0; i < 5; i++)
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i = 16; i < 20; i++)
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i = 5; i <= 15; i++)
-            assert !isLive(cf, cf.getColumn(b(i))) : "Column " + i + " shouldn't be live";
+            assert !isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " shouldn't be live";
 
         // Compact everything and re-test
         CompactionManager.instance.performMaximal(cfs);
         cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(key), CFNAME, System.currentTimeMillis()));
 
         for (int i = 0; i < 5; i++)
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i = 16; i < 20; i++)
-            assert isLive(cf, cf.getColumn(b(i))) : "Column " + i + " should be live";
+            assert isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " should be live";
         for (int i = 5; i <= 15; i++)
-            assert !isLive(cf, cf.getColumn(b(i))) : "Column " + i + " shouldn't be live";
+            assert !isLive(cf, cf.getColumn(b(i))) : "Cell " + i + " shouldn't be live";
     }
 
     @Test
@@ -351,15 +464,15 @@
 
         // Inserting data
         String key = "k3";
-        RowMutation rm;
+        Mutation rm;
         ColumnFamily cf;
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         add(rm, 2, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, ByteBufferUtil.bytes(key));
+        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
         // Deletes everything but without being a row tombstone
         delete(rm.addOrGet(CFNAME), 0, 10, 1);
         add(rm, 1, 2);
@@ -367,7 +480,7 @@
         cfs.forceBlockingFlush();
 
         // Get the last value of the row
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 1, System.currentTimeMillis()));
+        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, Composites.EMPTY, Composites.EMPTY, true, 1, System.currentTimeMillis()));
 
         assert !cf.isEmpty();
         int last = i(cf.getSortedColumns().iterator().next().name());
@@ -375,22 +488,13 @@
     }
 
     @Test
-    public void testPreCompactedRowWithRangeTombstonesUpdatesSecondaryIndex() throws Exception
+    public void testRowWithRangeTombstonesUpdatesSecondaryIndex() throws Exception
     {
-        // nothing special to do here, just run the test
         runCompactionWithRangeTombstoneAndCheckSecondaryIndex();
     }
 
     @Test
-    public void testLazilyCompactedRowWithRangeTombstonesUpdatesSecondaryIndex() throws Exception
-    {
-        // make sure we use LazilyCompactedRow by exceeding in_memory_compaction_limit
-        DatabaseDescriptor.setInMemoryCompactionLimit(0);
-        runCompactionWithRangeTombstoneAndCheckSecondaryIndex();
-    }
-
-    @Test
-    public void testLazilyCompactedRowGeneratesSameSSTablesAsPreCompactedRow() throws Exception
+    public void testRangeTombstoneCompaction() throws Exception
     {
         Keyspace table = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = table.getColumnFamilyStore(CFNAME);
@@ -401,13 +505,13 @@
         cfs.disableAutoCompaction();
         cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
 
-        RowMutation rm = new RowMutation(KSNAME, key);
+        Mutation rm = new Mutation(KSNAME, key);
         for (int i = 0; i < 10; i += 2)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, key);
+        rm = new Mutation(KSNAME, key);
         ColumnFamily cf = rm.addOrGet(CFNAME);
         for (int i = 0; i < 10; i += 2)
             delete(cf, 0, 7, 0);
@@ -432,7 +536,7 @@
             if (cnt == 0)
                 assertTrue(atom instanceof RangeTombstone);
             if (cnt > 0)
-                assertTrue(atom instanceof Column);
+                assertTrue(atom instanceof Cell);
             cnt++;
         }
         assertEquals(2, cnt);
@@ -451,48 +555,35 @@
         cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
         if (cfs.indexManager.getIndexForColumn(indexedColumnName) == null)
         {
-            ColumnDefinition cd = new ColumnDefinition(indexedColumnName,
-                    cfs.getComparator(),
-                    IndexType.CUSTOM,
-                    ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()),
-                    "test_index",
-                    0,
-                    null);
+            ColumnDefinition cd = new ColumnDefinition(cfs.metadata, indexedColumnName, Int32Type.instance, null, ColumnDefinition.Kind.REGULAR);
+            cd.setIndex("test_index", IndexType.CUSTOM, ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()));
             cfs.indexManager.addIndexedColumn(cd);
         }
 
         TestIndex index = ((TestIndex)cfs.indexManager.getIndexForColumn(indexedColumnName));
         index.resetCounts();
 
-        RowMutation rm = new RowMutation(KSNAME, key);
+        Mutation rm = new Mutation(KSNAME, key);
         add(rm, 1, 0);
         rm.apply();
 
         // add a RT which hides the column we just inserted
-        rm = new RowMutation(KSNAME, key);
+        rm = new Mutation(KSNAME, key);
         ColumnFamily cf = rm.addOrGet(CFNAME);
         delete(cf, 0, 1, 1);
         rm.apply();
 
         // now re-insert that column
-        rm = new RowMutation(KSNAME, key);
+        rm = new Mutation(KSNAME, key);
         add(rm, 1, 2);
         rm.apply();
 
         cfs.forceBlockingFlush();
 
-        // We should have 2 updates to the indexed "1" column
-        assertEquals(2, index.inserts.size());
-
-        CompactionManager.instance.performMaximal(cfs);
-
-        // verify that the "1" indexed column removed from the index
-        // only once, by the re-indexing caused by the second insertion.
-        // This second write deletes from the 2i because the original column
-        // was still in the main cf's memtable (shadowed by the RT). One
-        // thing we're checking for here is that there wasn't an additional,
-        // bogus delete issued to the 2i (CASSANDRA-6517)
-        assertEquals(1, index.deletes.size());
+        // We should have 1 insert and 1 update to the indexed "1" column:
+        // CASSANDRA-6640 changed index maintenance to issue a single update instead of an insert followed by a delete
+        assertEquals(1, index.inserts.size());
+        assertEquals(1, index.updates.size());
     }
 
     private void runCompactionWithRangeTombstoneAndCheckSecondaryIndex() throws Exception
@@ -507,26 +598,21 @@
         cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
         if (cfs.indexManager.getIndexForColumn(indexedColumnName) == null)
         {
-            ColumnDefinition cd = new ColumnDefinition(indexedColumnName,
-                                                       cfs.getComparator(),
-                                                       IndexType.CUSTOM,
-                                                       ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()),
-                                                       "test_index",
-                                                       0,
-                                                       null);
+            ColumnDefinition cd = ColumnDefinition.regularDef(cfs.metadata, indexedColumnName, cfs.getComparator().asAbstractType(), 0)
+                                                  .setIndex("test_index", IndexType.CUSTOM, ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()));
             cfs.indexManager.addIndexedColumn(cd);
         }
 
         TestIndex index = ((TestIndex)cfs.indexManager.getIndexForColumn(indexedColumnName));
         index.resetCounts();
 
-        RowMutation rm = new RowMutation(KSNAME, key);
+        Mutation rm = new Mutation(KSNAME, key);
         for (int i = 0; i < 10; i++)
             add(rm, i, 0);
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation(KSNAME, key);
+        rm = new Mutation(KSNAME, key);
         ColumnFamily cf = rm.addOrGet(CFNAME);
         for (int i = 0; i < 10; i += 2)
             delete(cf, 0, 7, 0);
@@ -546,24 +632,24 @@
         assertEquals(index.deletes.get(0), index.inserts.get(0));
     }
 
-    private static boolean isLive(ColumnFamily cf, Column c)
+    private static boolean isLive(ColumnFamily cf, Cell c)
     {
-        return c != null && !c.isMarkedForDelete(System.currentTimeMillis()) && !cf.deletionInfo().isDeleted(c);
+        return c != null && c.isLive() && !cf.deletionInfo().isDeleted(c);
     }
 
-    private static ByteBuffer b(int i)
+    private static CellName b(int i)
     {
-        return ByteBufferUtil.bytes(i);
+        return CellNames.simpleDense(ByteBufferUtil.bytes(i));
     }
 
-    private static int i(ByteBuffer i)
+    private static int i(CellName i)
     {
-        return ByteBufferUtil.toInt(i);
+        return ByteBufferUtil.toInt(i.toByteBuffer());
     }
 
-    private static void add(RowMutation rm, int value, long timestamp)
+    private static void add(Mutation rm, int value, long timestamp)
     {
-        rm.add(CFNAME, b(value), b(value), timestamp);
+        rm.add(CFNAME, b(value), ByteBufferUtil.bytes(value), timestamp);
     }
 
     private static void delete(ColumnFamily cf, int from, int to, long timestamp)
@@ -577,26 +663,31 @@
 
     public static class TestIndex extends PerColumnSecondaryIndex
     {
-        public List<Column> inserts = new ArrayList<>();
-        public List<Column> deletes = new ArrayList<>();
+        public List<Cell> inserts = new ArrayList<>();
+        public List<Cell> deletes = new ArrayList<>();
+        public List<Cell> updates = new ArrayList<>();
 
         public void resetCounts()
         {
             inserts.clear();
             deletes.clear();
+            updates.clear();
         }
 
-        public void delete(ByteBuffer rowKey, Column col)
+        public void delete(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
         {
             deletes.add(col);
         }
 
-        public void insert(ByteBuffer rowKey, Column col)
+        public void insert(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
         {
             inserts.add(col);
         }
 
-        public void update(ByteBuffer rowKey, Column col){}
+        public void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup)
+        {
+            updates.add(col);
+        }
 
         public void init(){}
 
@@ -610,8 +701,6 @@
 
         public void forceBlockingFlush(){}
 
-        public long getLiveSize(){ return 0; }
-
         public ColumnFamilyStore getIndexCfs(){ return null; }
 
         public void removeIndex(ByteBuffer columnName){}
@@ -619,5 +708,12 @@
         public void invalidate(){}
 
         public void truncateBlocking(long truncatedAt) { }
+
+        public boolean indexes(CellName name) { return name.toByteBuffer().equals(ByteBufferUtil.bytes(1)); }
+
+        @Override
+        public long estimateResultRows()
+        {
+            return 0;
+        }
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/ReadMessageTest.java b/test/unit/org/apache/cassandra/db/ReadMessageTest.java
index 94928c0..6d19d3a 100644
--- a/test/unit/org/apache/cassandra/db/ReadMessageTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadMessageTest.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.io.util.DataOutputBuffer;
@@ -43,33 +44,27 @@
     @Test
     public void testMakeReadMessage() throws IOException
     {
-        SortedSet<ByteBuffer> colList = new TreeSet<ByteBuffer>();
-        colList.add(ByteBufferUtil.bytes("col1"));
-        colList.add(ByteBufferUtil.bytes("col2"));
+        CellNameType type = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1").getComparator();
+
+        SortedSet<CellName> colList = new TreeSet<CellName>(type);
+        colList.add(Util.cellname("col1"));
+        colList.add(Util.cellname("col2"));
 
         ReadCommand rm, rm2;
         DecoratedKey dk = Util.dk("row1");
         long ts = System.currentTimeMillis();
 
-        rm = new SliceByNamesReadCommand("Keyspace1", dk.key, "Standard1", ts, new NamesQueryFilter(colList));
+        rm = new SliceByNamesReadCommand("Keyspace1", dk.getKey(), "Standard1", ts, new NamesQueryFilter(colList));
         rm2 = serializeAndDeserializeReadMessage(rm);
         assert rm2.toString().equals(rm.toString());
 
-        rm = new SliceFromReadCommand("Keyspace1", dk.key, "Standard1", ts, new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2));
+        rm = new SliceFromReadCommand("Keyspace1", dk.getKey(), "Standard1", ts, new SliceQueryFilter(Composites.EMPTY, Composites.EMPTY, true, 2));
         rm2 = serializeAndDeserializeReadMessage(rm);
         assert rm2.toString().equals(rm.toString());
 
-        rm = new SliceFromReadCommand("Keyspace1", dk.key, "Standard1", ts, new SliceQueryFilter(ByteBufferUtil.bytes("a"), ByteBufferUtil.bytes("z"), true, 5));
-        rm2 = serializeAndDeserializeReadMessage(rm);
-        assertEquals(rm2.toString(), rm.toString());
-
-        rm = new SliceFromReadCommand("Keyspace1", dk.key, "Standard1", ts, new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 2));
+        rm = new SliceFromReadCommand("Keyspace1", dk.getKey(), "Standard1", ts, new SliceQueryFilter(Util.cellname("a"), Util.cellname("z"), true, 5));
         rm2 = serializeAndDeserializeReadMessage(rm);
         assert rm2.toString().equals(rm.toString());
-
-        rm = new SliceFromReadCommand("Keyspace1", dk.key, "Standard1", ts, new SliceQueryFilter(ByteBufferUtil.bytes("a"), ByteBufferUtil.bytes("z"), true, 5));
-        rm2 = serializeAndDeserializeReadMessage(rm);
-        assertEquals(rm2.toString(), rm.toString());
     }
 
     private ReadCommand serializeAndDeserializeReadMessage(ReadCommand rm) throws IOException
@@ -84,34 +79,33 @@
     }
 
     @Test
-    public void testGetColumn() throws IOException, ColumnFamilyNotDefinedException
+    public void testGetColumn()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        CellNameType type = keyspace.getColumnFamilyStore("Standard1").getComparator();
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("abcd"), 0);
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("abcd"), 0);
         rm.apply();
 
-        ReadCommand command = new SliceByNamesReadCommand("Keyspace1", dk.key, "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(ByteBufferUtil.bytes("Column1"), cfs.getComparator())));
+        ReadCommand command = new SliceByNamesReadCommand("Keyspace1", dk.getKey(), "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(Util.cellname("Column1"), type)));
         Row row = command.getRow(keyspace);
-        Column col = row.cf.getColumn(ByteBufferUtil.bytes("Column1"));
+        Cell col = row.cf.getColumn(Util.cellname("Column1"));
         assertEquals(col.value(), ByteBuffer.wrap("abcd".getBytes()));
     }
 
     @Test
     public void testNoCommitLog() throws Exception
     {
-
-        RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("row"));
-        rm.add("Standard1", ByteBufferUtil.bytes("commit1"), ByteBufferUtil.bytes("abcd"), 0);
+        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("row"));
+        rm.add("Standard1", Util.cellname("commit1"), ByteBufferUtil.bytes("abcd"), 0);
         rm.apply();
 
-        rm = new RowMutation("NoCommitlogSpace", ByteBufferUtil.bytes("row"));
-        rm.add("Standard1", ByteBufferUtil.bytes("commit2"), ByteBufferUtil.bytes("abcd"), 0);
+        rm = new Mutation("NoCommitlogSpace", ByteBufferUtil.bytes("row"));
+        rm.add("Standard1", Util.cellname("commit2"), ByteBufferUtil.bytes("abcd"), 0);
         rm.apply();
 
         boolean commitLogMessageFound = false;
diff --git a/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java b/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java
index ac2d642..ede3e9b 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java
@@ -21,7 +21,6 @@
  */
 
 
-import java.io.IOException;
 import org.junit.Test;
 
 import org.slf4j.Logger;
@@ -69,11 +68,11 @@
         assert replayed == 1 : "Expecting only 1 replayed mutation, got " + replayed;
     }
 
-    private void insertRow(String cfname, String key) throws IOException
+    private void insertRow(String cfname, String key)
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", cfname);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", cfname);
         cf.addColumn(column("col1", "val1", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
+        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
         rm.apply();
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java b/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java
index 8258970..c9bc86a 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java
@@ -23,7 +23,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.concurrent.ExecutionException;
 
 import org.junit.Test;
 
@@ -39,23 +38,23 @@
 public class RecoveryManager3Test extends SchemaLoader
 {
     @Test
-    public void testMissingHeader() throws IOException, ExecutionException, InterruptedException
+    public void testMissingHeader() throws IOException
     {
         Keyspace keyspace1 = Keyspace.open("Keyspace1");
         Keyspace keyspace2 = Keyspace.open("Keyspace2");
 
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("keymulti");
         ColumnFamily cf;
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
-        rm = new RowMutation("Keyspace1", dk.key, cf);
+        rm = new Mutation("Keyspace1", dk.getKey(), cf);
         rm.apply();
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace2", "Standard3");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace2", "Standard3");
         cf.addColumn(column("col2", "val2", 1L));
-        rm = new RowMutation("Keyspace2", dk.key, cf);
+        rm = new Mutation("Keyspace2", dk.getKey(), cf);
         rm.apply();
 
         keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java
index f3e8fa7..2d820f3 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java
@@ -20,7 +20,6 @@
 
 import java.io.IOException;
 import java.util.Date;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
@@ -32,10 +31,10 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogArchiver;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.apache.cassandra.Util.column;
 import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
+import static org.apache.cassandra.Util.cellname;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class RecoveryManagerTest extends SchemaLoader
@@ -46,23 +45,23 @@
     }
 
     @Test
-    public void testOne() throws IOException, ExecutionException, InterruptedException
+    public void testOne() throws IOException
     {
         Keyspace keyspace1 = Keyspace.open("Keyspace1");
         Keyspace keyspace2 = Keyspace.open("Keyspace2");
 
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("keymulti");
         ColumnFamily cf;
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf.addColumn(column("col1", "val1", 1L));
-        rm = new RowMutation("Keyspace1", dk.key, cf);
+        rm = new Mutation("Keyspace1", dk.getKey(), cf);
         rm.apply();
 
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace2", "Standard3");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace2", "Standard3");
         cf.addColumn(column("col2", "val2", 1L));
-        rm = new RowMutation("Keyspace2", dk.key, cf);
+        rm = new Mutation("Keyspace2", dk.getKey(), cf);
         rm.apply();
 
         keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
@@ -76,19 +75,19 @@
     }
 
     @Test
-    public void testRecoverCounter() throws IOException, ExecutionException, InterruptedException
+    public void testRecoverCounter() throws IOException
     {
         Keyspace keyspace1 = Keyspace.open("Keyspace1");
 
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key");
         ColumnFamily cf;
 
         for (int i = 0; i < 10; ++i)
         {
-            cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Counter1");
-            cf.addColumn(new CounterColumn(ByteBufferUtil.bytes("col"), 1L, 1L));
-            rm = new RowMutation("Keyspace1", dk.key, cf);
+            cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Counter1");
+            cf.addColumn(BufferCounterCell.createLocal(cellname("col"), 1L, 1L, Long.MIN_VALUE));
+            rm = new Mutation("Keyspace1", dk.getKey(), cf);
             rm.apply();
         }
 
@@ -100,10 +99,10 @@
         cf = Util.getColumnFamily(keyspace1, dk, "Counter1");
 
         assert cf.getColumnCount() == 1;
-        Column c = cf.getColumn(ByteBufferUtil.bytes("col"));
+        Cell c = cf.getColumn(cellname("col"));
 
         assert c != null;
-        assert ((CounterColumn)c).total() == 10L;
+        assert ((CounterCell)c).total() == 10L;
     }
 
     @Test
@@ -117,9 +116,9 @@
         for (int i = 0; i < 10; ++i)
         {
             long ts = TimeUnit.MILLISECONDS.toMicros(timeMS + (i * 1000));
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
             cf.addColumn(column("name-" + i, "value", ts));
-            RowMutation rm = new RowMutation("Keyspace1", dk.key, cf);
+            Mutation rm = new Mutation("Keyspace1", dk.getKey(), cf);
             rm.apply();
         }
         keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
@@ -151,7 +150,7 @@
 
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
             cf.addColumn(column("name-" + i, "value", ts));
-            RowMutation rm = new RowMutation("Keyspace1", dk.key, cf);
+            Mutation rm = new Mutation("Keyspace1", dk.getKey(), cf);
             rm.apply();
         }
 
diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java
index 1c98079..1f7d388 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java
@@ -23,15 +23,12 @@
 import static org.junit.Assert.assertNull;
 
 import java.io.IOException;
-import java.util.concurrent.ExecutionException;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.junit.Test;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Test for the truncate operation.
@@ -39,18 +36,18 @@
 public class RecoveryManagerTruncateTest extends SchemaLoader
 {
 	@Test
-	public void testTruncate() throws IOException, ExecutionException, InterruptedException
+	public void testTruncate() throws IOException
 	{
 		Keyspace keyspace = Keyspace.open("Keyspace1");
 		ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
 
-		RowMutation rm;
+		Mutation rm;
 		ColumnFamily cf;
 
 		// add a single cell
-        cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
 		cf.addColumn(column("col1", "val1", 1L));
-        rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes("keymulti"), cf);
+        rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("keymulti"), cf);
 		rm.apply();
 
 		// Make sure data was written
@@ -65,7 +62,7 @@
 		assertNull(getFromTable(keyspace, "Standard1", "keymulti", "col1"));
 	}
 
-	private Column getFromTable(Keyspace keyspace, String cfName, String keyName, String columnName)
+	private Cell getFromTable(Keyspace keyspace, String cfName, String keyName, String columnName)
 	{
 		ColumnFamily cf;
 		ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(cfName);
@@ -73,14 +70,11 @@
 		{
 			return null;
 		}
-		cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(Util.dk(keyName),
-                                                                cfName,
-                                                                FBUtilities.singleton(ByteBufferUtil.bytes(columnName), cfStore.getComparator()),
-                                                                System.currentTimeMillis()));
+		cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, Util.dk(keyName), columnName));
 		if (cf == null)
 		{
 			return null;
 		}
-		return cf.getColumn(ByteBufferUtil.bytes(columnName));
+		return cf.getColumn(Util.cellname(columnName));
 	}
 }
diff --git a/test/unit/org/apache/cassandra/db/RemoveCellTest.java b/test/unit/org/apache/cassandra/db/RemoveCellTest.java
new file mode 100644
index 0000000..77ff02d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/RemoveCellTest.java
@@ -0,0 +1,84 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class RemoveCellTest extends SchemaLoader
+{
+    @Test
+    public void testRemoveColumn()
+    {
+        Keyspace keyspace = Keyspace.open("Keyspace1");
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+        Mutation rm;
+        DecoratedKey dk = Util.dk("key1");
+
+        // add data
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm.apply();
+        store.forceBlockingFlush();
+
+        // remove
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.delete("Standard1", Util.cellname("Column1"), 1);
+        rm.apply();
+
+        ColumnFamily retrieved = store.getColumnFamily(Util.namesQueryFilter(store, dk, "Column1"));
+        assertFalse(retrieved.getColumn(Util.cellname("Column1")).isLive());
+        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
+        assertNull(Util.cloneAndRemoveDeleted(store.getColumnFamily(QueryFilter.getIdentityFilter(dk,
+                                                                                                  "Standard1",
+                                                                                                  System.currentTimeMillis())),
+                                              Integer.MAX_VALUE));
+    }
+
+    private static BufferDeletedCell dc(String name, int ldt, long timestamp)
+    {
+        return new BufferDeletedCell(Util.cellname(name), ldt, timestamp);
+    }
+
+    @Test
+    public void deletedColumnShouldAlwaysBeMarkedForDelete()
+    {
+        // Check for bug in #4307
+        long timestamp = System.currentTimeMillis();
+        int localDeletionTime = (int) (timestamp / 1000);
+        Cell c = dc("dc1", localDeletionTime, timestamp);
+        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
+
+        // Simulate a node that is 30 seconds behind
+        c = dc("dc2", localDeletionTime + 30, timestamp + 30000);
+        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
+
+        // Simulate a node that is 30 seconds ahead
+        c = dc("dc3", localDeletionTime - 30, timestamp - 30000);
+        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java
index e3db863..09eed71 100644
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java
+++ b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java
@@ -35,22 +35,22 @@
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
 
         // remove
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.delete("Standard1", 1);
         rm.apply();
 
         ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
         assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(ByteBufferUtil.bytes("Column1")));
+        assertNull(retrieved.getColumn(Util.cellname("Column1")));
         assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java
index 7c71dc9..f898f16 100644
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java
+++ b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java
@@ -35,24 +35,24 @@
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column2"), ByteBufferUtil.bytes("asdf"), 0);
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm.add("Standard1", Util.cellname("Column2"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
         store.forceBlockingFlush();
 
         // remove
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.delete("Standard1", 1);
         rm.apply();
 
         ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
         assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(ByteBufferUtil.bytes("Column1")));
+        assertNull(retrieved.getColumn(Util.cellname("Column1")));
         assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java
index f6fae5a..1e910ad 100644
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java
+++ b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java
@@ -35,22 +35,22 @@
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
+        rm = new Mutation("Keyspace1", dk.getKey());
+        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
         rm.apply();
         // remove
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.delete("Standard1", 1);
         rm.apply();
         store.forceBlockingFlush();
 
         ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
         assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(ByteBufferUtil.bytes("Column1")));
+        assertNull(retrieved.getColumn(Util.cellname("Column1")));
         assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnTest.java b/test/unit/org/apache/cassandra/db/RemoveColumnTest.java
deleted file mode 100644
index eddf5e0..0000000
--- a/test/unit/org/apache/cassandra/db/RemoveColumnTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import org.junit.Test;
-
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.cassandra.db.filter.QueryFilter;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-
-public class RemoveColumnTest extends SchemaLoader
-{
-    @Test
-    public void testRemoveColumn()
-    {
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        RowMutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.apply();
-        store.forceBlockingFlush();
-
-        // remove
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.delete("Standard1", ByteBufferUtil.bytes("Column1"), 1);
-        rm.apply();
-
-        ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getNamesFilter(dk,
-                                                                                  "Standard1",
-                                                                                  FBUtilities.singleton(ByteBufferUtil.bytes("Column1"), store.getComparator()),
-                                                                                  System.currentTimeMillis()));
-        assert retrieved.getColumn(ByteBufferUtil.bytes("Column1")).isMarkedForDelete(System.currentTimeMillis());
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-        assertNull(Util.cloneAndRemoveDeleted(store.getColumnFamily(QueryFilter.getIdentityFilter(dk,
-                                                                                                  "Standard1",
-                                                                                                  System.currentTimeMillis())),
-                                              Integer.MAX_VALUE));
-    }
-
-    @Test
-    public void deletedColumnShouldAlwaysBeMarkedForDelete()
-    {
-        // Check for bug in #4307
-        long timestamp = System.currentTimeMillis();
-        int localDeletionTime = (int) (timestamp / 1000);
-        Column c = DeletedColumn.create(localDeletionTime, timestamp, "dc1");
-        assertTrue("DeletedColumn was not marked for delete", c.isMarkedForDelete(timestamp));
-
-        // Simulate a node that is 30 seconds behind
-        c = DeletedColumn.create(localDeletionTime + 30, timestamp + 30000, "dc2");
-        assertTrue("DeletedColumn was not marked for delete", c.isMarkedForDelete(timestamp));
-
-        // Simulate a node that is 30 ahead behind
-        c = DeletedColumn.create(localDeletionTime - 30, timestamp - 30000, "dc3");
-        assertTrue("DeletedColumn was not marked for delete", c.isMarkedForDelete(timestamp));
-    }
-
-}
diff --git a/test/unit/org/apache/cassandra/db/RemoveSubColumnTest.java b/test/unit/org/apache/cassandra/db/RemoveSubCellTest.java
similarity index 80%
rename from test/unit/org/apache/cassandra/db/RemoveSubColumnTest.java
rename to test/unit/org/apache/cassandra/db/RemoveSubCellTest.java
index 6d2be75..cec1bce 100644
--- a/test/unit/org/apache/cassandra/db/RemoveSubColumnTest.java
+++ b/test/unit/org/apache/cassandra/db/RemoveSubCellTest.java
@@ -23,9 +23,11 @@
 
 import org.junit.Test;
 
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNull;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
+
 import static org.apache.cassandra.Util.getBytes;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.SchemaLoader;
@@ -34,30 +36,30 @@
 import com.google.common.util.concurrent.Uninterruptibles;
 
 
-public class RemoveSubColumnTest extends SchemaLoader
+public class RemoveSubCellTest extends SchemaLoader
 {
     @Test
     public void testRemoveSubColumn()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key1");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         Util.addMutation(rm, "Super1", "SC1", 1, "asdf", 0);
         rm.apply();
         store.forceBlockingFlush();
 
-        ByteBuffer cname = CompositeType.build(ByteBufferUtil.bytes("SC1"), getBytes(1L));
+        CellName cname = CellNames.compositeDense(ByteBufferUtil.bytes("SC1"), getBytes(1L));
         // remove
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.delete("Super1", cname, 1);
         rm.apply();
 
         ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Super1", System.currentTimeMillis()));
-        assert retrieved.getColumn(cname).isMarkedForDelete(System.currentTimeMillis());
+        assertFalse(retrieved.getColumn(cname).isLive());
         assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
     }
 
@@ -66,19 +68,19 @@
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey dk = Util.dk("key2");
 
         // add data
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         Util.addMutation(rm, "Super1", "SC1", 1, "asdf", 0);
         rm.apply();
         store.forceBlockingFlush();
 
         // remove the SC
         ByteBuffer scName = ByteBufferUtil.bytes("SC1");
-        ByteBuffer cname = CompositeType.build(scName, getBytes(1L));
-        rm = new RowMutation("Keyspace1", dk.key);
+        CellName cname = CellNames.compositeDense(scName, getBytes(1L));
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.deleteRange("Super1", SuperColumns.startOf(scName), SuperColumns.endOf(scName), 1);
         rm.apply();
 
@@ -88,12 +90,12 @@
         Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
 
         // remove the column itself
-        rm = new RowMutation("Keyspace1", dk.key);
+        rm = new Mutation("Keyspace1", dk.getKey());
         rm.delete("Super1", cname, 2);
         rm.apply();
 
         ColumnFamily retrieved = store.getColumnFamily(filter);
-        assert retrieved.getColumn(cname).isMarkedForDelete(System.currentTimeMillis());
+        assertFalse(retrieved.getColumn(cname).isLive());
         assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java b/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java
new file mode 100644
index 0000000..3dc5ce3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.service.CacheService;
+import static org.junit.Assert.assertEquals;
+
+public class RowCacheCQLTest extends CQLTester
+{
+    @Test
+    public void test7636() throws Throwable
+    {
+        CacheService.instance.setRowCacheCapacityInMB(1);
+        createTable("CREATE TABLE %s (p1 bigint, c1 int, PRIMARY KEY (p1, c1)) WITH caching = '{\"keys\":\"NONE\", \"rows_per_partition\":\"ALL\"}'");
+        execute("INSERT INTO %s (p1, c1) VALUES (123, 10)");
+        assertEmpty(execute("SELECT * FROM %s WHERE p1=123 and c1 > 1000"));
+        UntypedResultSet res = execute("SELECT * FROM %s WHERE p1=123 and c1 > 0");
+        assertEquals(1, res.size());
+        assertEmpty(execute("SELECT * FROM %s WHERE p1=123 and c1 > 1000"));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/RowCacheTest.java b/test/unit/org/apache/cassandra/db/RowCacheTest.java
index 8934a27..7b6ff99 100644
--- a/test/unit/org/apache/cassandra/db/RowCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/RowCacheTest.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.db;
 
 import java.net.InetAddress;
+import java.nio.ByteBuffer;
 import java.util.Collection;
 
 import org.junit.AfterClass;
@@ -26,7 +27,10 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.cache.RowCacheKey;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.dht.BytesToken;
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.service.CacheService;
@@ -67,29 +71,19 @@
         {
             DecoratedKey key = Util.dk("key" + i);
 
-            cachedStore.getColumnFamily(key,
-                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                        false,
-                                        1,
-                                        System.currentTimeMillis());
+            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
             assert CacheService.instance.rowCache.size() == i + 1;
             assert cachedStore.containsCachedRow(key); // current key should be stored in the cache
 
-            // checking if column is read correctly after cache
-            ColumnFamily cf = cachedStore.getColumnFamily(key,
-                                                          ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                          ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                          false,
-                                                          1,
-                                                          System.currentTimeMillis());
-            Collection<Column> columns = cf.getSortedColumns();
+            // checking if cell is read correctly after cache
+            ColumnFamily cf = cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
+            Collection<Cell> cells = cf.getSortedColumns();
 
-            Column column = columns.iterator().next();
+            Cell cell = cells.iterator().next();
 
-            assert columns.size() == 1;
-            assert column.name().equals(ByteBufferUtil.bytes("col" + i));
-            assert column.value().equals(ByteBufferUtil.bytes("val" + i));
+            assert cells.size() == 1;
+            assert cell.name().toByteBuffer().equals(ByteBufferUtil.bytes("col" + i));
+            assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
         }
 
         // insert 10 more keys
@@ -99,28 +93,18 @@
         {
             DecoratedKey key = Util.dk("key" + i);
 
-            cachedStore.getColumnFamily(key,
-                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                        false,
-                                        1,
-                                        System.currentTimeMillis());
+            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
             assert cachedStore.containsCachedRow(key); // cache should be populated with the latest rows read (old ones should be popped)
 
-            // checking if column is read correctly after cache
-            ColumnFamily cf = cachedStore.getColumnFamily(key,
-                                                          ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                          ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                          false,
-                                                          1,
-                                                          System.currentTimeMillis());
-            Collection<Column> columns = cf.getSortedColumns();
+            // checking if cell is read correctly after cache
+            ColumnFamily cf = cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
+            Collection<Cell> cells = cf.getSortedColumns();
 
-            Column column = columns.iterator().next();
+            Cell cell = cells.iterator().next();
 
-            assert columns.size() == 1;
-            assert column.name().equals(ByteBufferUtil.bytes("col" + i));
-            assert column.value().equals(ByteBufferUtil.bytes("val" + i));
+            assert cells.size() == 1;
+            assert cell.name().toByteBuffer().equals(ByteBufferUtil.bytes("col" + i));
+            assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
         }
 
         // clear 100 rows from the cache
@@ -172,6 +156,79 @@
         rowCacheLoad(100, 50, 0);
         CacheService.instance.setRowCacheCapacityInMB(0);
     }
+    @Test
+    public void testRowCacheRange()
+    {
+        CompactionManager.instance.disableAutoCompaction();
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        String cf = "CachedIntCF";
+        ColumnFamilyStore cachedStore  = keyspace.getColumnFamilyStore(cf);
+        long startRowCacheHits = cachedStore.metric.rowCacheHit.count();
+        long startRowCacheOutOfRange = cachedStore.metric.rowCacheHitOutOfRange.count();
+        // empty the row cache
+        CacheService.instance.invalidateRowCache();
+
+        // set global row cache size to 1 MB
+        CacheService.instance.setRowCacheCapacityInMB(1);
+
+        ByteBuffer key = ByteBufferUtil.bytes("rowcachekey");
+        DecoratedKey dk = cachedStore.partitioner.decorateKey(key);
+        RowCacheKey rck = new RowCacheKey(cachedStore.metadata.cfId, dk);
+        Mutation mutation = new Mutation(KEYSPACE, key);
+        for (int i = 0; i < 200; i++)
+            mutation.add(cf, Util.cellname(i), ByteBufferUtil.bytes("val" + i), System.currentTimeMillis());
+        mutation.applyUnsafe();
+
+        // populate the row cache; we should not get a row cache hit
+        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
+                                                                Composites.EMPTY,
+                                                                Composites.EMPTY,
+                                                                false, 10, System.currentTimeMillis()));
+        assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.count());
+
+        // do another query with limit 20, which is less than the 100 cells we cache; we should get a hit and it should be in range
+        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
+                                                                Composites.EMPTY,
+                                                                Composites.EMPTY,
+                                                                false, 20, System.currentTimeMillis()));
+        assertEquals(++startRowCacheHits, cachedStore.metric.rowCacheHit.count());
+        assertEquals(startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.count());
+
+        // get a slice from 95 to 105; only 95->99 are in the cache, so we should not get a hit and the row cache read is out of range
+        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
+                                                               CellNames.simpleDense(ByteBufferUtil.bytes(95)),
+                                                               CellNames.simpleDense(ByteBufferUtil.bytes(105)),
+                                                               false, 10, System.currentTimeMillis()));
+        assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.count());
+        assertEquals(++startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.count());
+
+        // get a slice with limit > 100; we should get a hit that is out of range.
+        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
+                                                               Composites.EMPTY,
+                                                               Composites.EMPTY,
+                                                               false, 101, System.currentTimeMillis()));
+        assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.count());
+        assertEquals(++startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.count());
+
+
+        CacheService.instance.invalidateRowCache();
+
+        // try to populate the row cache with a limit greater than the number of rows to cache; we should still populate the row cache
+        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
+                                                                Composites.EMPTY,
+                                                                Composites.EMPTY,
+                                                                false, 105, System.currentTimeMillis()));
+        assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.count());
+        // validate the contents of the cache
+        ColumnFamily cachedCf = (ColumnFamily)CacheService.instance.rowCache.get(rck);
+        assertEquals(cachedCf.getColumnCount(), 100);
+        int i = 0;
+        for(Cell c : cachedCf)
+        {
+            assertEquals(c.name(), Util.cellname(i++));
+        }
+    }
 
     public void rowCacheLoad(int totalKeys, int keysToSave, int offset) throws Exception
     {
diff --git a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
index 9728f1f..237573e 100644
--- a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
@@ -22,6 +22,9 @@
 import junit.framework.Assert;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -35,20 +38,22 @@
         final RowIndexEntry simple = new RowIndexEntry(123);
 
         DataOutputBuffer buffer = new DataOutputBuffer();
-        RowIndexEntry.serializer.serialize(simple, buffer);
+        RowIndexEntry.Serializer serializer = new RowIndexEntry.Serializer(new SimpleDenseCellNameType(UTF8Type.instance));
 
-        Assert.assertEquals(buffer.size(), simple.serializedSize());
+        serializer.serialize(simple, buffer);
+
+        Assert.assertEquals(buffer.getLength(), serializer.serializedSize(simple));
 
         buffer = new DataOutputBuffer();
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         ColumnIndex columnIndex = new ColumnIndex.Builder(cf, ByteBufferUtil.bytes("a"), new DataOutputBuffer())
         {{
             int idx = 0, size = 0;
-            Column column;
+            Cell column;
             do
             {
-                column = new Column(ByteBufferUtil.bytes("c" + idx++), ByteBufferUtil.bytes("v"), FBUtilities.timestampMicros());
-                size += column.serializedSize(TypeSizes.NATIVE);
+                column = new BufferCell(CellNames.simpleDense(ByteBufferUtil.bytes("c" + idx++)), ByteBufferUtil.bytes("v"), FBUtilities.timestampMicros());
+                size += column.serializedSize(new SimpleDenseCellNameType(UTF8Type.instance), TypeSizes.NATIVE);
 
                 add(column);
             }
@@ -58,7 +63,7 @@
 
         RowIndexEntry withIndex = RowIndexEntry.create(0xdeadbeef, DeletionTime.LIVE, columnIndex);
 
-        RowIndexEntry.serializer.serialize(withIndex, buffer);
-        Assert.assertEquals(buffer.size(), withIndex.serializedSize());
+        serializer.serialize(withIndex, buffer);
+        Assert.assertEquals(buffer.getLength(), serializer.serializedSize(withIndex));
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/RowIterationTest.java b/test/unit/org/apache/cassandra/db/RowIterationTest.java
index 6353a8c..245a6a7 100644
--- a/test/unit/org/apache/cassandra/db/RowIterationTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIterationTest.java
@@ -18,10 +18,8 @@
 */
 package org.apache.cassandra.db;
 
-import java.io.IOException;
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
-import java.util.concurrent.ExecutionException;
 import java.util.Set;
 import java.util.HashSet;
 
@@ -30,7 +28,7 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.utils.FBUtilities;
 import static org.junit.Assert.assertEquals;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -42,7 +40,7 @@
     public static final InetAddress LOCAL = FBUtilities.getBroadcastAddress();
 
     @Test
-    public void testRowIteration() throws IOException, ExecutionException, InterruptedException
+    public void testRowIteration()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super3");
@@ -51,8 +49,8 @@
         Set<DecoratedKey> inserted = new HashSet<DecoratedKey>();
         for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
             DecoratedKey key = Util.dk(String.valueOf(i));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add("Super3", CompositeType.build(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes(String.valueOf(i))), ByteBuffer.wrap(new byte[ROWS_PER_SSTABLE * 10 - i * 2]), i);
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add("Super3", CellNames.compositeDense(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes(String.valueOf(i))), ByteBuffer.wrap(new byte[ROWS_PER_SSTABLE * 10 - i * 2]), i);
             rm.apply();
             inserted.add(key);
         }
@@ -61,7 +59,7 @@
     }
 
     @Test
-    public void testRowIterationDeletionTime() throws IOException, ExecutionException, InterruptedException
+    public void testRowIterationDeletionTime()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         String CF_NAME = "Standard3";
@@ -69,17 +67,16 @@
         DecoratedKey key = Util.dk("key");
 
         // Delete row in first sstable
-        RowMutation rm = new RowMutation(KEYSPACE1, key.key);
+        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
         rm.delete(CF_NAME, 0);
-        rm.add(CF_NAME, ByteBufferUtil.bytes("c"), ByteBufferUtil.bytes("values"), 0L);
-        DeletionInfo delInfo1 = rm.getColumnFamilies().iterator().next().deletionInfo();
+        rm.add(CF_NAME, Util.cellname("c"), ByteBufferUtil.bytes("values"), 0L);
         rm.apply();
         store.forceBlockingFlush();
 
         // Delete row in second sstable with higher timestamp
-        rm = new RowMutation(KEYSPACE1, key.key);
+        rm = new Mutation(KEYSPACE1, key.getKey());
         rm.delete(CF_NAME, 1);
-        rm.add(CF_NAME, ByteBufferUtil.bytes("c"), ByteBufferUtil.bytes("values"), 1L);
+        rm.add(CF_NAME, Util.cellname("c"), ByteBufferUtil.bytes("values"), 1L);
         DeletionInfo delInfo2 = rm.getColumnFamilies().iterator().next().deletionInfo();
         assert delInfo2.getTopLevelDeletion().markedForDeleteAt == 1L;
         rm.apply();
@@ -90,7 +87,7 @@
     }
 
     @Test
-    public void testRowIterationDeletion() throws IOException, ExecutionException, InterruptedException
+    public void testRowIterationDeletion()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         String CF_NAME = "Standard3";
@@ -98,7 +95,7 @@
         DecoratedKey key = Util.dk("key");
 
         // Delete a row in first sstable
-        RowMutation rm = new RowMutation(KEYSPACE1, key.key);
+        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
         rm.delete(CF_NAME, 0);
         rm.apply();
         store.forceBlockingFlush();
diff --git a/test/unit/org/apache/cassandra/db/RowTest.java b/test/unit/org/apache/cassandra/db/RowTest.java
index 2571fa5..22e112e 100644
--- a/test/unit/org/apache/cassandra/db/RowTest.java
+++ b/test/unit/org/apache/cassandra/db/RowTest.java
@@ -21,58 +21,93 @@
 import java.util.Arrays;
 import java.util.concurrent.TimeUnit;
 
-import org.apache.cassandra.SchemaLoader;
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.Test;
 
-import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.Util.column;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import com.google.common.util.concurrent.Uninterruptibles;
-
+import static org.apache.cassandra.Util.column;
+import static org.apache.cassandra.Util.tombstone;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 public class RowTest extends SchemaLoader
 {
     @Test
     public void testDiffColumnFamily()
     {
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("one", "onev", 0));
 
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         DeletionInfo delInfo = new DeletionInfo(0, 0);
         cf2.delete(delInfo);
 
         ColumnFamily cfDiff = cf1.diff(cf2);
-        assertEquals(cfDiff.getColumnCount(), 0);
+        assertFalse(cfDiff.hasColumns());
         assertEquals(cfDiff.deletionInfo(), delInfo);
+
+        RangeTombstone tombstone1 = tombstone("1", "11", (long) 123, 123);
+        RangeTombstone tombstone1_2 = tombstone("111", "112", (long) 1230, 123);
+        RangeTombstone tombstone2_1 = tombstone("2", "22", (long) 123, 123);
+        RangeTombstone tombstone2_2 = tombstone("2", "24", (long) 123, 123);
+        RangeTombstone tombstone3_1 = tombstone("3", "31", (long) 123, 123);
+        RangeTombstone tombstone3_2 = tombstone("3", "31", (long) 1230, 123);
+        RangeTombstone tombstone4_1 = tombstone("4", "41", (long) 123, 123);
+        RangeTombstone tombstone4_2 = tombstone("4", "41", (long) 123, 1230);
+        RangeTombstone tombstone5_2 = tombstone("5", "51", (long) 123, 1230);
+        cf1.delete(tombstone1);
+        cf1.delete(tombstone2_1);
+        cf1.delete(tombstone3_1);
+        cf1.delete(tombstone4_1);
+
+        cf2.delete(tombstone1);
+        cf2.delete(tombstone1_2);
+        cf2.delete(tombstone2_2);
+        cf2.delete(tombstone3_2);
+        cf2.delete(tombstone4_2);
+        cf2.delete(tombstone5_2);
+
+        cfDiff = cf1.diff(cf2);
+        assertEquals(0, cfDiff.getColumnCount());
+
+        // only tombstones that differ in the superset or have a more recent timestamp should appear in the diff
+        delInfo.add(tombstone1_2, cf1.getComparator());
+        delInfo.add(tombstone2_2, cf1.getComparator());
+        delInfo.add(tombstone3_2, cf1.getComparator());
+        delInfo.add(tombstone5_2, cf1.getComparator());
+
+        assertEquals(delInfo, cfDiff.deletionInfo());
     }
 
     @Test
     public void testResolve()
     {
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("one", "A", 0));
 
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.addColumn(column("one", "B", 1));
         cf2.addColumn(column("two", "C", 1));
 
-        cf1.resolve(cf2);
-        assert Arrays.equals(cf1.getColumn(ByteBufferUtil.bytes("one")).value().array(), "B".getBytes());
-        assert Arrays.equals(cf1.getColumn(ByteBufferUtil.bytes("two")).value().array(), "C".getBytes());
+        cf1.addAll(cf2);
+        assert Arrays.equals(cf1.getColumn(CellNames.simpleDense(ByteBufferUtil.bytes("one"))).value().array(), "B".getBytes());
+        assert Arrays.equals(cf1.getColumn(CellNames.simpleDense(ByteBufferUtil.bytes("two"))).value().array(), "C".getBytes());
     }
 
     @Test
     public void testExpiringColumnExpiration()
     {
-        Column c = new ExpiringColumn(ByteBufferUtil.bytes("one"), ByteBufferUtil.bytes("A"), 0, 1);
-        assert !c.isMarkedForDelete(System.currentTimeMillis());
+        Cell c = new BufferExpiringCell(CellNames.simpleDense(ByteBufferUtil.bytes("one")), ByteBufferUtil.bytes("A"), 0, 1);
+        assertTrue(c.isLive());
 
         // Because we keep the local deletion time with a precision of a
         // second, we could have to wait 2 seconds in worst case scenario.
         Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
 
-        assert c.isMarkedForDelete(System.currentTimeMillis()) && c.getMarkedForDeleteAt() == 0;
+        assert !c.isLive() && c.timestamp() == 0;
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 23e9381..571fe0e 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -20,8 +20,10 @@
  *
  */
 
-import java.io.*;
-import java.util.Collections;
+import java.io.File;
+import java.io.IOError;
+import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -29,7 +31,6 @@
 import java.util.concurrent.ExecutionException;
 
 import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.commons.lang3.StringUtils;
@@ -37,19 +38,21 @@
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
-import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.compaction.Scrubber;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.db.compaction.Scrubber;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.apache.cassandra.Util.cellname;
 import static org.apache.cassandra.Util.column;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
@@ -63,7 +66,7 @@
     public String COUNTER_CF = "Counter1";
 
     @Test
-    public void testScrubOneRow() throws IOException, ExecutionException, InterruptedException, ConfigurationException
+    public void testScrubOneRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
         Keyspace keyspace = Keyspace.open(KEYSPACE);
@@ -85,7 +88,7 @@
     }
 
     @Test
-    public void testScrubCorruptedCounterRow() throws IOException, InterruptedException, ExecutionException
+    public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutException
     {
         CompactionManager.instance.disableAutoCompaction();
         Keyspace keyspace = Keyspace.open(KEYSPACE);
@@ -100,8 +103,8 @@
         SSTableReader sstable = cfs.getSSTables().iterator().next();
 
         // overwrite one row with garbage
-        long row0Start = sstable.getPosition(RowPosition.forKey(ByteBufferUtil.bytes("0"), sstable.partitioner), SSTableReader.Operator.EQ).position;
-        long row1Start = sstable.getPosition(RowPosition.forKey(ByteBufferUtil.bytes("1"), sstable.partitioner), SSTableReader.Operator.EQ).position;
+        long row0Start = sstable.getPosition(RowPosition.ForKey.get(ByteBufferUtil.bytes("0"), sstable.partitioner), SSTableReader.Operator.EQ).position;
+        long row1Start = sstable.getPosition(RowPosition.ForKey.get(ByteBufferUtil.bytes("1"), sstable.partitioner), SSTableReader.Operator.EQ).position;
         long startPosition = row0Start < row1Start ? row0Start : row1Start;
         long endPosition = row0Start < row1Start ? row1Start : row0Start;
 
@@ -111,7 +114,7 @@
         file.close();
 
         // with skipCorrupted == false, the scrub is expected to fail
-        Scrubber scrubber = new Scrubber(cfs, sstable, false);
+        Scrubber scrubber = new Scrubber(cfs, sstable, false, false);
         try
         {
             scrubber.scrub();
@@ -120,10 +123,9 @@
         catch (IOError err) {}
 
         // with skipCorrupted == true, the corrupt row will be skipped
-        scrubber = new Scrubber(cfs, sstable, true);
+        scrubber = new Scrubber(cfs, sstable, true, false);
         scrubber.scrub();
         scrubber.close();
-        cfs.replaceCompactedSSTables(Collections.singletonList(sstable), Collections.singletonList(scrubber.getNewSSTable()), OperationType.SCRUB);
         assertEquals(1, cfs.getSSTables().size());
 
         // verify that we can read all of the rows, and there is now one less row
@@ -132,16 +134,16 @@
     }
 
     @Test
-    public void testScrubDeletedRow() throws IOException, ExecutionException, InterruptedException, ConfigurationException
+    public void testScrubDeletedRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF3);
         cfs.clearUnsafe();
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(KEYSPACE, CF3);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, CF3);
         cf.delete(new DeletionInfo(0, 1)); // expired tombstone
-        RowMutation rm = new RowMutation(KEYSPACE, ByteBufferUtil.bytes(1), cf);
+        Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(1), cf);
         rm.applyUnsafe();
         cfs.forceBlockingFlush();
 
@@ -150,7 +152,7 @@
     }
 
     @Test
-    public void testScrubMultiRow() throws IOException, ExecutionException, InterruptedException, ConfigurationException
+    public void testScrubMultiRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
         Keyspace keyspace = Keyspace.open(KEYSPACE);
@@ -185,7 +187,7 @@
          * The test also assumes an ordered partitioner.
          *
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cf.addColumn(new Column(ByteBufferUtil.bytes("someName"), ByteBufferUtil.bytes("someValue"), 0L));
+        cf.addColumn(new Cell(ByteBufferUtil.bytes("someName"), ByteBufferUtil.bytes("someValue"), 0L));
 
         SSTableWriter writer = new SSTableWriter(cfs.getTempSSTablePath(new File(System.getProperty("corrupt-sstable-root"))),
                                                  cfs.metadata.getIndexInterval(),
@@ -205,7 +207,7 @@
         assert root != null;
         File rootDir = new File(root);
         assert rootDir.isDirectory();
-        Descriptor desc = new Descriptor(new Descriptor.Version("jb"), rootDir, KEYSPACE, columnFamily, 1, false);
+        Descriptor desc = new Descriptor(new Descriptor.Version("jb"), rootDir, KEYSPACE, columnFamily, 1, Descriptor.Type.FINAL);
         CFMetaData metadata = Schema.instance.getCFMetaData(desc.ksname, desc.cfname);
 
         try
@@ -226,7 +228,7 @@
         components.add(Component.TOC);
         SSTableReader sstable = SSTableReader.openNoValidation(desc, components, metadata);
 
-        Scrubber scrubber = new Scrubber(cfs, sstable, false);
+        Scrubber scrubber = new Scrubber(cfs, sstable, false, true);
         scrubber.scrub();
 
         cfs.loadNewSSTables();
@@ -247,30 +249,30 @@
         return true;
     }
 
-    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable) throws ExecutionException, InterruptedException, IOException
+    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable)
     {
         for (int i = 0; i < rowsPerSSTable; i++)
         {
             String key = String.valueOf(i);
             // create a row and update the birthdate value, test that the index query fetches the new version
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(KEYSPACE, CF);
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, CF);
             cf.addColumn(column("c1", "1", 1L));
             cf.addColumn(column("c2", "2", 1L));
-            RowMutation rm = new RowMutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
+            Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
             rm.applyUnsafe();
         }
 
         cfs.forceBlockingFlush();
     }
 
-    protected void fillCounterCF(ColumnFamilyStore cfs, int rowsPerSSTable) throws ExecutionException, InterruptedException, IOException
+    protected void fillCounterCF(ColumnFamilyStore cfs, int rowsPerSSTable) throws WriteTimeoutException
     {
         for (int i = 0; i < rowsPerSSTable; i++)
         {
             String key = String.valueOf(i);
-            ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(KEYSPACE, COUNTER_CF);
-            RowMutation rm = new RowMutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
-            rm.addCounter(COUNTER_CF, ByteBufferUtil.bytes("Column1"), 100);
+            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, COUNTER_CF);
+            Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
+            rm.addCounter(COUNTER_CF, cellname("Column1"), 100);
             CounterMutation cm = new CounterMutation(rm, ConsistencyLevel.ONE);
             cm.apply();
         }
@@ -286,7 +288,7 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("test_compact_static_columns");
 
-        QueryProcessor.processInternal("INSERT INTO \"Keyspace1\".test_compact_static_columns (a, b, c, d) VALUES (123, c3db07e8-b602-11e3-bc6b-e0b9a54a6d93, true, 'foobar')");
+        QueryProcessor.executeInternal("INSERT INTO \"Keyspace1\".test_compact_static_columns (a, b, c, d) VALUES (123, c3db07e8-b602-11e3-bc6b-e0b9a54a6d93, true, 'foobar')");
         cfs.forceBlockingFlush();
         CompactionManager.instance.performScrub(cfs, false);
     }
@@ -300,10 +302,10 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("UUIDKeys");
 
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create("Keyspace1", "UUIDKeys");
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "UUIDKeys");
         cf.addColumn(column(CFMetaData.DEFAULT_KEY_ALIAS, "not a uuid", 1L));
-        RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes(UUIDGen.getTimeUUID()), cf);
-        rm.applyUnsafe();
+        Mutation mutation = new Mutation("Keyspace1", ByteBufferUtil.bytes(UUIDGen.getTimeUUID()), cf);
+        mutation.applyUnsafe();
         cfs.forceBlockingFlush();
         CompactionManager.instance.performScrub(cfs, false);
 
@@ -322,14 +324,14 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("test_compact_dynamic_columns");
 
-        QueryProcessor.processInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'a', 'foo')");
-        QueryProcessor.processInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'b', 'bar')");
-        QueryProcessor.processInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'c', 'boo')");
+        QueryProcessor.executeInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'a', 'foo')");
+        QueryProcessor.executeInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'b', 'bar')");
+        QueryProcessor.executeInternal("INSERT INTO \"Keyspace1\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'c', 'boo')");
         cfs.forceBlockingFlush();
         CompactionManager.instance.performScrub(cfs, true);
 
         // Scrub is silent, but it will remove broken records, so read everything back to make sure nothing got "scrubbed away"
-        UntypedResultSet rs = QueryProcessor.processInternal("SELECT * FROM \"Keyspace1\".test_compact_dynamic_columns");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM \"Keyspace1\".test_compact_dynamic_columns");
         assertEquals(3, rs.size());
 
         Iterator<UntypedResultSet.Row> iter = rs.iterator();
diff --git a/test/unit/org/apache/cassandra/db/SecondaryIndexColumnSizeTest.java b/test/unit/org/apache/cassandra/db/SecondaryIndexCellSizeTest.java
similarity index 74%
rename from test/unit/org/apache/cassandra/db/SecondaryIndexColumnSizeTest.java
rename to test/unit/org/apache/cassandra/db/SecondaryIndexCellSizeTest.java
index 51d9ee8..45f9e05 100644
--- a/test/unit/org/apache/cassandra/db/SecondaryIndexColumnSizeTest.java
+++ b/test/unit/org/apache/cassandra/db/SecondaryIndexCellSizeTest.java
@@ -23,16 +23,20 @@
 
 import org.junit.Test;
 
-import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.index.PerColumnSecondaryIndex;
 import org.apache.cassandra.db.index.PerRowSecondaryIndex;
 import org.apache.cassandra.db.index.SecondaryIndexSearcher;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-public class SecondaryIndexColumnSizeTest
+public class SecondaryIndexCellSizeTest
 {
     @Test
     public void test64kColumn()
@@ -47,13 +51,13 @@
 
         // for read
         buffer.flip();
-        Column column = new Column(ByteBufferUtil.bytes("test"), buffer, 0);
+        Cell cell = new BufferCell(CellNames.simpleDense(ByteBufferUtil.bytes("test")), buffer, 0);
 
-        SecondaryIndexColumnSizeTest.MockRowIndex mockRowIndex = new SecondaryIndexColumnSizeTest.MockRowIndex();
-        SecondaryIndexColumnSizeTest.MockColumnIndex mockColumnIndex = new SecondaryIndexColumnSizeTest.MockColumnIndex();
+        SecondaryIndexCellSizeTest.MockRowIndex mockRowIndex = new SecondaryIndexCellSizeTest.MockRowIndex();
+        SecondaryIndexCellSizeTest.MockColumnIndex mockColumnIndex = new SecondaryIndexCellSizeTest.MockColumnIndex();
 
-        assertTrue(mockRowIndex.validate(column));
-        assertFalse(mockColumnIndex.validate(column));
+        assertTrue(mockRowIndex.validate(cell));
+        assertFalse(mockColumnIndex.validate(cell));
 
         // test less than 64k value
         buffer.flip();
@@ -61,62 +65,47 @@
         buffer.putInt(20);
         buffer.flip();
 
-        assertTrue(mockRowIndex.validate(column));
-        assertTrue(mockColumnIndex.validate(column));
+        assertTrue(mockRowIndex.validate(cell));
+        assertTrue(mockColumnIndex.validate(cell));
     }
 
     private class MockRowIndex extends PerRowSecondaryIndex
     {
-        @Override
         public void init()
         {
         }
 
-        @Override
         public void validateOptions() throws ConfigurationException
         {
         }
 
-        @Override
         public String getIndexName()
         {
             return null;
         }
 
-        @Override
         protected SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
         {
             return null;
         }
 
-        @Override
         public void forceBlockingFlush()
         {
         }
 
-        @Override
-        public long getLiveSize()
-        {
-            return 0;
-        }
-
-        @Override
         public ColumnFamilyStore getIndexCfs()
         {
             return null;
         }
 
-        @Override
         public void removeIndex(ByteBuffer columnName)
         {
         }
 
-        @Override
         public void invalidate()
         {
         }
 
-        @Override
         public void truncateBlocking(long truncatedAt)
         {
         }
@@ -125,17 +114,26 @@
         {
         }
 
+        public void delete(DecoratedKey key, OpOrder.Group opGroup)
+        {
+        }
+
         public void index(ByteBuffer rowKey)
         {
         }
 
-        public void delete(DecoratedKey key)
+        public void reload()
         {
         }
 
-        @Override
-        public void reload()
+        public boolean indexes(CellName name)
         {
+            return true;
+        }
+
+        @Override
+        public long estimateResultRows() {
+            return 0;
         }
     }
 
@@ -170,12 +168,6 @@
         }
 
         @Override
-        public long getLiveSize()
-        {
-            return 0;
-        }
-
-        @Override
         public ColumnFamilyStore getIndexCfs()
         {
             return null;
@@ -197,17 +189,17 @@
         }
 
         @Override
-        public void delete(ByteBuffer rowKey, Column col)
+        public void delete(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
         {
         }
 
         @Override
-        public void insert(ByteBuffer rowKey, Column col)
+        public void insert(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
         {
         }
 
         @Override
-        public void update(ByteBuffer rowKey, Column col)
+        public void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup)
         {
         }
 
@@ -215,5 +207,15 @@
         public void reload()
         {
         }
+
+        public boolean indexes(CellName name)
+        {
+            return true;
+        }
+
+        @Override
+        public long estimateResultRows() {
+            return 0;
+        }
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/SerializationsTest.java b/test/unit/org/apache/cassandra/db/SerializationsTest.java
index 119b47a..0dd6b8f 100644
--- a/test/unit/org/apache/cassandra/db/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/db/SerializationsTest.java
@@ -20,14 +20,15 @@
 
 import org.apache.cassandra.AbstractSerializationsTester;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.net.CallbackInfo;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
@@ -35,11 +36,9 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import org.junit.BeforeClass;
 import org.junit.Test;
 
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -48,20 +47,14 @@
 {
     Statics statics = new Statics();
 
-    @BeforeClass
-    public static void loadSchema() throws IOException, ConfigurationException
-    {
-        loadSchema(true);
-    }
-
     private ByteBuffer startCol = ByteBufferUtil.bytes("Start");
     private ByteBuffer stopCol = ByteBufferUtil.bytes("Stop");
-    private ByteBuffer emptyCol = ByteBufferUtil.bytes("");
+    private Composite emptyCol = Composites.EMPTY;
     public NamesQueryFilter namesPred = new NamesQueryFilter(statics.NamedCols);
     public NamesQueryFilter namesSCPred = new NamesQueryFilter(statics.NamedSCCols);
     public SliceQueryFilter emptyRangePred = new SliceQueryFilter(emptyCol, emptyCol, false, 100);
-    public SliceQueryFilter nonEmptyRangePred = new SliceQueryFilter(startCol, stopCol, true, 100);
-    public SliceQueryFilter nonEmptyRangeSCPred = new SliceQueryFilter(CompositeType.build(statics.SC, startCol), CompositeType.build(statics.SC, stopCol), true, 100);
+    public SliceQueryFilter nonEmptyRangePred = new SliceQueryFilter(CellNames.simpleDense(startCol), CellNames.simpleDense(stopCol), true, 100);
+    public SliceQueryFilter nonEmptyRangeSCPred = new SliceQueryFilter(CellNames.compositeDense(statics.SC, startCol), CellNames.compositeDense(statics.SC, stopCol), true, 100);
 
     private void testRangeSliceCommandWrite() throws IOException
     {
@@ -81,7 +74,7 @@
         RangeSliceCommand regRangeCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, nonEmptyRangeSCPred, bounds, 100);
         MessageOut<RangeSliceCommand> regRangeCmdSupMsg = regRangeCmdSup.createMessage();
 
-        DataOutputStream out = getOutput("db.RangeSliceCommand.bin");
+        DataOutputStreamAndChannel out = getOutput("db.RangeSliceCommand.bin");
         namesCmdMsg.serialize(out, getVersion());
         emptyRangeCmdMsg.serialize(out, getVersion());
         regRangeCmdMsg.serialize(out, getVersion());
@@ -116,7 +109,7 @@
         SliceByNamesReadCommand standardCmd = new SliceByNamesReadCommand(statics.KS, statics.Key, statics.StandardCF, statics.readTs, namesPred);
         SliceByNamesReadCommand superCmd = new SliceByNamesReadCommand(statics.KS, statics.Key, statics.SuperCF, statics.readTs, namesSCPred);
 
-        DataOutputStream out = getOutput("db.SliceByNamesReadCommand.bin");
+        DataOutputStreamAndChannel out = getOutput("db.SliceByNamesReadCommand.bin");
         SliceByNamesReadCommand.serializer.serialize(standardCmd, out, getVersion());
         SliceByNamesReadCommand.serializer.serialize(superCmd, out, getVersion());
         ReadCommand.serializer.serialize(standardCmd, out, getVersion());
@@ -151,7 +144,7 @@
         SliceFromReadCommand standardCmd = new SliceFromReadCommand(statics.KS, statics.Key, statics.StandardCF, statics.readTs, nonEmptyRangePred);
         SliceFromReadCommand superCmd = new SliceFromReadCommand(statics.KS, statics.Key, statics.SuperCF, statics.readTs, nonEmptyRangeSCPred);
         
-        DataOutputStream out = getOutput("db.SliceFromReadCommand.bin");
+        DataOutputStreamAndChannel out = getOutput("db.SliceFromReadCommand.bin");
         SliceFromReadCommand.serializer.serialize(standardCmd, out, getVersion());
         SliceFromReadCommand.serializer.serialize(superCmd, out, getVersion());
         ReadCommand.serializer.serialize(standardCmd, out, getVersion());
@@ -184,7 +177,7 @@
 
     private void testRowWrite() throws IOException
     {
-        DataOutputStream out = getOutput("db.Row.bin");
+        DataOutputStreamAndChannel out = getOutput("db.Row.bin");
         Row.serializer.serialize(statics.StandardRow, out, getVersion());
         Row.serializer.serialize(statics.SuperRow, out, getVersion());
         Row.serializer.serialize(statics.NullRow, out, getVersion());
@@ -199,8 +192,9 @@
     @Test
     public void testRowRead() throws IOException
     {
-        if (EXECUTE_WRITES)
-            testRowWrite();
+        // Since every table creation generates a different CF ID,
+        // we need to regenerate the file every time
+        testRowWrite();
 
         DataInputStream in = getInput("db.Row.bin");
         assert Row.serializer.deserialize(in, getVersion()) != null;
@@ -209,23 +203,23 @@
         in.close();
     }
 
-    private void testRowMutationWrite() throws IOException
+    private void testMutationWrite() throws IOException
     {
-        RowMutation standardRowRm = new RowMutation(statics.KS, statics.StandardRow);
-        RowMutation superRowRm = new RowMutation(statics.KS, statics.SuperRow);
-        RowMutation standardRm = new RowMutation(statics.KS, statics.Key, statics.StandardCf);
-        RowMutation superRm = new RowMutation(statics.KS, statics.Key, statics.SuperCf);
+        Mutation standardRowRm = new Mutation(statics.KS, statics.StandardRow);
+        Mutation superRowRm = new Mutation(statics.KS, statics.SuperRow);
+        Mutation standardRm = new Mutation(statics.KS, statics.Key, statics.StandardCf);
+        Mutation superRm = new Mutation(statics.KS, statics.Key, statics.SuperCf);
         Map<UUID, ColumnFamily> mods = new HashMap<UUID, ColumnFamily>();
         mods.put(statics.StandardCf.metadata().cfId, statics.StandardCf);
         mods.put(statics.SuperCf.metadata().cfId, statics.SuperCf);
-        RowMutation mixedRm = new RowMutation(statics.KS, statics.Key, mods);
+        Mutation mixedRm = new Mutation(statics.KS, statics.Key, mods);
 
-        DataOutputStream out = getOutput("db.RowMutation.bin");
-        RowMutation.serializer.serialize(standardRowRm, out, getVersion());
-        RowMutation.serializer.serialize(superRowRm, out, getVersion());
-        RowMutation.serializer.serialize(standardRm, out, getVersion());
-        RowMutation.serializer.serialize(superRm, out, getVersion());
-        RowMutation.serializer.serialize(mixedRm, out, getVersion());
+        DataOutputStreamAndChannel out = getOutput("db.RowMutation.bin");
+        Mutation.serializer.serialize(standardRowRm, out, getVersion());
+        Mutation.serializer.serialize(superRowRm, out, getVersion());
+        Mutation.serializer.serialize(standardRm, out, getVersion());
+        Mutation.serializer.serialize(superRm, out, getVersion());
+        Mutation.serializer.serialize(mixedRm, out, getVersion());
 
         standardRowRm.createMessage().serialize(out, getVersion());
         superRowRm.createMessage().serialize(out, getVersion());
@@ -236,27 +230,26 @@
         out.close();
 
         // test serializedSize
-        testSerializedSize(standardRowRm, RowMutation.serializer);
-        testSerializedSize(superRowRm, RowMutation.serializer);
-        testSerializedSize(standardRm, RowMutation.serializer);
-        testSerializedSize(superRm, RowMutation.serializer);
-        testSerializedSize(mixedRm, RowMutation.serializer);
+        testSerializedSize(standardRowRm, Mutation.serializer);
+        testSerializedSize(superRowRm, Mutation.serializer);
+        testSerializedSize(standardRm, Mutation.serializer);
+        testSerializedSize(superRm, Mutation.serializer);
+        testSerializedSize(mixedRm, Mutation.serializer);
     }
 
     @Test
-    public void testRowMutationRead() throws IOException
+    public void testMutationRead() throws IOException
     {
-        // row mutation deserialization requires being able to look up the keyspace in the schema,
-        // so we need to rewrite this each time.  We can go back to testing on-disk data
-        // once we pull RM.keyspace field out.
-        testRowMutationWrite();
+        // Mutation deserialization requires being able to look up the keyspace in the schema,
+        // so we need to rewrite this each time. Plus, the CF ID is different for every run.
+        testMutationWrite();
 
         DataInputStream in = getInput("db.RowMutation.bin");
-        assert RowMutation.serializer.deserialize(in, getVersion()) != null;
-        assert RowMutation.serializer.deserialize(in, getVersion()) != null;
-        assert RowMutation.serializer.deserialize(in, getVersion()) != null;
-        assert RowMutation.serializer.deserialize(in, getVersion()) != null;
-        assert RowMutation.serializer.deserialize(in, getVersion()) != null;
+        assert Mutation.serializer.deserialize(in, getVersion()) != null;
+        assert Mutation.serializer.deserialize(in, getVersion()) != null;
+        assert Mutation.serializer.deserialize(in, getVersion()) != null;
+        assert Mutation.serializer.deserialize(in, getVersion()) != null;
+        assert Mutation.serializer.deserialize(in, getVersion()) != null;
         assert MessageIn.read(in, getVersion(), -1) != null;
         assert MessageIn.read(in, getVersion(), -1) != null;
         assert MessageIn.read(in, getVersion(), -1) != null;
@@ -270,7 +263,7 @@
         Truncation tr = new Truncation(statics.KS, "Doesn't Really Matter");
         TruncateResponse aff = new TruncateResponse(statics.KS, "Doesn't Matter Either", true);
         TruncateResponse neg = new TruncateResponse(statics.KS, "Still Doesn't Matter", false);
-        DataOutputStream out = getOutput("db.Truncation.bin");
+        DataOutputStreamAndChannel out = getOutput("db.Truncation.bin");
         Truncation.serializer.serialize(tr, out, getVersion());
         TruncateResponse.serializer.serialize(aff, out, getVersion());
         TruncateResponse.serializer.serialize(neg, out, getVersion());
@@ -312,7 +305,7 @@
     {
         WriteResponse aff = new WriteResponse();
         WriteResponse neg = new WriteResponse();
-        DataOutputStream out = getOutput("db.WriteResponse.bin");
+        DataOutputStreamAndChannel out = getOutput("db.WriteResponse.bin");
         WriteResponse.serializer.serialize(aff, out, getVersion());
         WriteResponse.serializer.serialize(neg, out, getVersion());
         out.close();
@@ -334,34 +327,40 @@
         in.close();
     }
 
-    private static ByteBuffer bb(String s) {
+    private static ByteBuffer bb(String s)
+    {
         return ByteBufferUtil.bytes(s);
     }
 
+    private static CellName cn(String s)
+    {
+        return CellNames.simpleDense(ByteBufferUtil.bytes(s));
+    }
+
     private static class Statics
     {
         private final String KS = "Keyspace1";
         private final ByteBuffer Key = ByteBufferUtil.bytes("Key01");
-        private final SortedSet<ByteBuffer> NamedCols = new TreeSet<ByteBuffer>(BytesType.instance)
+        private final SortedSet<CellName> NamedCols = new TreeSet<CellName>(new SimpleDenseCellNameType(BytesType.instance))
         {{
-            add(ByteBufferUtil.bytes("AAA"));
-            add(ByteBufferUtil.bytes("BBB"));
-            add(ByteBufferUtil.bytes("CCC"));
+            add(CellNames.simpleDense(ByteBufferUtil.bytes("AAA")));
+            add(CellNames.simpleDense(ByteBufferUtil.bytes("BBB")));
+            add(CellNames.simpleDense(ByteBufferUtil.bytes("CCC")));
         }};
         private final ByteBuffer SC = ByteBufferUtil.bytes("SCName");
-        private final SortedSet<ByteBuffer> NamedSCCols = new TreeSet<ByteBuffer>(BytesType.instance)
+        private final SortedSet<CellName> NamedSCCols = new TreeSet<CellName>(new CompoundDenseCellNameType(Arrays.<AbstractType<?>>asList(BytesType.instance, BytesType.instance)))
         {{
-            add(CompositeType.build(SC, ByteBufferUtil.bytes("AAA")));
-            add(CompositeType.build(SC, ByteBufferUtil.bytes("BBB")));
-            add(CompositeType.build(SC, ByteBufferUtil.bytes("CCC")));
+            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("AAA")));
+            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("BBB")));
+            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("CCC")));
         }};
         private final String StandardCF = "Standard1";
         private final String SuperCF = "Super1";
 
         private final long readTs = 1369935512292L;
 
-        private final ColumnFamily StandardCf = TreeMapBackedSortedColumns.factory.create(KS, StandardCF);
-        private final ColumnFamily SuperCf = TreeMapBackedSortedColumns.factory.create(KS, SuperCF);
+        private final ColumnFamily StandardCf = ArrayBackedSortedColumns.factory.create(KS, StandardCF);
+        private final ColumnFamily SuperCf = ArrayBackedSortedColumns.factory.create(KS, SuperCF);
 
         private final Row StandardRow = new Row(Util.dk("key0"), StandardCf);
         private final Row SuperRow = new Row(Util.dk("key1"), SuperCf);
@@ -369,21 +368,21 @@
 
         private Statics()
         {
-            StandardCf.addColumn(new Column(bb("aaaa")));
-            StandardCf.addColumn(new Column(bb("bbbb"), bb("bbbbb-value")));
-            StandardCf.addColumn(new Column(bb("cccc"), bb("ccccc-value"), 1000L));
-            StandardCf.addColumn(new DeletedColumn(bb("dddd"), 500, 1000));
-            StandardCf.addColumn(new DeletedColumn(bb("eeee"), bb("eeee-value"), 1001));
-            StandardCf.addColumn(new ExpiringColumn(bb("ffff"), bb("ffff-value"), 2000, 1000));
-            StandardCf.addColumn(new ExpiringColumn(bb("gggg"), bb("gggg-value"), 2001, 1000, 2002));
+            StandardCf.addColumn(new BufferCell(cn("aaaa")));
+            StandardCf.addColumn(new BufferCell(cn("bbbb"), bb("bbbbb-value")));
+            StandardCf.addColumn(new BufferCell(cn("cccc"), bb("ccccc-value"), 1000L));
+            StandardCf.addColumn(new BufferDeletedCell(cn("dddd"), 500, 1000));
+            StandardCf.addColumn(new BufferDeletedCell(cn("eeee"), bb("eeee-value"), 1001));
+            StandardCf.addColumn(new BufferExpiringCell(cn("ffff"), bb("ffff-value"), 2000, 1000));
+            StandardCf.addColumn(new BufferExpiringCell(cn("gggg"), bb("gggg-value"), 2001, 1000, 2002));
 
-            SuperCf.addColumn(new Column(CompositeType.build(SC, bb("aaaa"))));
-            SuperCf.addColumn(new Column(CompositeType.build(SC, bb("bbbb")), bb("bbbbb-value")));
-            SuperCf.addColumn(new Column(CompositeType.build(SC, bb("cccc")), bb("ccccc-value"), 1000L));
-            SuperCf.addColumn(new DeletedColumn(CompositeType.build(SC, bb("dddd")), 500, 1000));
-            SuperCf.addColumn(new DeletedColumn(CompositeType.build(SC, bb("eeee")), bb("eeee-value"), 1001));
-            SuperCf.addColumn(new ExpiringColumn(CompositeType.build(SC, bb("ffff")), bb("ffff-value"), 2000, 1000));
-            SuperCf.addColumn(new ExpiringColumn(CompositeType.build(SC, bb("gggg")), bb("gggg-value"), 2001, 1000, 2002));
+            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("aaaa"))));
+            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("bbbb")), bb("bbbbb-value")));
+            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("cccc")), bb("ccccc-value"), 1000L));
+            SuperCf.addColumn(new BufferDeletedCell(CellNames.compositeDense(SC, bb("dddd")), 500, 1000));
+            SuperCf.addColumn(new BufferDeletedCell(CellNames.compositeDense(SC, bb("eeee")), bb("eeee-value"), 1001));
+            SuperCf.addColumn(new BufferExpiringCell(CellNames.compositeDense(SC, bb("ffff")), bb("ffff-value"), 2000, 1000));
+            SuperCf.addColumn(new BufferExpiringCell(CellNames.compositeDense(SC, bb("gggg")), bb("gggg-value"), 2001, 1000, 2002));
         }
     }
 }
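The Statics block above shows the 2.1 cell-construction idiom in full. A condensed sketch, using only the factory calls and constructors that appear in this hunk (import paths for the Buffer* cells are assumed):

```java
// Sketch only: building cells the 2.1 way, as in the Statics initializer above.
// Constructor argument order follows the usages in the hunk.
import java.nio.ByteBuffer;

import org.apache.cassandra.db.BufferCell;
import org.apache.cassandra.db.BufferDeletedCell;
import org.apache.cassandra.db.BufferExpiringCell;
import org.apache.cassandra.db.Cell;
import org.apache.cassandra.db.composites.CellName;
import org.apache.cassandra.db.composites.CellNames;
import org.apache.cassandra.utils.ByteBufferUtil;

public final class CellConstructionSketch
{
    public static void main(String[] args)
    {
        ByteBuffer name = ByteBufferUtil.bytes("cccc");
        ByteBuffer value = ByteBufferUtil.bytes("ccccc-value");

        // 2.0 used raw ByteBuffers (or CompositeType.build for supercolumns);
        // 2.1 wraps names in CellName via the CellNames factories.
        CellName simple = CellNames.simpleDense(name);
        CellName composite = CellNames.compositeDense(ByteBufferUtil.bytes("SCName"), name);

        // Column / DeletedColumn / ExpiringColumn become Buffer* cells.
        Cell live = new BufferCell(simple, value, 1000L);
        Cell deleted = new BufferDeletedCell(simple, 500, 1000);
        Cell expiring = new BufferExpiringCell(simple, value, 2000, 1000);

        // The underlying bytes are still reachable through the CellName.
        System.out.println(live.name().toByteBuffer().equals(name));
    }
}
```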
diff --git a/test/unit/org/apache/cassandra/db/TimeSortTest.java b/test/unit/org/apache/cassandra/db/TimeSortTest.java
index 37966de..80c0ff1 100644
--- a/test/unit/org/apache/cassandra/db/TimeSortTest.java
+++ b/test/unit/org/apache/cassandra/db/TimeSortTest.java
@@ -19,58 +19,57 @@
 package org.apache.cassandra.db;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.concurrent.ExecutionException;
 import java.util.*;
 
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
 
 import org.apache.cassandra.SchemaLoader;
+import static org.apache.cassandra.Util.cellname;
 import static org.apache.cassandra.Util.getBytes;
 import org.apache.cassandra.Util;
 
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 
 public class TimeSortTest extends SchemaLoader
 {
     @Test
-    public void testMixedSources() throws IOException, ExecutionException, InterruptedException
+    public void testMixedSources()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("StandardLong1");
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey key = Util.dk("key0");
 
-        rm = new RowMutation("Keyspace1", key.key);
-        rm.add("StandardLong1", getBytes(100), ByteBufferUtil.bytes("a"), 100);
+        rm = new Mutation("Keyspace1", key.getKey());
+        rm.add("StandardLong1", cellname(100), ByteBufferUtil.bytes("a"), 100);
         rm.apply();
         cfStore.forceBlockingFlush();
 
-        rm = new RowMutation("Keyspace1", key.key);
-        rm.add("StandardLong1", getBytes(0), ByteBufferUtil.bytes("b"), 0);
+        rm = new Mutation("Keyspace1", key.getKey());
+        rm.add("StandardLong1", cellname(0), ByteBufferUtil.bytes("b"), 0);
         rm.apply();
 
-        ColumnFamily cf = cfStore.getColumnFamily(key, getBytes(10), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1000, System.currentTimeMillis());
-        Collection<Column> columns = cf.getSortedColumns();
-        assert columns.size() == 1;
+        ColumnFamily cf = cfStore.getColumnFamily(key, cellname(10), Composites.EMPTY, false, 1000, System.currentTimeMillis());
+        Collection<Cell> cells = cf.getSortedColumns();
+        assert cells.size() == 1;
     }
 
     @Test
-    public void testTimeSort() throws IOException, ExecutionException, InterruptedException
+    public void testTimeSort() throws IOException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("StandardLong1");
 
         for (int i = 900; i < 1000; ++i)
         {
-            RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes(Integer.toString(i)));
+            Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes(Integer.toString(i)));
             for (int j = 0; j < 8; ++j)
             {
-                rm.add("StandardLong1", getBytes(j * 2), ByteBufferUtil.bytes("a"), j * 2);
+                rm.add("StandardLong1", cellname(j * 2), ByteBufferUtil.bytes("a"), j * 2);
             }
             rm.apply();
         }
@@ -82,54 +81,50 @@
 
         // interleave some new data to test memtable + sstable
         DecoratedKey key = Util.dk("900");
-        RowMutation rm = new RowMutation("Keyspace1", key.key);
+        Mutation rm = new Mutation("Keyspace1", key.getKey());
         for (int j = 0; j < 4; ++j)
         {
-            rm.add("StandardLong1", getBytes(j * 2 + 1), ByteBufferUtil.bytes("b"), j * 2 + 1);
+            rm.add("StandardLong1", cellname(j * 2 + 1), ByteBufferUtil.bytes("b"), j * 2 + 1);
         }
         rm.apply();
         // and some overwrites
-        rm = new RowMutation("Keyspace1", key.key);
-        rm.add("StandardLong1", getBytes(0), ByteBufferUtil.bytes("c"), 100);
-        rm.add("StandardLong1", getBytes(10), ByteBufferUtil.bytes("c"), 100);
+        rm = new Mutation("Keyspace1", key.getKey());
+        rm.add("StandardLong1", cellname(0), ByteBufferUtil.bytes("c"), 100);
+        rm.add("StandardLong1", cellname(10), ByteBufferUtil.bytes("c"), 100);
         rm.apply();
 
         // verify
-        ColumnFamily cf = cfStore.getColumnFamily(key, getBytes(0), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1000, System.currentTimeMillis());
-        Collection<Column> columns = cf.getSortedColumns();
-        assertEquals(12, columns.size());
-        Iterator<Column> iter = columns.iterator();
-        Column column;
+        ColumnFamily cf = cfStore.getColumnFamily(key, cellname(0), Composites.EMPTY, false, 1000, System.currentTimeMillis());
+        Collection<Cell> cells = cf.getSortedColumns();
+        assertEquals(12, cells.size());
+        Iterator<Cell> iter = cells.iterator();
+        Cell cell;
         for (int j = 0; j < 8; j++)
         {
-            column = iter.next();
-            assert column.name().equals(getBytes(j));
+            cell = iter.next();
+            assert cell.name().toByteBuffer().equals(getBytes(j));
         }
-        TreeSet<ByteBuffer> columnNames = new TreeSet<ByteBuffer>(LongType.instance);
-        columnNames.add(getBytes(10));
-        columnNames.add(getBytes(0));
+        TreeSet<CellName> columnNames = new TreeSet<CellName>(cfStore.getComparator());
+        columnNames.add(cellname(10));
+        columnNames.add(cellname(0));
         cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(Util.dk("900"), "StandardLong1", columnNames, System.currentTimeMillis()));
-        assert "c".equals(ByteBufferUtil.string(cf.getColumn(getBytes(0)).value()));
-        assert "c".equals(ByteBufferUtil.string(cf.getColumn(getBytes(10)).value()));
+        assert "c".equals(ByteBufferUtil.string(cf.getColumn(cellname(0)).value()));
+        assert "c".equals(ByteBufferUtil.string(cf.getColumn(cellname(10)).value()));
     }
 
-    private void validateTimeSort(Keyspace keyspace) throws IOException
+    private void validateTimeSort(Keyspace keyspace)
     {
         for (int i = 900; i < 1000; ++i)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
             for (int j = 0; j < 8; j += 3)
             {
-                ColumnFamily cf = keyspace.getColumnFamilyStore("StandardLong1").getColumnFamily(key,
-                                                                                                 getBytes(j * 2),
-                                                                                                 ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                                 false,
-                                                                                                 1000,
-                                                                                                 System.currentTimeMillis());
-                Collection<Column> columns = cf.getSortedColumns();
-                assert columns.size() == 8 - j;
+                ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardLong1");
+                ColumnFamily cf = cfs.getColumnFamily(key, cellname(j * 2), Composites.EMPTY, false, 1000, System.currentTimeMillis());
+                Collection<Cell> cells = cf.getSortedColumns();
+                assert cells.size() == 8 - j;
                 int k = j;
-                for (Column c : columns)
+                for (Cell c : cells)
                 {
                     assertEquals((k++) * 2, c.timestamp());
 
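The same mechanical rename runs through this file and the ones below: RowMutation becomes Mutation, DecoratedKey.key becomes getKey(), and raw ByteBuffer column names are wrapped via Util.cellname. A minimal before/after sketch, assuming the test schema (Keyspace1/StandardLong1) that SchemaLoader sets up:

```java
// Sketch of the 2.1 write path used by the tests above; assumes a running test
// environment with the Keyspace1/StandardLong1 schema loaded.
import org.apache.cassandra.Util;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.utils.ByteBufferUtil;

public final class MutationSketch
{
    static void writeOne()
    {
        DecoratedKey key = Util.dk("key0");

        // 2.0: rm = new RowMutation("Keyspace1", key.key);
        Mutation rm = new Mutation("Keyspace1", key.getKey());

        // 2.0: rm.add("StandardLong1", getBytes(100), ByteBufferUtil.bytes("a"), 100);
        rm.add("StandardLong1", Util.cellname(100), ByteBufferUtil.bytes("a"), 100);
        rm.apply();
    }
}
```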
diff --git a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
new file mode 100644
index 0000000..6e1ac5f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.compaction;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.dht.BytesToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.SSTableReader;
+import org.apache.cassandra.io.sstable.SSTableScanner;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.junit.After;
+import org.junit.Test;
+
+import com.google.common.collect.Iterables;
+
+public class AntiCompactionTest extends SchemaLoader
+{
+    private static final String KEYSPACE1 = "Keyspace1";
+    private static final String CF = "Standard1";
+
+    @Test
+    public void antiCompactOne() throws InterruptedException, ExecutionException, IOException
+    {
+        ColumnFamilyStore store = prepareColumnFamilyStore();
+        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
+        assertEquals(store.getSSTables().size(), sstables.size());
+        Range<Token> range = new Range<Token>(new BytesToken("0".getBytes()), new BytesToken("4".getBytes()));
+        List<Range<Token>> ranges = Arrays.asList(range);
+
+        SSTableReader.acquireReferences(sstables);
+        long repairedAt = 1000;
+        CompactionManager.instance.performAnticompaction(store, ranges, sstables, repairedAt);
+
+        assertEquals(2, store.getSSTables().size());
+        int repairedKeys = 0;
+        int nonRepairedKeys = 0;
+        for (SSTableReader sstable : store.getSSTables())
+        {
+            SSTableScanner scanner = sstable.getScanner();
+            while (scanner.hasNext())
+            {
+                SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
+                if (sstable.isRepaired())
+                {
+                    assertTrue(range.contains(row.getKey().getToken()));
+                    repairedKeys++;
+                }
+                else
+                {
+                    assertFalse(range.contains(row.getKey().getToken()));
+                    nonRepairedKeys++;
+                }
+            }
+        }
+        assertEquals(repairedKeys, 4);
+        assertEquals(nonRepairedKeys, 6);
+    }
+    
+    @Test
+    public void shouldSkipAntiCompactionForNonIntersectingRange() throws InterruptedException, ExecutionException, IOException
+    {
+        ColumnFamilyStore store = prepareColumnFamilyStore();
+        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
+        assertEquals(store.getSSTables().size(), sstables.size());
+        Range<Token> range = new Range<Token>(new BytesToken("-10".getBytes()), new BytesToken("-1".getBytes()));
+        List<Range<Token>> ranges = Arrays.asList(range);
+
+        SSTableReader.acquireReferences(sstables);
+        CompactionManager.instance.performAnticompaction(store, ranges, sstables, 0);
+
+        assertThat(store.getSSTables().size(), is(1));
+        assertThat(Iterables.get(store.getSSTables(), 0).isRepaired(), is(false));
+    }
+
+    private ColumnFamilyStore prepareColumnFamilyStore()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
+        store.disableAutoCompaction();
+        long timestamp = System.currentTimeMillis();
+        for (int i = 0; i < 10; i++)
+        {
+            DecoratedKey key = Util.dk(Integer.toString(i));
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            for (int j = 0; j < 10; j++)
+                rm.add("Standard1", Util.cellname(Integer.toString(j)),
+                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                       timestamp,
+                       0);
+            rm.apply();
+        }
+        store.forceBlockingFlush();
+        return store;
+    }
+    
+    @After
+    public void truncateCF()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
+        store.truncateBlocking();
+    }
+}
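Condensed, the new test drives anticompaction as follows; this sketch restates the calls used above and assumes nothing beyond them.

```java
// Condensed flow of AntiCompactionTest: take references on the unrepaired
// sstables, anticompact them against a token range, then inspect the split
// into repaired (keys inside the range) and unrepaired sstables.
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.dht.BytesToken;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.io.sstable.SSTableReader;

public final class AnticompactionSketch
{
    static void anticompact(ColumnFamilyStore store) throws Exception
    {
        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
        List<Range<Token>> ranges =
            Arrays.asList(new Range<Token>(new BytesToken("0".getBytes()), new BytesToken("4".getBytes())));

        // The test takes references on the sstables before handing them off.
        SSTableReader.acquireReferences(sstables);
        CompactionManager.instance.performAnticompaction(store, ranges, sstables, 1000 /* repairedAt */);

        for (SSTableReader sstable : store.getSSTables())
            System.out.println(sstable.isRepaired());
    }
}
```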
diff --git a/test/unit/org/apache/cassandra/db/compaction/BlacklistingCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/BlacklistingCompactionsTest.java
index e392a4b..e6626ea 100644
--- a/test/unit/org/apache/cassandra/db/compaction/BlacklistingCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/BlacklistingCompactionsTest.java
@@ -31,16 +31,14 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.apache.cassandra.Util.cellname;
 
 public class BlacklistingCompactionsTest extends SchemaLoader
 {
@@ -76,7 +74,7 @@
         final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
 
         final int ROWS_PER_SSTABLE = 10;
-        final int SSTABLES = cfs.metadata.getIndexInterval() * 2 / ROWS_PER_SSTABLE;
+        final int SSTABLES = cfs.metadata.getMinIndexInterval() * 2 / ROWS_PER_SSTABLE;
 
         cfs.setCompactionStrategyClass(compactionStrategy);
 
@@ -91,11 +89,9 @@
             for (int i = 0; i < ROWS_PER_SSTABLE; i++)
             {
                 DecoratedKey key = Util.dk(String.valueOf(i % 2));
-                RowMutation rm = new RowMutation(KEYSPACE, key.key);
+                Mutation rm = new Mutation(KEYSPACE, key.getKey());
                 long timestamp = j * ROWS_PER_SSTABLE + i;
-                rm.add("Standard1", ByteBufferUtil.bytes(String.valueOf(i / 2)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp);
+                rm.add("Standard1", cellname(i / 2), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp);
                 maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
                 rm.apply();
                 inserted.add(key);
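This hunk also picks up the index-interval accessor change: the old getIndexInterval() call is replaced by getMinIndexInterval(), reflecting the split of index_interval into separate min/max settings in 2.1. A trivial sketch (CFMetaData's package is assumed):

```java
// Mirrors the SSTABLES computation in the hunk above.
import org.apache.cassandra.config.CFMetaData;

final class IndexIntervalSketch
{
    static int sstablesToWrite(CFMetaData metadata, int rowsPerSSTable)
    {
        // 2.0: metadata.getIndexInterval()
        return metadata.getMinIndexInterval() * 2 / rowsPerSSTable;
    }
}
```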
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
index 8461023..912c7f1 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
@@ -18,31 +18,28 @@
 */
 package org.apache.cassandra.db.compaction;
 
-import java.io.IOException;
 import java.util.Collection;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
+import org.apache.cassandra.db.*;
+
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.Column;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.Util;
 
 import static org.junit.Assert.assertEquals;
+import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
-import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
 
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+
+import static org.apache.cassandra.Util.cellname;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 
@@ -52,7 +49,7 @@
     public static final String KEYSPACE2 = "Keyspace2";
 
     @Test
-    public void testMajorCompactionPurge() throws IOException, ExecutionException, InterruptedException
+    public void testMajorCompactionPurge() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -61,13 +58,13 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
         DecoratedKey key = Util.dk("key1");
-        RowMutation rm;
+        Mutation rm;
 
         // inserts
-        rm = new RowMutation(KEYSPACE1, key.key);
+        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         }
         rm.apply();
         cfs.forceBlockingFlush();
@@ -75,15 +72,15 @@
         // deletes
         for (int i = 0; i < 10; i++)
         {
-            rm = new RowMutation(KEYSPACE1, key.key);
-            rm.delete(cfName, ByteBufferUtil.bytes(String.valueOf(i)), 1);
+            rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.delete(cfName, cellname(String.valueOf(i)), 1);
             rm.apply();
         }
         cfs.forceBlockingFlush();
 
         // resurrect one column
-        rm = new RowMutation(KEYSPACE1, key.key);
-        rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
+        rm = new Mutation(KEYSPACE1, key.getKey());
+        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
         rm.apply();
         cfs.forceBlockingFlush();
 
@@ -92,11 +89,11 @@
         cfs.invalidateCachedRow(key);
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
         assertColumns(cf, "5");
-        assert cf.getColumn(ByteBufferUtil.bytes(String.valueOf(5))) != null;
+        assert cf.getColumn(cellname(String.valueOf(5))) != null;
     }
 
     @Test
-    public void testMinorCompactionPurge() throws IOException, ExecutionException, InterruptedException
+    public void testMinorCompactionPurge()
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -104,15 +101,15 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        RowMutation rm;
+        Mutation rm;
         for (int k = 1; k <= 2; ++k) {
             DecoratedKey key = Util.dk("key" + k);
 
             // inserts
-            rm = new RowMutation(KEYSPACE2, key.key);
+            rm = new Mutation(KEYSPACE2, key.getKey());
             for (int i = 0; i < 10; i++)
             {
-                rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+                rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
             }
             rm.apply();
             cfs.forceBlockingFlush();
@@ -120,8 +117,8 @@
             // deletes
             for (int i = 0; i < 10; i++)
             {
-                rm = new RowMutation(KEYSPACE2, key.key);
-                rm.delete(cfName, ByteBufferUtil.bytes(String.valueOf(i)), 1);
+                rm = new Mutation(KEYSPACE2, key.getKey());
+                rm.delete(cfName, cellname(String.valueOf(i)), 1);
                 rm.apply();
             }
             cfs.forceBlockingFlush();
@@ -134,8 +131,8 @@
         // for first key. Then submit minor compaction on remembered sstables.
         cfs.forceBlockingFlush();
         Collection<SSTableReader> sstablesIncomplete = cfs.getSSTables();
-        rm = new RowMutation(KEYSPACE2, key1.key);
-        rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
+        rm = new Mutation(KEYSPACE2, key1.getKey());
+        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
         rm.apply();
         cfs.forceBlockingFlush();
         cfs.getCompactionStrategy().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
@@ -151,46 +148,51 @@
         assertEquals(1, cf.getColumnCount());
     }
 
+    /**
+     * Verify that a minor compaction does not drop tombstones that might still be relevant.
+     */
     @Test
-    public void testMinTimestampPurge() throws IOException, ExecutionException, InterruptedException
+    public void testMinTimestampPurge()
     {
-        // verify that we don't drop tombstones during a minor compaction that might still be relevant
         CompactionManager.instance.disableAutoCompaction();
+
         Keyspace keyspace = Keyspace.open(KEYSPACE2);
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-
-        RowMutation rm;
+        Mutation rm;
         DecoratedKey key3 = Util.dk("key3");
+
         // inserts
-        rm = new RowMutation(KEYSPACE2, key3.key);
-        rm.add(cfName, ByteBufferUtil.bytes("c1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
-        rm.add(cfName, ByteBufferUtil.bytes("c2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
+        rm = new Mutation(KEYSPACE2, key3.getKey());
+        rm.add(cfName, cellname("c1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
+        rm.add(cfName, cellname("c2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
         rm.apply();
         cfs.forceBlockingFlush();
-        // deletes
-        rm = new RowMutation(KEYSPACE2, key3.key);
-        rm.delete(cfName, ByteBufferUtil.bytes("c1"), 10);
+        // delete c1
+        rm = new Mutation(KEYSPACE2, key3.getKey());
+        rm.delete(cfName, cellname("c1"), 10);
         rm.apply();
         cfs.forceBlockingFlush();
         Collection<SSTableReader> sstablesIncomplete = cfs.getSSTables();
 
-        // delete so we have new delete in a diffrent SST.
-        rm = new RowMutation(KEYSPACE2, key3.key);
-        rm.delete(cfName, ByteBufferUtil.bytes("c2"), 9);
+        // delete c2 so we have a new delete in a different SSTable
+        rm = new Mutation(KEYSPACE2, key3.getKey());
+        rm.delete(cfName, cellname("c2"), 9);
         rm.apply();
         cfs.forceBlockingFlush();
+
+        // compact the sstables with the c1/c2 data and the c1 tombstone
         cfs.getCompactionStrategy().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
 
-        // we should have both the c1 and c2 tombstones still, since the c2 timestamp is older than the c1 tombstone
-        // so it would be invalid to assume we can throw out the c1 entry.
+        // We should still have both the c1 and c2 tombstones. Since the min timestamp of the c2 tombstone
+        // sstable is older than the c1 tombstone's timestamp, it is invalid to throw out the c1 tombstone.
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key3, cfName, System.currentTimeMillis()));
-        assertFalse(cf.getColumn(ByteBufferUtil.bytes("c2")).isLive(System.currentTimeMillis()));
+        assertFalse(cf.getColumn(cellname("c2")).isLive());
         assertEquals(2, cf.getColumnCount());
     }
 
     @Test
-    public void testCompactionPurgeOneFile() throws IOException, ExecutionException, InterruptedException
+    public void testCompactionPurgeOneFile() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -199,21 +201,21 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
         DecoratedKey key = Util.dk("key1");
-        RowMutation rm;
+        Mutation rm;
 
         // inserts
-        rm = new RowMutation(KEYSPACE1, key.key);
+        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 5; i++)
         {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         }
         rm.apply();
 
         // deletes
         for (int i = 0; i < 5; i++)
         {
-            rm = new RowMutation(KEYSPACE1, key.key);
-            rm.delete(cfName, ByteBufferUtil.bytes(String.valueOf(i)), 1);
+            rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.delete(cfName, cellname(String.valueOf(i)), 1);
             rm.apply();
         }
         cfs.forceBlockingFlush();
@@ -227,7 +229,7 @@
     }
 
     @Test
-    public void testCompactionPurgeCachedRow() throws IOException, ExecutionException, InterruptedException
+    public void testCompactionPurgeCachedRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -237,13 +239,13 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
         DecoratedKey key = Util.dk("key3");
-        RowMutation rm;
+        Mutation rm;
 
         // inserts
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         }
         rm.apply();
 
@@ -251,7 +253,7 @@
         cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
 
         // deletes row
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         rm.delete(cfName, 1);
         rm.apply();
 
@@ -260,22 +262,22 @@
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
 
         // re-inserts with timestamp lower than delete
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         }
         rm.apply();
 
         // Check that the second insert went in
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
         assertEquals(10, cf.getColumnCount());
-        for (Column c : cf)
-            assert !c.isMarkedForDelete(System.currentTimeMillis());
+        for (Cell c : cf)
+            assert c.isLive();
     }
 
     @Test
-    public void testCompactionPurgeTombstonedRow() throws IOException, ExecutionException, InterruptedException
+    public void testCompactionPurgeTombstonedRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -283,21 +285,18 @@
         String cfName = "Standard1";
         Keyspace keyspace = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-
         DecoratedKey key = Util.dk("key3");
-        RowMutation rm;
+        Mutation rm;
         QueryFilter filter = QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis());
 
         // inserts
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 10; i++)
-        {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
-        }
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
         rm.apply();
 
         // deletes row with timestamp such that not all columns are deleted
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         rm.delete(cfName, 4);
         rm.apply();
         ColumnFamily cf = cfs.getColumnFamily(filter);
@@ -309,22 +308,20 @@
         assertFalse(cfs.getColumnFamily(filter).isMarkedForDelete());
 
         // re-inserts with timestamp lower than delete
-        rm = new RowMutation(keyspaceName, key.key);
+        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 5; i++)
-        {
-            rm.add(cfName, ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
-        }
+            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
         rm.apply();
 
         // Check that the second insert went in
         cf = cfs.getColumnFamily(filter);
         assertEquals(10, cf.getColumnCount());
-        for (Column c : cf)
-            assert !c.isMarkedForDelete(System.currentTimeMillis());
+        for (Cell c : cf)
+            assert c.isLive();
     }
 
     @Test
-    public void testRowTombstoneObservedBeforePurging() throws InterruptedException, ExecutionException, IOException
+    public void testRowTombstoneObservedBeforePurging() throws InterruptedException, ExecutionException
     {
         String keyspace = "cql_keyspace";
         String table = "table1";
@@ -332,41 +329,41 @@
         cfs.disableAutoCompaction();
 
         // write a row out to one sstable
-        processInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
+        executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
                                       keyspace, table, 1, "foo", 1));
         cfs.forceBlockingFlush();
 
-        UntypedResultSet result = processInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        UntypedResultSet result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a second sstable
-        processInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
         cfs.forceBlockingFlush();
 
         // basic check that the row is considered deleted
         assertEquals(2, cfs.getSSTables().size());
-        result = processInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
 
         // compact the two sstables with a gcBefore that does *not* allow the row tombstone to be purged
-        Future future = CompactionManager.instance.submitMaximal(cfs, (int) (System.currentTimeMillis() / 1000) - 10000);
+        Future<?> future = CompactionManager.instance.submitMaximal(cfs, (int) (System.currentTimeMillis() / 1000) - 10000);
         future.get();
 
         // the data should be gone, but the tombstone should still exist
         assertEquals(1, cfs.getSSTables().size());
-        result = processInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
 
         // write a row out to one sstable
-        processInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
+        executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
                                       keyspace, table, 1, "foo", 1));
         cfs.forceBlockingFlush();
         assertEquals(2, cfs.getSSTables().size());
-        result = processInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a different sstable
-        processInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
         cfs.forceBlockingFlush();
 
         // compact the two sstables with a gcBefore that *does* allow the row tombstone to be purged
@@ -375,7 +372,7 @@
 
         // both the data and the tombstone should be gone this time
         assertEquals(0, cfs.getSSTables().size());
-        result = processInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
     }
 }
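Most of the CQL-side churn in this file is the processInternal to executeInternal rename; both return an UntypedResultSet. A small sketch of the call shape used above:

```java
// Sketch of the internal query helper as used by the purge tests above.
import org.apache.cassandra.cql3.UntypedResultSet;

import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;

final class InternalQuerySketch
{
    static int rowCount(String keyspace, String table, int k)
    {
        UntypedResultSet result =
            executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, k));
        return result.size();
    }
}
```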
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index 1879838..979b079 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
@@ -38,15 +38,15 @@
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.BytesToken;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableScanner;
 import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -72,6 +72,7 @@
         store.disableAutoCompaction();
 
         long timestamp = populate(KEYSPACE1, STANDARD1, 0, 9, 3); //ttl=3s
+
         store.forceBlockingFlush();
         assertEquals(1, store.getSSTables().size());
         long originalSize = store.getSSTables().iterator().next().uncompressedLength();
@@ -101,9 +102,9 @@
         for (int i = startRowKey; i <= endRowKey; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            RowMutation rm = new RowMutation(ks, key.key);
+            Mutation rm = new Mutation(ks, key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add(cf, ByteBufferUtil.bytes(Integer.toString(j)),
+                rm.add(cf, Util.cellname(Integer.toString(j)),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp,
                        j > 0 ? ttl : 0); // let first column never expire, since deleting all columns does not produce sstable
@@ -131,7 +132,7 @@
     }
 
     @Test
-    public void testSuperColumnTombstones() throws IOException, ExecutionException, InterruptedException
+    public void testSuperColumnTombstones() throws ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Super1");
@@ -141,15 +142,15 @@
         ByteBuffer scName = ByteBufferUtil.bytes("TestSuperColumn");
 
         // a subcolumn
-        RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-        rm.add("Super1", CompositeType.build(scName, ByteBufferUtil.bytes(0)),
+        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+        rm.add("Super1", Util.cellname(scName, ByteBufferUtil.bytes(0)),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                FBUtilities.timestampMicros());
         rm.apply();
         cfs.forceBlockingFlush();
 
         // shadow the subcolumn with a supercolumn tombstone
-        rm = new RowMutation(KEYSPACE1, key.key);
+        rm = new Mutation(KEYSPACE1, key.getKey());
         rm.deleteRange("Super1", SuperColumns.startOf(scName), SuperColumns.endOf(scName), FBUtilities.timestampMicros());
         rm.apply();
         cfs.forceBlockingFlush();
@@ -246,7 +247,7 @@
     }
 
     @Test
-    public void testEchoedRow() throws IOException, ExecutionException, InterruptedException
+    public void testEchoedRow()
     {
         // This test checks that EchoedRow doesn't skip rows: see CASSANDRA-2653
 
@@ -261,8 +262,8 @@
         for (int i=1; i < 5; i++)
         {
             DecoratedKey key = Util.dk(String.valueOf(i));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add("Standard2", ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add("Standard2", Util.cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
             rm.apply();
 
             if (i % 2 == 0)
@@ -276,8 +277,8 @@
         for (int i=1; i < 5; i++)
         {
             DecoratedKey key = Util.dk(String.valueOf(i));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add("Standard2", ByteBufferUtil.bytes(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add("Standard2", Util.cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
             rm.apply();
         }
         cfs.forceBlockingFlush();
@@ -300,7 +301,7 @@
     }
 
     @Test
-    public void testDontPurgeAccidentaly() throws IOException, ExecutionException, InterruptedException
+    public void testDontPurgeAccidentaly() throws InterruptedException
     {
         testDontPurgeAccidentaly("test1", "Super5");
 
@@ -321,8 +322,8 @@
         final int ROWS_PER_SSTABLE = 10;
         for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
             DecoratedKey key = Util.dk(String.valueOf(i));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add(cfname, ByteBufferUtil.bytes("col"),
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add(cfname, Util.cellname("col"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    System.currentTimeMillis());
             rm.apply();
@@ -351,91 +352,81 @@
     @Test
     public void testRangeTombstones() throws IOException, ExecutionException, InterruptedException
     {
-        boolean lazy = false;
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard2");
+        cfs.clearUnsafe();
 
-        do
+        // disable compaction while flushing
+        cfs.disableAutoCompaction();
+
+        final CFMetaData cfmeta = cfs.metadata;
+        Directories dir = cfs.directories;
+
+        ArrayList<DecoratedKey> keys = new ArrayList<DecoratedKey>();
+
+        for (int i=0; i < 4; i++)
         {
-            Keyspace keyspace = Keyspace.open(KEYSPACE1);
-            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard2");
-            cfs.clearUnsafe();
-
-            // disable compaction while flushing
-            cfs.disableAutoCompaction();
-
-            final CFMetaData cfmeta = cfs.metadata;
-            Directories dir = Directories.create(cfmeta.ksName, cfmeta.cfName);
-
-            ArrayList<DecoratedKey> keys = new ArrayList<DecoratedKey>();
-
-            for (int i=0; i < 4; i++)
-            {
-                keys.add(Util.dk(""+i));
-            }
-
-            ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfmeta);
-            cf.addColumn(Util.column("01", "a", 1)); // this must not resurrect
-            cf.addColumn(Util.column("a", "a", 3));
-            cf.deletionInfo().add(new RangeTombstone(ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("b"), 2, (int) (System.currentTimeMillis()/1000)),cfmeta.comparator);
-
-            SSTableWriter writer = new SSTableWriter(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()),
-                                                     0,
-                                                     cfs.metadata,
-                                                     StorageService.getPartitioner(),
-                                                     SSTableMetadata.createCollector(cfs.metadata.comparator));
-
-
-            writer.append(Util.dk("0"), cf);
-            writer.append(Util.dk("1"), cf);
-            writer.append(Util.dk("3"), cf);
-
-            cfs.addSSTable(writer.closeAndOpenReader());
-            writer = new SSTableWriter(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()),
-                                       0,
-                                       cfs.metadata,
-                                       StorageService.getPartitioner(),
-                                       SSTableMetadata.createCollector(cfs.metadata.comparator));
-
-            writer.append(Util.dk("0"), cf);
-            writer.append(Util.dk("1"), cf);
-            writer.append(Util.dk("2"), cf);
-            writer.append(Util.dk("3"), cf);
-            cfs.addSSTable(writer.closeAndOpenReader());
-
-            Collection<SSTableReader> toCompact = cfs.getSSTables();
-            assert toCompact.size() == 2;
-
-            // forcing lazy comapction
-            if (lazy)
-                DatabaseDescriptor.setInMemoryCompactionLimit(0);
-
-            // Force compaction on first sstables. Since each row is in only one sstable, we will be using EchoedRow.
-            Util.compact(cfs, toCompact);
-            assertEquals(1, cfs.getSSTables().size());
-
-            // Now assert we do have the 4 keys
-            assertEquals(4, Util.getRangeSlice(cfs).size());
-
-            ArrayList<DecoratedKey> k = new ArrayList<DecoratedKey>();
-            for (Row r : Util.getRangeSlice(cfs))
-            {
-                k.add(r.key);
-                assertEquals(ByteBufferUtil.bytes("a"),r.cf.getColumn(ByteBufferUtil.bytes("a")).value());
-                assertNull(r.cf.getColumn(ByteBufferUtil.bytes("01")));
-                assertEquals(3,r.cf.getColumn(ByteBufferUtil.bytes("a")).timestamp());
-            }
-
-            for (SSTableReader sstable : cfs.getSSTables())
-            {
-                SSTableMetadata stats = sstable.getSSTableMetadata();
-                assertEquals(ByteBufferUtil.bytes("0"), stats.minColumnNames.get(0));
-                assertEquals(ByteBufferUtil.bytes("b"), stats.maxColumnNames.get(0));
-            }
-
-            assertEquals(keys, k);
-
-            lazy=!lazy;
+            keys.add(Util.dk(""+i));
         }
-        while (lazy);
+
+        ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfmeta);
+        cf.addColumn(Util.column("01", "a", 1)); // this must not resurrect
+        cf.addColumn(Util.column("a", "a", 3));
+        cf.deletionInfo().add(new RangeTombstone(Util.cellname("0"), Util.cellname("b"), 2, (int) (System.currentTimeMillis()/1000)),cfmeta.comparator);
+
+        SSTableWriter writer = new SSTableWriter(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()),
+                                                 0,
+                                                 0,
+                                                 cfs.metadata,
+                                                 StorageService.getPartitioner(),
+                                                 new MetadataCollector(cfs.metadata.comparator));
+
+
+        writer.append(Util.dk("0"), cf);
+        writer.append(Util.dk("1"), cf);
+        writer.append(Util.dk("3"), cf);
+
+        cfs.addSSTable(writer.closeAndOpenReader());
+        writer = new SSTableWriter(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()),
+                                   0,
+                                   0,
+                                   cfs.metadata,
+                                   StorageService.getPartitioner(),
+                                   new MetadataCollector(cfs.metadata.comparator));
+
+        writer.append(Util.dk("0"), cf);
+        writer.append(Util.dk("1"), cf);
+        writer.append(Util.dk("2"), cf);
+        writer.append(Util.dk("3"), cf);
+        cfs.addSSTable(writer.closeAndOpenReader());
+
+        Collection<SSTableReader> toCompact = cfs.getSSTables();
+        assert toCompact.size() == 2;
+
+        // Force compaction on the first sstables. Since each row is in only one sstable, we will be using EchoedRow.
+        Util.compact(cfs, toCompact);
+        assertEquals(1, cfs.getSSTables().size());
+
+        // Now assert we do have the 4 keys
+        assertEquals(4, Util.getRangeSlice(cfs).size());
+
+        ArrayList<DecoratedKey> k = new ArrayList<DecoratedKey>();
+        for (Row r : Util.getRangeSlice(cfs))
+        {
+            k.add(r.key);
+            assertEquals(ByteBufferUtil.bytes("a"),r.cf.getColumn(Util.cellname("a")).value());
+            assertNull(r.cf.getColumn(Util.cellname("01")));
+            assertEquals(3,r.cf.getColumn(Util.cellname("a")).timestamp());
+        }
+
+        for (SSTableReader sstable : cfs.getSSTables())
+        {
+            StatsMetadata stats = sstable.getSSTableMetadata();
+            assertEquals(ByteBufferUtil.bytes("0"), stats.minColumnNames.get(0));
+            assertEquals(ByteBufferUtil.bytes("b"), stats.maxColumnNames.get(0));
+        }
+
+        assertEquals(keys, k);
     }
 
     @Test
@@ -467,7 +458,7 @@
         assert !compactionLogs.containsKey(Pair.create(KEYSPACE1, cf));
     }
 
-    private void testDontPurgeAccidentaly(String k, String cfname) throws IOException, ExecutionException, InterruptedException
+    private void testDontPurgeAccidentaly(String k, String cfname) throws InterruptedException
     {
         // This test catches the regression of CASSANDRA-2786
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
@@ -479,8 +470,8 @@
 
         // Add test row
         DecoratedKey key = Util.dk(k);
-        RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-        rm.add(cfname, CompositeType.build(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes("c")), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+        rm.add(cfname, Util.cellname(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes("c")), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
         rm.apply();
 
         cfs.forceBlockingFlush();
@@ -488,15 +479,15 @@
         Collection<SSTableReader> sstablesBefore = cfs.getSSTables();
 
         QueryFilter filter = QueryFilter.getIdentityFilter(key, cfname, System.currentTimeMillis());
-        assert !(cfs.getColumnFamily(filter).getColumnCount() == 0);
+        assertTrue(cfs.getColumnFamily(filter).hasColumns());
 
         // Remove key
-        rm = new RowMutation(KEYSPACE1, key.key);
+        rm = new Mutation(KEYSPACE1, key.getKey());
         rm.delete(cfname, 2);
         rm.apply();
 
         ColumnFamily cf = cfs.getColumnFamily(filter);
-        assert cf == null || cf.getColumnCount() == 0 : "should be empty: " + cf;
+        assertTrue("should be empty: " + cf, cf == null || !cf.hasColumns());
 
         // Sleep one second so that the removal is indeed purgeable even with gcgrace == 0
         Thread.sleep(1000);
@@ -512,7 +503,7 @@
         Util.compact(cfs, toCompact);
 
         cf = cfs.getColumnFamily(filter);
-        assert cf == null || cf.getColumnCount() == 0 : "should be empty: " + cf;
+        assertTrue("should be empty: " + cf, cf == null || !cf.hasColumns());
     }
 
     private static Range<Token> rangeFor(int start, int end)
@@ -533,13 +524,13 @@
     {
         long timestamp = System.currentTimeMillis();
         DecoratedKey decoratedKey = Util.dk(String.format("%03d", key));
-        RowMutation rm = new RowMutation(KEYSPACE1, decoratedKey.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
+        Mutation rm = new Mutation(KEYSPACE1, decoratedKey.getKey());
+        rm.add("Standard1", Util.cellname("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
         rm.apply();
     }
 
     @Test
-    public void testNeedsCleanup() throws IOException
+    public void testNeedsCleanup()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
diff --git a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
index 299e1af..ef189ba 100644
--- a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
@@ -30,7 +30,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -173,8 +173,8 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add(CF_STANDARD1, ByteBufferUtil.bytes("column"), value, r);
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add(CF_STANDARD1, Util.cellname("column"), value, r);
             rm.apply();
             cfs.forceBlockingFlush();
         }
@@ -217,8 +217,8 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(KEYSPACE1, key.key);
-            rm.add(CF_STANDARD1, ByteBufferUtil.bytes("column"), value, r);
+            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            rm.add(CF_STANDARD1, Util.cellname("column"), value, r);
             rm.apply();
             cfs.forceBlockingFlush();
         }
diff --git a/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTest.java b/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTest.java
deleted file mode 100644
index 7fd6c10..0000000
--- a/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTest.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package org.apache.cassandra.db.compaction;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.Pair;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.apache.cassandra.db.compaction.LegacyLeveledManifestTestHelper.*;
-
-public class LegacyLeveledManifestTest
-{
-    private final static String LEGACY_VERSION = "ic";
-
-    private File destDir;
-    @Before
-    public void setup()
-    {
-        destDir = Directories.create(KS, CF).getDirectoryForNewSSTables();
-        FileUtils.createDirectory(destDir);
-        for (File srcFile : getLegacySSTableDir(LEGACY_VERSION).listFiles())
-        {
-            File destFile = new File(destDir, srcFile.getName());
-            FileUtils.createHardLink(srcFile,destFile);
-            assert destFile.exists() : destFile.getAbsoluteFile();
-        }
-    }
-    @After
-    public void tearDown()
-    {
-        FileUtils.deleteRecursive(destDir);
-    }
-
-    @Test
-    public void migrateTest() throws IOException
-    {
-        assertTrue(LegacyLeveledManifest.manifestNeedsMigration(KS, CF));
-    }
-
-    @Test
-    public void doMigrationTest() throws IOException, InterruptedException
-    {
-        LegacyLeveledManifest.migrateManifests(KS, CF);
-
-        for (int i = 0; i <= 2; i++)
-        {
-            Descriptor descriptor = Descriptor.fromFilename(destDir+File.separator+KS+"-"+CF+"-"+LEGACY_VERSION+"-"+i+"-Statistics.db");
-            SSTableMetadata metadata = SSTableMetadata.serializer.deserialize(descriptor).left;
-            assertEquals(metadata.sstableLevel, i);
-        }
-    }
-
-    /**
-     * Validate that the rewritten stats file is the same as the original one.
-     * @throws IOException
-     */
-    @Test
-    public void validateSSTableMetadataTest() throws IOException
-    {
-        Map<Descriptor, Pair<SSTableMetadata, Set<Integer>>> beforeMigration = new HashMap<>();
-        for (int i = 0; i <= 2; i++)
-        {
-            Descriptor descriptor = Descriptor.fromFilename(destDir+File.separator+KS+"-"+CF+"-"+LEGACY_VERSION+"-"+i+"-Statistics.db");
-            beforeMigration.put(descriptor, SSTableMetadata.serializer.deserialize(descriptor, false));
-        }
-
-        LegacyLeveledManifest.migrateManifests(KS, CF);
-
-        for (Map.Entry<Descriptor, Pair<SSTableMetadata, Set<Integer>>> entry : beforeMigration.entrySet())
-        {
-            Pair<SSTableMetadata, Set<Integer>> newMetaPair = SSTableMetadata.serializer.deserialize(entry.getKey());
-            SSTableMetadata newMetadata = newMetaPair.left;
-            SSTableMetadata oldMetadata = entry.getValue().left;
-            assertEquals(newMetadata.estimatedRowSize, oldMetadata.estimatedRowSize);
-            assertEquals(newMetadata.estimatedColumnCount, oldMetadata.estimatedColumnCount);
-            assertEquals(newMetadata.replayPosition, oldMetadata.replayPosition);
-            assertEquals(newMetadata.minTimestamp, oldMetadata.minTimestamp);
-            assertEquals(newMetadata.maxTimestamp, oldMetadata.maxTimestamp);
-            assertEquals(newMetadata.compressionRatio, oldMetadata.compressionRatio, 0.01);
-            assertEquals(newMetadata.partitioner, oldMetadata.partitioner);
-            assertEquals(newMetadata.estimatedTombstoneDropTime, oldMetadata.estimatedTombstoneDropTime);
-            assertEquals(entry.getValue().right, newMetaPair.right);
-        }
-    }
-
-}
diff --git a/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTestHelper.java b/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTestHelper.java
deleted file mode 100644
index 4ee92fe..0000000
--- a/test/unit/org/apache/cassandra/db/compaction/LegacyLeveledManifestTestHelper.java
+++ /dev/null
@@ -1,78 +0,0 @@
-package org.apache.cassandra.db.compaction;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.io.sstable.SSTableUtils;
-import org.apache.cassandra.io.util.FileUtils;
-
-@Ignore
-public class LegacyLeveledManifestTestHelper extends SchemaLoader
-{
-    public final static String PROP = "migration-sstable-root";
-    public final static String KS = "Keyspace1";
-    public final static String CF = "legacyleveled";
-    /**
-     * Generates two sstables to be used to test migrating from a .json manifest to keeping the level in the sstable
-     * metadata.
-     *
-     * Do this:
-     * 1. remove @Ignore
-     * 2. comment out the @Before and @After methods above
-     * 3. run this method
-     * 4. checkout trunk
-     * 5. copy the .json file from the previous version to the current one
-     *    (ie; test/data/migration-sstables/ic/Keyspace1/legacyleveled/legacyleveled.json)
-     * 6. update LegacyLeveledManifestTest to use the new version.
-     */
-    @Test
-    public void generateSSTable() throws IOException
-    {
-        File legacySSTableDir = getLegacySSTableDir(Descriptor.Version.current_version);
-        FileUtils.createDirectory(legacySSTableDir);
-        Set<String> keys = new HashSet<String>();
-        for(int i = 0; i < 10; i++)
-        {
-            keys.add("key"+i);
-        }
-        for(int i = 0; i < 3; i++)
-        {
-            SSTableReader ssTable = SSTableUtils.prepare().ks(KS).cf(CF).dest(new Descriptor(legacySSTableDir, KS, CF, i, false)).write(keys);
-            System.out.println(ssTable);
-        }
-    }
-    public static File getLegacySSTableDir(String version)
-    {
-        return new File(System.getProperty(PROP) + File.separator + version + File.separator + KS + File.separator + CF);
-    }
-
-}
diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
index b60f6d9..65c7b69 100644
--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
@@ -20,50 +20,59 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
-import java.util.concurrent.ExecutionException;
 import java.util.UUID;
 
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.repair.RepairJobDesc;
 import org.apache.cassandra.repair.Validator;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class LeveledCompactionStrategyTest extends SchemaLoader
 {
+    private String ksname = "Keyspace1";
+    private String cfname = "StandardLeveled";
+    private Keyspace keyspace = Keyspace.open(ksname);
+    private ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+
+    @Before
+    public void enableCompaction()
+    {
+        cfs.enableAutoCompaction();
+    }
+
+    /**
+     * Since we use StandardLeveled CF for every test, we want to clean up after the test.
+     */
+    @After
+    public void truncateStandardLeveled()
+    {
+        cfs.truncateBlocking();
+    }
+
     /*
      * This exercises in particular the code of #4142
      */
     @Test
     public void testValidationMultipleSSTablePerLevel() throws Exception
     {
-        String ksname = "Keyspace1";
-        String cfname = "StandardLeveled";
-        Keyspace keyspace = Keyspace.open(ksname);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
-
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
         // Enough data to have a level 1 and 2
@@ -74,10 +83,10 @@
         for (int r = 0; r < rows; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
+            Mutation rm = new Mutation(ksname, key.getKey());
             for (int c = 0; c < columns; c++)
             {
-                rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
+                rm.add(cfname, Util.cellname("column" + c), value, 0);
             }
             rm.apply();
             cfs.forceBlockingFlush();
@@ -89,9 +98,11 @@
         assert strategy.getLevelSize(1) > 0;
         assert strategy.getLevelSize(2) > 0;
 
-        Range<Token> range = new Range<Token>(Util.token(""), Util.token(""));
+        Range<Token> range = new Range<>(Util.token(""), Util.token(""));
         int gcBefore = keyspace.getColumnFamilyStore(cfname).gcBefore(System.currentTimeMillis());
-        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), ksname, cfname, range);
+        UUID parentRepSession = UUID.randomUUID();
+        ActiveRepairService.instance.registerParentRepairSession(parentRepSession, Arrays.asList(cfs), Arrays.asList(range));
+        RepairJobDesc desc = new RepairJobDesc(parentRepSession, UUID.randomUUID(), ksname, cfname, range);
         Validator validator = new Validator(desc, FBUtilities.getBroadcastAddress(), gcBefore);
         CompactionManager.instance.submitValidation(cfs, validator).get();
     }
@@ -99,7 +110,7 @@
     /**
      * wait for leveled compaction to quiesce on the given columnfamily
      */
-    private void waitForLeveling(ColumnFamilyStore cfs) throws InterruptedException, ExecutionException
+    private void waitForLeveling(ColumnFamilyStore cfs) throws InterruptedException
     {
         LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategy();
         // L0 is the lowest priority, so when that's done, we know everything is done
@@ -110,11 +121,6 @@
     @Test
     public void testCompactionProgress() throws Exception
     {
-        String ksname = "Keyspace1";
-        String cfname = "StandardLeveled";
-        Keyspace keyspace = Keyspace.open(ksname);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
-
         // make sure we have SSTables in L1
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]);
         int rows = 2;
@@ -122,10 +128,10 @@
         for (int r = 0; r < rows; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
+            Mutation rm = new Mutation(ksname, key.getKey());
             for (int c = 0; c < columns; c++)
             {
-                rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
+                rm.add(cfname, Util.cellname("column" + c), value, 0);
             }
             rm.apply();
             cfs.forceBlockingFlush();
@@ -137,7 +143,7 @@
 
         // get LeveledScanner for level 1 sstables
         Collection<SSTableReader> sstables = strategy.manifest.getLevel(1);
-        List<ICompactionScanner> scanners = strategy.getScanners(sstables);
+        List<ICompactionScanner> scanners = strategy.getScanners(sstables).scanners;
         assertEquals(1, scanners.size()); // should be one per level
         ICompactionScanner scanner = scanners.get(0);
         // scan through to the end
@@ -145,17 +151,12 @@
             scanner.next();
 
         // scanner.getCurrentPosition should be equal to total bytes of L1 sstables
-        assert scanner.getCurrentPosition() == SSTable.getTotalBytes(sstables);
+        assert scanner.getCurrentPosition() == SSTableReader.getTotalBytes(sstables);
     }
 
     @Test
     public void testMutateLevel() throws Exception
     {
-        String ksname = "Keyspace1";
-        String cfname = "StandardLeveled";
-        Keyspace keyspace = Keyspace.open(ksname);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
-
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
         // Enough data to have a level 1 and 2
@@ -166,10 +167,10 @@
         for (int r = 0; r < rows; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
+            Mutation rm = new Mutation(ksname, key.getKey());
             for (int c = 0; c < columns; c++)
             {
-                rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
+                rm.add(cfname, Util.cellname("column" + c), value, 0);
             }
             rm.apply();
             cfs.forceBlockingFlush();
@@ -186,7 +187,7 @@
         {
             assertTrue(s.getSSTableLevel() != 6);
             strategy.manifest.remove(s);
-            LeveledManifest.mutateLevel(Pair.create(s.getSSTableMetadata(), s.getAncestors()), s.descriptor, s.descriptor.filenameFor(Component.STATS), 6);
+            s.descriptor.getMetadataSerializer().mutateLevel(s.descriptor, 6);
             s.reloadSSTableMetadata();
             strategy.manifest.add(s);
         }
@@ -198,4 +199,89 @@
         // verify that the manifest has correct amount of sstables
         assertEquals(cfs.getSSTables().size(), levels[6]);
     }
+
+    @Test
+    public void testNewRepairedSSTable() throws Exception
+    {
+        ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
+
+        // Enough data to have a level 1 and 2
+        int rows = 20;
+        int columns = 10;
+
+        // Adds enough data to trigger multiple sstables per level
+        for (int r = 0; r < rows; r++)
+        {
+            DecoratedKey key = Util.dk(String.valueOf(r));
+            Mutation rm = new Mutation(ksname, key.getKey());
+            for (int c = 0; c < columns; c++)
+            {
+                rm.add(cfname, Util.cellname("column" + c), value, 0);
+            }
+            rm.apply();
+            cfs.forceBlockingFlush();
+        }
+        waitForLeveling(cfs);
+        cfs.disableAutoCompaction();
+
+        while(CompactionManager.instance.isCompacting(Arrays.asList(cfs)))
+            Thread.sleep(100);
+
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategy();
+        assertTrue(strategy.getLevelSize(1) > 0);
+        assertTrue(strategy.getLevelSize(2) > 0);
+
+        for (SSTableReader sstable : cfs.getSSTables())
+        {
+            assertFalse(sstable.isRepaired());
+        }
+        int sstableCount = 0;
+        for (List<SSTableReader> level : strategy.manifest.generations)
+            sstableCount += level.size();
+
+        assertEquals(sstableCount, cfs.getSSTables().size());
+
+        assertFalse(strategy.manifest.hasRepairedData());
+        assertTrue(strategy.manifest.unrepairedL0.size() == 0);
+
+        SSTableReader sstable1 = strategy.manifest.generations[2].get(0);
+        SSTableReader sstable2 = strategy.manifest.generations[1].get(0);
+
+        // "repair" an sstable:
+        strategy.manifest.remove(sstable1);
+        sstable1.descriptor.getMetadataSerializer().mutateRepairedAt(sstable1.descriptor, System.currentTimeMillis());
+        sstable1.reloadSSTableMetadata();
+        assertTrue(sstable1.isRepaired());
+
+        // make sure adding a repaired sstable makes the manifest contain only repaired data;
+        strategy.manifest.add(sstable1);
+        assertTrue(strategy.manifest.hasRepairedData());
+        assertTrue(strategy.manifest.generations[2].contains(sstable1));
+        assertFalse(strategy.manifest.generations[1].contains(sstable2));
+        assertTrue(strategy.manifest.unrepairedL0.contains(sstable2));
+        sstableCount = 0;
+        for (int i = 0; i < strategy.manifest.generations.length; i++)
+        {
+            sstableCount += strategy.manifest.generations[i].size();
+            if (i != 2)
+                assertEquals(strategy.manifest.generations[i].size(), 0);
+            else
+                assertEquals(strategy.manifest.generations[i].size(), 1);
+        }
+        assertEquals(1, sstableCount);
+
+        // make sure adding an unrepaired sstable puts it in unrepairedL0:
+        strategy.manifest.remove(sstable2);
+        strategy.manifest.add(sstable2);
+        assertTrue(strategy.manifest.unrepairedL0.contains(sstable2));
+        assertEquals(strategy.manifest.unrepairedL0.size(), cfs.getSSTables().size() - 1);
+
+        // make sure repairing an sstable takes it away from unrepairedL0 and puts it in the correct level:
+        strategy.manifest.remove(sstable2);
+        sstable2.descriptor.getMetadataSerializer().mutateRepairedAt(sstable2.descriptor, System.currentTimeMillis());
+        sstable2.reloadSSTableMetadata();
+        strategy.manifest.add(sstable2);
+        assertFalse(strategy.manifest.unrepairedL0.contains(sstable2));
+        assertTrue(strategy.manifest.generations[1].contains(sstable2));
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
index 444e30c..bc3fe35 100644
--- a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
@@ -18,7 +18,6 @@
 */
 package org.apache.cassandra.db.compaction;
 
-import java.io.IOException;
 import java.util.concurrent.ExecutionException;
 import java.util.Set;
 import java.util.HashSet;
@@ -28,10 +27,8 @@
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.ColumnFamilyStore;
+
+import org.apache.cassandra.db.*;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -39,7 +36,7 @@
 
 public class OneCompactionTest extends SchemaLoader
 {
-    private void testCompaction(String columnFamilyName, int insertsPerTable) throws IOException, ExecutionException, InterruptedException
+    private void testCompaction(String columnFamilyName, int insertsPerTable) throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -49,8 +46,8 @@
         Set<DecoratedKey> inserted = new HashSet<DecoratedKey>();
         for (int j = 0; j < insertsPerTable; j++) {
             DecoratedKey key = Util.dk(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key.key);
-            rm.add(columnFamilyName, ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key.getKey());
+            rm.add(columnFamilyName, Util.cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
             inserted.add(key);
             store.forceBlockingFlush();
@@ -61,13 +58,13 @@
     }
 
     @Test
-    public void testCompaction1() throws IOException, ExecutionException, InterruptedException
+    public void testCompaction1() throws ExecutionException, InterruptedException
     {
         testCompaction("Standard1", 1);
     }
 
     @Test
-    public void testCompaction2() throws IOException, ExecutionException, InterruptedException
+    public void testCompaction2() throws ExecutionException, InterruptedException
     {
         testCompaction("Standard2", 2);
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
index 6bfa4e8..6132dad 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
@@ -24,14 +24,10 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.metrics.RestorableMeter;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.getBuckets;
@@ -164,8 +160,8 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
-            rm.add(cfname, ByteBufferUtil.bytes("column"), value, 0);
+            Mutation rm = new Mutation(ksname, key.getKey());
+            rm.add(cfname, Util.cellname("column"), value, 0);
             rm.apply();
             cfs.forceBlockingFlush();
         }
@@ -208,8 +204,8 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            RowMutation rm = new RowMutation(ksname, key.key);
-            rm.add(cfname, ByteBufferUtil.bytes("column"), value, 0);
+            Mutation rm = new Mutation(ksname, key.getKey());
+            rm.add(cfname, Util.cellname("column"), value, 0);
             rm.apply();
             cfs.forceBlockingFlush();
         }
@@ -259,4 +255,4 @@
         filtered = filterColdSSTables(sstrs, 1.0);
         assertTrue(filtered.isEmpty());
     }
-}
\ No newline at end of file
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
index 7666922..b98af68 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
@@ -20,24 +20,14 @@
  * 
  */
 
-
-
-import java.util.concurrent.ExecutionException;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DataRange;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableScanner;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -48,18 +38,18 @@
 public class TTLExpiryTest extends SchemaLoader
 {
     @Test
-    public void testSimpleExpire() throws ExecutionException, InterruptedException
+    public void testSimpleExpire() throws InterruptedException
     {
         ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
         cfs.disableAutoCompaction();
         cfs.metadata.gcGraceSeconds(0);
         long timestamp = System.currentTimeMillis();
-        RowMutation rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col"),
+        Mutation rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+        rm.add("Standard1", Util.cellname("col"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                1);
-        rm.add("Standard1", ByteBufferUtil.bytes("col7"),
+        rm.add("Standard1", Util.cellname("col7"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                1);
@@ -67,22 +57,22 @@
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-                rm.add("Standard1", ByteBufferUtil.bytes("col2"),
+        rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+                rm.add("Standard1", Util.cellname("col2"),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp,
                        1);
                 rm.apply();
         cfs.forceBlockingFlush();
-        rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col3"),
+        rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+        rm.add("Standard1", Util.cellname("col3"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    timestamp,
                    1);
         rm.apply();
         cfs.forceBlockingFlush();
-        rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col311"),
+        rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+        rm.add("Standard1", Util.cellname("col311"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    timestamp,
                    1);
@@ -96,18 +86,18 @@
     }
 
     @Test
-    public void testNoExpire() throws ExecutionException, InterruptedException
+    public void testNoExpire() throws InterruptedException
     {
         ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
         cfs.disableAutoCompaction();
         cfs.metadata.gcGraceSeconds(0);
         long timestamp = System.currentTimeMillis();
-        RowMutation rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col"),
+        Mutation rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+        rm.add("Standard1", Util.cellname("col"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                1);
-        rm.add("Standard1", ByteBufferUtil.bytes("col7"),
+        rm.add("Standard1", Util.cellname("col7"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                1);
@@ -115,23 +105,23 @@
         rm.apply();
         cfs.forceBlockingFlush();
 
-        rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-                rm.add("Standard1", ByteBufferUtil.bytes("col2"),
+        rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+                rm.add("Standard1", Util.cellname("col2"),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp,
                        1);
                 rm.apply();
         cfs.forceBlockingFlush();
-        rm = new RowMutation("Keyspace1", Util.dk("ttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col3"),
+        rm = new Mutation("Keyspace1", Util.dk("ttl").getKey());
+        rm.add("Standard1", Util.cellname("col3"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    timestamp,
                    1);
         rm.apply();
         cfs.forceBlockingFlush();
         DecoratedKey noTTLKey = Util.dk("nottl");
-        rm = new RowMutation("Keyspace1", noTTLKey.key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col311"),
+        rm = new Mutation("Keyspace1", noTTLKey.getKey());
+        rm.add("Standard1", Util.cellname("col311"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    timestamp);
         rm.apply();
diff --git a/test/unit/org/apache/cassandra/db/composites/CTypeTest.java b/test/unit/org/apache/cassandra/db/composites/CTypeTest.java
new file mode 100644
index 0000000..496a2dc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/composites/CTypeTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.composites;
+
+import com.google.common.collect.Lists;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.junit.Test;
+
+import java.util.List;
+
+public class CTypeTest
+{
+    static final List<AbstractType<?>> types = Lists.newArrayList();
+    static
+    {
+        types.add(UTF8Type.instance);
+        types.add(UUIDType.instance);
+        types.add(Int32Type.instance);
+    }
+
+    static final CellNameType cdtype = new CompoundDenseCellNameType(types);
+    static final CellNameType stype1 = new SimpleDenseCellNameType(BytesType.instance);
+    static final CellNameType stype2 = new SimpleDenseCellNameType(UUIDType.instance);
+
+    @Test
+    public void testCompoundType()
+    {
+        Composite a1 = cdtype.makeCellName("a",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 1);
+        Composite a2 = cdtype.makeCellName("a",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 100);
+        Composite b1 = cdtype.makeCellName("a",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 1);
+        Composite b2 = cdtype.makeCellName("a",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
+        Composite c1 = cdtype.makeCellName("z",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 1);
+        Composite c2 = cdtype.makeCellName("z",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 100);
+        Composite d1 = cdtype.makeCellName("z",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 1);
+        Composite d2 = cdtype.makeCellName("z",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
+
+        Composite z1 = cdtype.makeCellName(ByteBufferUtil.EMPTY_BYTE_BUFFER,UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
+
+        assert cdtype.compare(a1,a2) < 0;
+        assert cdtype.compare(a2,b1) < 0;
+        assert cdtype.compare(b1,b2) < 0;
+        assert cdtype.compare(b2,c1) < 0;
+        assert cdtype.compare(c1,c2) < 0;
+        assert cdtype.compare(c2,d1) < 0;
+        assert cdtype.compare(d1,d2) < 0;
+
+        assert cdtype.compare(a2,a1) > 0;
+        assert cdtype.compare(b1,a2) > 0;
+        assert cdtype.compare(b2,b1) > 0;
+        assert cdtype.compare(c1,b2) > 0;
+        assert cdtype.compare(c2,c1) > 0;
+        assert cdtype.compare(d1,c2) > 0;
+        assert cdtype.compare(d2,d1) > 0;
+
+        assert cdtype.compare(z1,a1) < 0;
+        assert cdtype.compare(z1,a2) < 0;
+        assert cdtype.compare(z1,b1) < 0;
+        assert cdtype.compare(z1,b2) < 0;
+        assert cdtype.compare(z1,c1) < 0;
+        assert cdtype.compare(z1,c2) < 0;
+        assert cdtype.compare(z1,d1) < 0;
+        assert cdtype.compare(z1,d2) < 0;
+
+        assert cdtype.compare(a1,a1) == 0;
+        assert cdtype.compare(a2,a2) == 0;
+        assert cdtype.compare(b1,b1) == 0;
+        assert cdtype.compare(b2,b2) == 0;
+        assert cdtype.compare(c1,c1) == 0;
+        assert cdtype.compare(c2,c2) == 0;
+        assert cdtype.compare(z1,z1) == 0;
+    }
+
+    @Test
+    public void testSimpleType2()
+    {
+        CellName a = stype2.makeCellName(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"));
+        CellName z = stype2.makeCellName(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"));
+
+        assert stype2.compare(a,z) < 0;
+        assert stype2.compare(z,a) > 0;
+        assert stype2.compare(a,a) == 0;
+        assert stype2.compare(z,z) == 0;
+    }
+
+
+    @Test
+    public void testSimpleType1()
+    {
+        CellName a = stype1.makeCellName(ByteBufferUtil.bytes("a"));
+        CellName z = stype1.makeCellName(ByteBufferUtil.bytes("z"));
+
+        assert stype1.compare(a,z) < 0;
+        assert stype1.compare(z,a) > 0;
+        assert stype1.compare(a,a) == 0;
+        assert stype1.compare(z,z) == 0;
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/db/context/CounterContextTest.java b/test/unit/org/apache/cassandra/db/context/CounterContextTest.java
index 5c88fd6..a72d30d 100644
--- a/test/unit/org/apache/cassandra/db/context/CounterContextTest.java
+++ b/test/unit/org/apache/cassandra/db/context/CounterContextTest.java
@@ -20,17 +20,22 @@
  */
 package org.apache.cassandra.db.context;
 
-import static org.junit.Assert.*;
-
 import java.nio.ByteBuffer;
 
 import org.junit.Test;
 
-import org.apache.cassandra.db.context.IContext.ContextRelationship;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.db.ClockAndCount;
+import org.apache.cassandra.db.context.CounterContext.Relationship;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.CounterId;
 
 import static org.apache.cassandra.db.context.CounterContext.ContextState;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
 
 public class CounterContextTest
 {
@@ -43,239 +48,213 @@
     private static final int countLength = 8;
     private static final int stepLength = idLength + clockLength + countLength;
 
-    /** Allocates 1 byte from a new SlabAllocator and returns it. */
-    private Allocator bumpedSlab()
-    {
-        SlabAllocator allocator = new SlabAllocator();
-        allocator.allocate(1);
-        return allocator;
-    }
-
     @Test
     public void testAllocate()
     {
-        runAllocate(HeapAllocator.instance);
-        runAllocate(bumpedSlab());
-    }
-
-    private void runAllocate(Allocator allocator)
-    {
-        ContextState allGlobal = ContextState.allocate(3, 0, 0, allocator);
+        ContextState allGlobal = ContextState.allocate(3, 0, 0);
         assertEquals(headerSizeLength + 3 * headerEltLength + 3 * stepLength, allGlobal.context.remaining());
 
-        ContextState allLocal = ContextState.allocate(0, 3, 0, allocator);
+        ContextState allLocal = ContextState.allocate(0, 3, 0);
         assertEquals(headerSizeLength + 3 * headerEltLength + 3 * stepLength, allLocal.context.remaining());
 
-        ContextState allRemote = ContextState.allocate(0, 0, 3, allocator);
+        ContextState allRemote = ContextState.allocate(0, 0, 3);
         assertEquals(headerSizeLength + 3 * stepLength, allRemote.context.remaining());
 
-        ContextState mixed = ContextState.allocate(1, 1, 1, allocator);
+        ContextState mixed = ContextState.allocate(1, 1, 1);
         assertEquals(headerSizeLength + 2 * headerEltLength + 3 * stepLength, mixed.context.remaining());
     }
 
     @Test
     public void testDiff()
     {
-        runDiff(HeapAllocator.instance);
-        runDiff(bumpedSlab());
-    }
-
-    private void runDiff(Allocator allocator)
-    {
         ContextState left;
         ContextState right;
 
         // equality: equal nodes, all counts same
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 3L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
         right = ContextState.wrap(ByteBufferUtil.clone(left.context));
 
-        assertEquals(ContextRelationship.EQUAL, cc.diff(left.context, right.context));
+        assertEquals(Relationship.EQUAL, cc.diff(left.context, right.context));
 
         // greater than: left has superset of nodes (counts equal)
-        left = ContextState.allocate(0, 0, 4, allocator);
+        left = ContextState.allocate(0, 0, 4);
         left.writeRemote(CounterId.fromInt(3),  3L, 0L);
         left.writeRemote(CounterId.fromInt(6),  2L, 0L);
         left.writeRemote(CounterId.fromInt(9),  1L, 0L);
         left.writeRemote(CounterId.fromInt(12), 0L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 3L, 0L);
         right.writeRemote(CounterId.fromInt(6), 2L, 0L);
         right.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        assertEquals(ContextRelationship.GREATER_THAN, cc.diff(left.context, right.context));
+        assertEquals(Relationship.GREATER_THAN, cc.diff(left.context, right.context));
 
         // less than: left has subset of nodes (counts equal)
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 3L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 4, allocator);
+        right = ContextState.allocate(0, 0, 4);
         right.writeRemote(CounterId.fromInt(3),  3L, 0L);
         right.writeRemote(CounterId.fromInt(6),  2L, 0L);
         right.writeRemote(CounterId.fromInt(9),  1L, 0L);
         right.writeRemote(CounterId.fromInt(12), 0L, 0L);
 
-        assertEquals(ContextRelationship.LESS_THAN, cc.diff(left.context, right.context));
+        assertEquals(Relationship.LESS_THAN, cc.diff(left.context, right.context));
 
         // greater than: equal nodes, but left has higher counts
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 3L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 3L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 3L, 0L);
         right.writeRemote(CounterId.fromInt(6), 2L, 0L);
         right.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        assertEquals(ContextRelationship.GREATER_THAN, cc.diff(left.context, right.context));
+        assertEquals(Relationship.GREATER_THAN, cc.diff(left.context, right.context));
 
         // less than: equal nodes, but right has higher counts
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 3L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 3L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 3L, 0L);
         right.writeRemote(CounterId.fromInt(6), 9L, 0L);
         right.writeRemote(CounterId.fromInt(9), 3L, 0L);
 
-        assertEquals(ContextRelationship.LESS_THAN, cc.diff(left.context, right.context));
+        assertEquals(Relationship.LESS_THAN, cc.diff(left.context, right.context));
 
         // disjoint: right and left have disjoint node sets
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 1L, 0L);
         left.writeRemote(CounterId.fromInt(4), 1L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 1L, 0L);
         right.writeRemote(CounterId.fromInt(6), 1L, 0L);
         right.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 1L, 0L);
         left.writeRemote(CounterId.fromInt(4), 1L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(2),  1L, 0L);
         right.writeRemote(CounterId.fromInt(6),  1L, 0L);
         right.writeRemote(CounterId.fromInt(12), 1L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
         // disjoint: equal nodes, but right and left have higher counts in differing nodes
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 1L, 0L);
         left.writeRemote(CounterId.fromInt(6), 3L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 1L, 0L);
         right.writeRemote(CounterId.fromInt(6), 1L, 0L);
         right.writeRemote(CounterId.fromInt(9), 5L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 2L, 0L);
         left.writeRemote(CounterId.fromInt(6), 3L, 0L);
         left.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 1L, 0L);
         right.writeRemote(CounterId.fromInt(6), 9L, 0L);
         right.writeRemote(CounterId.fromInt(9), 5L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
         // disjoint: left has more nodes, but lower counts
-        left = ContextState.allocate(0, 0, 4, allocator);
+        left = ContextState.allocate(0, 0, 4);
         left.writeRemote(CounterId.fromInt(3),  2L, 0L);
         left.writeRemote(CounterId.fromInt(6),  3L, 0L);
         left.writeRemote(CounterId.fromInt(9),  1L, 0L);
         left.writeRemote(CounterId.fromInt(12), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 4L, 0L);
         right.writeRemote(CounterId.fromInt(6), 9L, 0L);
         right.writeRemote(CounterId.fromInt(9), 5L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
         // disjoint: left has less nodes, but higher counts
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 5L, 0L);
         left.writeRemote(CounterId.fromInt(6), 3L, 0L);
         left.writeRemote(CounterId.fromInt(9), 2L, 0L);
 
-        right = ContextState.allocate(0, 0, 4, allocator);
+        right = ContextState.allocate(0, 0, 4);
         right.writeRemote(CounterId.fromInt(3),  4L, 0L);
         right.writeRemote(CounterId.fromInt(6),  3L, 0L);
         right.writeRemote(CounterId.fromInt(9),  2L, 0L);
         right.writeRemote(CounterId.fromInt(12), 1L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
         // disjoint: mixed nodes and counts
-        left = ContextState.allocate(0, 0, 3, allocator);
+        left = ContextState.allocate(0, 0, 3);
         left.writeRemote(CounterId.fromInt(3), 5L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 2L, 0L);
 
-        right = ContextState.allocate(0, 0, 4, allocator);
+        right = ContextState.allocate(0, 0, 4);
         right.writeRemote(CounterId.fromInt(3),  4L, 0L);
         right.writeRemote(CounterId.fromInt(6),  3L, 0L);
         right.writeRemote(CounterId.fromInt(9),  2L, 0L);
         right.writeRemote(CounterId.fromInt(12), 1L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
 
-        left = ContextState.allocate(0, 0, 4, allocator);
+        left = ContextState.allocate(0, 0, 4);
         left.writeRemote(CounterId.fromInt(3), 5L, 0L);
         left.writeRemote(CounterId.fromInt(6), 2L, 0L);
         left.writeRemote(CounterId.fromInt(7), 2L, 0L);
         left.writeRemote(CounterId.fromInt(9), 2L, 0L);
 
-        right = ContextState.allocate(0, 0, 3, allocator);
+        right = ContextState.allocate(0, 0, 3);
         right.writeRemote(CounterId.fromInt(3), 4L, 0L);
         right.writeRemote(CounterId.fromInt(6), 3L, 0L);
         right.writeRemote(CounterId.fromInt(9), 2L, 0L);
 
-        assertEquals(ContextRelationship.DISJOINT, cc.diff(left.context, right.context));
+        assertEquals(Relationship.DISJOINT, cc.diff(left.context, right.context));
     }
 
     @Test
     public void testMerge()
     {
-        runMerge(HeapAllocator.instance);
-        runMerge(bumpedSlab());
-    }
-
-    private void runMerge(Allocator allocator)
-    {
         // note: local counts aggregated; remote counts are reconciled (i.e. take max)
-        ContextState left = ContextState.allocate(0, 1, 3, allocator);
+        ContextState left = ContextState.allocate(0, 1, 3);
         left.writeRemote(CounterId.fromInt(1), 1L, 1L);
         left.writeRemote(CounterId.fromInt(2), 2L, 2L);
         left.writeRemote(CounterId.fromInt(4), 6L, 3L);
         left.writeLocal(CounterId.getLocalId(), 7L, 3L);
 
-        ContextState right = ContextState.allocate(0, 1, 2, allocator);
+        ContextState right = ContextState.allocate(0, 1, 2);
         right.writeRemote(CounterId.fromInt(4), 4L, 4L);
         right.writeRemote(CounterId.fromInt(5), 5L, 5L);
         right.writeLocal(CounterId.getLocalId(), 2L, 9L);
 
-        ByteBuffer merged = cc.merge(left.context, right.context, allocator);
+        ByteBuffer merged = cc.merge(left.context, right.context);
         int hd = 4;
 
         assertEquals(hd + 5 * stepLength, merged.remaining());
@@ -304,17 +283,17 @@
         //
         // Test merging two exclusively global contexts
         //
-        left = ContextState.allocate(3, 0, 0, allocator);
+        left = ContextState.allocate(3, 0, 0);
         left.writeGlobal(CounterId.fromInt(1), 1L, 1L);
         left.writeGlobal(CounterId.fromInt(2), 2L, 2L);
         left.writeGlobal(CounterId.fromInt(3), 3L, 3L);
 
-        right = ContextState.allocate(3, 0, 0, allocator);
+        right = ContextState.allocate(3, 0, 0);
         right.writeGlobal(CounterId.fromInt(3), 6L, 6L);
         right.writeGlobal(CounterId.fromInt(4), 4L, 4L);
         right.writeGlobal(CounterId.fromInt(5), 5L, 5L);
 
-        merged = cc.merge(left.context, right.context, allocator);
+        merged = cc.merge(left.context, right.context);
         assertEquals(headerSizeLength + 5 * headerEltLength + 5 * stepLength, merged.remaining());
         assertEquals(18L, cc.total(merged));
         assertEquals(5, merged.getShort(merged.position()));
@@ -340,13 +319,13 @@
         //
         // Test merging two global contexts w/ 'invalid shards'
         //
-        left = ContextState.allocate(1, 0, 0, allocator);
+        left = ContextState.allocate(1, 0, 0);
         left.writeGlobal(CounterId.fromInt(1), 10L, 20L);
 
-        right = ContextState.allocate(1, 0, 0, allocator);
+        right = ContextState.allocate(1, 0, 0);
         right.writeGlobal(CounterId.fromInt(1), 10L, 30L);
 
-        merged = cc.merge(left.context, right.context, allocator);
+        merged = cc.merge(left.context, right.context);
         headerLength = headerSizeLength + headerEltLength;
         assertEquals(headerLength + stepLength, merged.remaining());
         assertEquals(30L, cc.total(merged));
@@ -359,16 +338,16 @@
         //
         // Test merging global w/ mixed contexts
         //
-        left = ContextState.allocate(2, 0, 0, allocator);
+        left = ContextState.allocate(2, 0, 0);
         left.writeGlobal(CounterId.fromInt(1), 1L, 1L);
         left.writeGlobal(CounterId.fromInt(2), 1L, 1L);
 
-        right = ContextState.allocate(0, 1, 1, allocator);
+        right = ContextState.allocate(0, 1, 1);
         right.writeLocal(CounterId.fromInt(1), 100L, 100L);
         right.writeRemote(CounterId.fromInt(2), 100L, 100L);
 
         // global shards should dominate local/remote, even with lower clock and value
-        merged = cc.merge(left.context, right.context, allocator);
+        merged = cc.merge(left.context, right.context);
         headerLength = headerSizeLength + 2 * headerEltLength;
         assertEquals(headerLength + 2 * stepLength, merged.remaining());
         assertEquals(2L, cc.total(merged));
@@ -384,13 +363,7 @@
     @Test
     public void testTotal()
     {
-        runTotal(HeapAllocator.instance);
-        runTotal(bumpedSlab());
-    }
-
-    private void runTotal(Allocator allocator)
-    {
-        ContextState mixed = ContextState.allocate(0, 1, 4, allocator);
+        ContextState mixed = ContextState.allocate(0, 1, 4);
         mixed.writeRemote(CounterId.fromInt(1), 1L, 1L);
         mixed.writeRemote(CounterId.fromInt(2), 2L, 2L);
         mixed.writeRemote(CounterId.fromInt(4), 4L, 4L);
@@ -398,7 +371,7 @@
         mixed.writeLocal(CounterId.getLocalId(), 12L, 12L);
         assertEquals(24L, cc.total(mixed.context));
 
-        ContextState global = ContextState.allocate(3, 0, 0, allocator);
+        ContextState global = ContextState.allocate(3, 0, 0);
         global.writeGlobal(CounterId.fromInt(1), 1L, 1L);
         global.writeGlobal(CounterId.fromInt(2), 2L, 2L);
         global.writeGlobal(CounterId.fromInt(3), 3L, 3L);
@@ -411,10 +384,9 @@
         ContextState state;
         ByteBuffer marked;
         ByteBuffer cleared;
-        Allocator allocator = HeapAllocator.instance;
 
         // mark/clear for remote-only contexts is a no-op
-        state = ContextState.allocate(0, 0, 1, allocator);
+        state = ContextState.allocate(0, 0, 1);
         state.writeRemote(CounterId.fromInt(1), 1L, 1L);
 
         assertFalse(cc.shouldClearLocal(state.context));
@@ -426,7 +398,7 @@
         assertSame(cleared, marked); // shouldn't alter anything either
 
         // a single local shard
-        state = ContextState.allocate(0, 1, 0, allocator);
+        state = ContextState.allocate(0, 1, 0);
         state.writeLocal(CounterId.fromInt(1), 1L, 1L);
 
         assertFalse(cc.shouldClearLocal(state.context));
@@ -440,7 +412,7 @@
         assertEquals(0, cleared.getShort(cleared.position()));
 
         // 2 global + 1 local shard
-        state = ContextState.allocate(2, 1, 0, allocator);
+        state = ContextState.allocate(2, 1, 0);
         state.writeLocal(CounterId.fromInt(1), 1L, 1L);
         state.writeGlobal(CounterId.fromInt(2), 2L, 2L);
         state.writeGlobal(CounterId.fromInt(3), 3L, 3L);
@@ -488,7 +460,7 @@
         assertEquals(3L, cleared.getLong(cleared.position() + headerLength + 2 * stepLength + idLength + clockLength));
 
         // a single global shard - no-op
-        state = ContextState.allocate(1, 0, 0, allocator);
+        state = ContextState.allocate(1, 0, 0);
         state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
 
         assertFalse(cc.shouldClearLocal(state.context));
@@ -499,4 +471,67 @@
         cleared = cc.clearAllLocal(marked);
         assertSame(cleared, marked);
     }
+
+    @Test
+    public void testFindPositionOf()
+    {
+        ContextState state = ContextState.allocate(3, 3, 3);
+
+        state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
+        state.writeRemote(CounterId.fromInt(2), 2L, 2L);
+        state.writeLocal( CounterId.fromInt(3), 3L, 3L);
+        state.writeGlobal(CounterId.fromInt(4), 4L, 4L);
+        state.writeRemote(CounterId.fromInt(5), 5L, 5L);
+        state.writeLocal( CounterId.fromInt(6), 6L, 6L);
+        state.writeGlobal(CounterId.fromInt(7), 7L, 7L);
+        state.writeRemote(CounterId.fromInt(8), 8L, 8L);
+        state.writeLocal(CounterId.fromInt(9), 9L, 9L);
+
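+        // 3 global + 3 local shards account for the 6 header elements (the 3 remote shards add none);
+        // shard steps start immediately after the header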
+        int headerLength = headerSizeLength + 6 * headerEltLength;
+        assertEquals(headerLength, cc.findPositionOf(state.context, CounterId.fromInt(1)));
+        assertEquals(headerLength + stepLength, cc.findPositionOf(state.context, CounterId.fromInt(2)));
+        assertEquals(headerLength + 2 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(3)));
+        assertEquals(headerLength + 3 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(4)));
+        assertEquals(headerLength + 4 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(5)));
+        assertEquals(headerLength + 5 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(6)));
+        assertEquals(headerLength + 6 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(7)));
+        assertEquals(headerLength + 7 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(8)));
+        assertEquals(headerLength + 8 * stepLength, cc.findPositionOf(state.context, CounterId.fromInt(9)));
+
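+        // counter ids that were never written are reported at position -1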
+        assertEquals(-1, cc.findPositionOf(state.context, CounterId.fromInt(0)));
+        assertEquals(-1, cc.findPositionOf(state.context, CounterId.fromInt(10)));
+        assertEquals(-1, cc.findPositionOf(state.context, CounterId.fromInt(15)));
+        assertEquals(-1, cc.findPositionOf(state.context, CounterId.fromInt(20)));
+    }
+
+    @Test
+    public void testGetClockAndCountOf()
+    {
+        ContextState state = ContextState.allocate(3, 3, 3);
+
+        state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
+        state.writeRemote(CounterId.fromInt(2), 2L, 2L);
+        state.writeLocal( CounterId.fromInt(3), 3L, 3L);
+        state.writeGlobal(CounterId.fromInt(4), 4L, 4L);
+        state.writeRemote(CounterId.fromInt(5), 5L, 5L);
+        state.writeLocal( CounterId.fromInt(6), 6L, 6L);
+        state.writeGlobal(CounterId.fromInt(7), 7L, 7L);
+        state.writeRemote(CounterId.fromInt(8), 8L, 8L);
+        state.writeLocal(CounterId.fromInt(9), 9L, 9L);
+
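+        // every shard should report exactly the clock and count it was written with, whatever its type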
+        assertEquals(ClockAndCount.create(1L, 1L), cc.getClockAndCountOf(state.context, CounterId.fromInt(1)));
+        assertEquals(ClockAndCount.create(2L, 2L), cc.getClockAndCountOf(state.context, CounterId.fromInt(2)));
+        assertEquals(ClockAndCount.create(3L, 3L), cc.getClockAndCountOf(state.context, CounterId.fromInt(3)));
+        assertEquals(ClockAndCount.create(4L, 4L), cc.getClockAndCountOf(state.context, CounterId.fromInt(4)));
+        assertEquals(ClockAndCount.create(5L, 5L), cc.getClockAndCountOf(state.context, CounterId.fromInt(5)));
+        assertEquals(ClockAndCount.create(6L, 6L), cc.getClockAndCountOf(state.context, CounterId.fromInt(6)));
+        assertEquals(ClockAndCount.create(7L, 7L), cc.getClockAndCountOf(state.context, CounterId.fromInt(7)));
+        assertEquals(ClockAndCount.create(8L, 8L), cc.getClockAndCountOf(state.context, CounterId.fromInt(8)));
+        assertEquals(ClockAndCount.create(9L, 9L), cc.getClockAndCountOf(state.context, CounterId.fromInt(9)));
+
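+        // counter ids that were never written resolve to the default (0, 0) clock-and-count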
+        assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(0)));
+        assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(10)));
+        assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(15)));
+        assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(20)));
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java b/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java
new file mode 100644
index 0000000..f1be21c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java
@@ -0,0 +1,481 @@
+/*
+ * * Licensed to the Apache Software Foundation (ASF) under one
+ * * or more contributor license agreements.  See the NOTICE file
+ * * distributed with this work for additional information
+ * * regarding copyright ownership.  The ASF licenses this file
+ * * to you under the Apache License, Version 2.0 (the
+ * * "License"); you may not use this file except in compliance
+ * * with the License.  You may obtain a copy of the License at
+ * *
+ * *    http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing,
+ * * software distributed under the License is distributed on an
+ * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * * KIND, either express or implied.  See the License for the
+ * * specific language governing permissions and limitations
+ * * under the License.
+ * */
+package org.apache.cassandra.db.filter;
+
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.*;
+
+public class ColumnSliceTest
+{
+    private static final CellNameType simpleIntType = new SimpleDenseCellNameType(Int32Type.instance);
+
+    @Test
+    public void testIntersectsSingleSlice()
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
+
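+        // the columnNames() pairs below stand in for an sstable's per-component min/max column name metadata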
+        // filter falls entirely before sstable
+        ColumnSlice slice = new ColumnSlice(composite(0, 0, 0), composite(1, 0, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with empty start
+        slice = new ColumnSlice(composite(), composite(1, 0, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for start
+        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for start and end
+        slice = new ColumnSlice(composite(0), composite(1, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+
+        // end of slice matches start of sstable for the first component, but not the second component
+        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 0, 0));
+        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for start
+        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
+        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for start and end
+        slice = new ColumnSlice(composite(0), composite(1, 0));
+        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
+
+        // first two components match, but not the last
+        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 0));
+        assertFalse(slice.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), nameType, false));
+
+        // all three components in slice end match the start of the sstable
+        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), nameType, false));
+
+
+        // filter falls entirely after sstable
+        slice = new ColumnSlice(composite(4, 0, 0), composite(4, 0, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with empty end
+        slice = new ColumnSlice(composite(4, 0, 0), composite());
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for end
+        slice = new ColumnSlice(composite(4, 0, 0), composite(1));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+        // same case, but with missing components for start and end
+        slice = new ColumnSlice(composite(4, 0), composite(1));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
+
+
+        // start of slice matches end of sstable for the first component, but not the second component
+        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
+        assertFalse(slice.intersects(columnNames(0, 0, 0), columnNames(1, 0, 0), nameType, false));
+
+        // start of slice matches end of sstable for the first two components, but not the last component
+        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
+        assertFalse(slice.intersects(columnNames(0, 0, 0), columnNames(1, 1, 0), nameType, false));
+
+        // all three components in the slice start match the end of the sstable
+        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
+        assertTrue(slice.intersects(columnNames(0, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+
+        // slice covers entire sstable (with no matching edges)
+        slice = new ColumnSlice(composite(0, 0, 0), composite(2, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // same case, but with empty ends
+        slice = new ColumnSlice(composite(), composite());
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // same case, but with missing components
+        slice = new ColumnSlice(composite(0), composite(2, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // slice covers entire sstable (with matching start)
+        slice = new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // slice covers entire sstable (with matching end)
+        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // slice covers entire sstable (with matching start and end)
+        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+
+        // slice falls entirely within sstable (with matching start)
+        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // same case, but with a missing end component
+        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // slice falls entirely within sstable (with matching end)
+        slice = new ColumnSlice(composite(1, 1, 0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+        // same case, but with a missing start component
+        slice = new ColumnSlice(composite(1, 1), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
+
+
+        // slice falls entirely within sstable
+        slice = new ColumnSlice(composite(1, 1, 0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
+
+        // same case, but with a missing start component
+        slice = new ColumnSlice(composite(1, 1), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
+
+        // same case, but with a missing start and end components
+        slice = new ColumnSlice(composite(1), composite(1, 2));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
+
+        // same case, but with an equal first component and missing start and end components
+        slice = new ColumnSlice(composite(1), composite(1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
+
+        // slice falls entirely within sstable (slice start and end are the same)
+        slice = new ColumnSlice(composite(1, 1, 1), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
+
+
+        // slice starts within sstable, empty end
+        slice = new ColumnSlice(composite(1, 1, 1), composite());
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing end components
+        slice = new ColumnSlice(composite(1, 1, 1), composite(3));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // slice starts within sstable (matching sstable start), empty end
+        slice = new ColumnSlice(composite(1, 0, 0), composite());
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing end components
+        slice = new ColumnSlice(composite(1, 0, 0), composite(3));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // slice starts within sstable (matching sstable end), empty end
+        slice = new ColumnSlice(composite(2, 0, 0), composite());
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing end components
+        slice = new ColumnSlice(composite(2, 0, 0), composite(3));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+
+        // slice ends within sstable, empty end
+        slice = new ColumnSlice(composite(), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing start components
+        slice = new ColumnSlice(composite(0), composite(1, 1, 1));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // slice ends within sstable (matching sstable start), empty start
+        slice = new ColumnSlice(composite(), composite(1, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing start components
+        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // slice ends within sstable (matching sstable end), empty start
+        slice = new ColumnSlice(composite(), composite(2, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+        // same case, but with missing start components
+        slice = new ColumnSlice(composite(0), composite(2, 0, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
+
+
+        // the slice technically falls within the sstable range, but since the first component is restricted to
+        // a single value, we can check that the second component does not fall within its min/max
+        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 3, 0));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing start component
+        slice = new ColumnSlice(composite(1, 2), composite(1, 3, 0));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing end component
+        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 3));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing start and end components
+        slice = new ColumnSlice(composite(1, 2), composite(1, 3));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with missing start and end components and different lengths for start and end
+        slice = new ColumnSlice(composite(1, 2), composite(1));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+
+        // same as the previous set of tests, but the second component is equal in the slice start and end
+        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 2, 0));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing start component
+        slice = new ColumnSlice(composite(1, 2), composite(1, 2, 0));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing end component
+        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 2));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same case, but with a missing start and end components
+        slice = new ColumnSlice(composite(1, 2), composite(1, 2));
+        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
+
+        // same as the previous tests, but it's the third component that doesn't fit in its range this time
+        slice = new ColumnSlice(composite(1, 1, 2), composite(1, 1, 3));
+        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(2, 2, 1), nameType, false));
+
+        // empty min/max column names
+        slice = new ColumnSlice(composite(), composite());
+        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
+
+        slice = new ColumnSlice(composite(1), composite());
+        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
+
+        slice = new ColumnSlice(composite(1), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite());
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(2), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite(2));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(2), composite(3));
+        assertFalse(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        // basic check on reversed slices
+        slice = new ColumnSlice(composite(1, 0, 0), composite(0, 0, 0));
+        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, true));
+
+        slice = new ColumnSlice(composite(1, 0, 0), composite(0, 0, 0));
+        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, true));
+
+        slice = new ColumnSlice(composite(1, 1, 1), composite(1, 1, 0));
+        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, true));
+    }
+
+    @Test
+    public void testDifferentMinMaxLengths()
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
+
+        // slice does intersect
+        ColumnSlice slice = new ColumnSlice(composite(), composite());
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite());
+        assertTrue(slice.intersects(columnNames(1), columnNames(1, 2), nameType, false));
+
+        slice = new ColumnSlice(composite(), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(1), composite());
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(1), composite(1));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(0), composite(1, 2, 3));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(1, 2, 3), composite(2));
+        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        // slice does not intersect
+        slice = new ColumnSlice(composite(2), composite(3, 4, 5));
+        assertFalse(slice.intersects(columnNames(), columnNames(1), nameType, false));
+
+        slice = new ColumnSlice(composite(0), composite(0, 1, 2));
+        assertFalse(slice.intersects(columnNames(1), columnNames(1, 2), nameType, false));
+    }
+
+    @Test
+    public void testDeoverlapSlices()
+    {
+        ColumnSlice[] slices;
+
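+        // deoverlapSlices() should merge overlapping or unordered slices into a sorted, non-overlapping set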
+        // Preserve correct slices
+        slices = slices(s(0, 3), s(4, 5), s(6, 9));
+        assertSlicesValid(slices);
+        assertSlicesEquals(slices, deoverlapSlices(slices));
+
+        // Simple overlap
+        slices = slices(s(0, 3), s(2, 5), s(8, 9));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(0, 5), s(8, 9)), deoverlapSlices(slices));
+
+        // Slice overlaps others fully
+        slices = slices(s(0, 10), s(2, 5), s(8, 9));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(0, 10)), deoverlapSlices(slices));
+
+        // Slice with empty end overlaps others fully
+        slices = slices(s(0, -1), s(2, 5), s(8, 9));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(0, -1)), deoverlapSlices(slices));
+
+        // Overlap with slices selecting only one element
+        slices = slices(s(0, 4), s(4, 4), s(4, 8));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(0, 8)), deoverlapSlices(slices));
+
+        // Unordered slices (without overlap)
+        slices = slices(s(4, 8), s(0, 3), s(9, 9));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(0, 3), s(4, 8), s(9, 9)), deoverlapSlices(slices));
+
+        // Full range is selected, but not by any single slice
+        slices = slices(s(5, -1), s(2, 5), s(-1, 2));
+        assertSlicesInvalid(slices);
+        assertSlicesEquals(slices(s(-1, -1)), deoverlapSlices(slices));
+    }
+
+    @Test
+    public void testValidateSlices()
+    {
+        assertSlicesValid(slices(s(0, 3)));
+        assertSlicesValid(slices(s(3, 3)));
+        assertSlicesValid(slices(s(3, 3), s(4, 4)));
+        assertSlicesValid(slices(s(0, 3), s(4, 5), s(6, 9)));
+        assertSlicesValid(slices(s(-1, -1)));
+        assertSlicesValid(slices(s(-1, 3), s(4, -1)));
+
+        assertSlicesInvalid(slices(s(3, 0)));
+        assertSlicesInvalid(slices(s(0, 2), s(2, 4)));
+        assertSlicesInvalid(slices(s(0, 2), s(1, 4)));
+        assertSlicesInvalid(slices(s(0, 2), s(3, 4), s(3, 4)));
+        assertSlicesInvalid(slices(s(-1, 2), s(3, -1), s(5, 9)));
+    }
+
+    private static Composite composite(Integer ... components)
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
+        return nameType.make((Object[]) components);
+    }
+
+    private static List<ByteBuffer> columnNames(Integer ... components)
+    {
+        List<ByteBuffer> names = new ArrayList<>(components.length);
+        for (int component : components)
+            names.add(ByteBufferUtil.bytes(component));
+        return names;
+    }
+
+    private static Composite simpleComposite(int i)
+    {
+        // We treat negative values as EMPTY for convenience's sake
+        if (i < 0)
+            return Composites.EMPTY;
+
+        return simpleIntType.make(i);
+    }
+
+    private static ColumnSlice s(int start, int finish)
+    {
+        return new ColumnSlice(simpleComposite(start), simpleComposite(finish));
+    }
+
+    private static ColumnSlice[] slices(ColumnSlice... slices)
+    {
+        return slices;
+    }
+
+    private static ColumnSlice[] deoverlapSlices(ColumnSlice[] slices)
+    {
+        return ColumnSlice.deoverlapSlices(slices, simpleIntType);
+    }
+
+    private static void assertSlicesValid(ColumnSlice[] slices)
+    {
+        assertTrue("Slices " + toString(slices) + " should be valid", ColumnSlice.validateSlices(slices, simpleIntType, false));
+    }
+
+    private static void assertSlicesInvalid(ColumnSlice[] slices)
+    {
+        assertFalse("Slices " + toString(slices) + " shouldn't be valid", ColumnSlice.validateSlices(slices, simpleIntType, false));
+    }
+
+    private static void assertSlicesEquals(ColumnSlice[] expected, ColumnSlice[] actual)
+    {
+        assertTrue("Expected " + toString(expected) + " but got " + toString(actual), Arrays.equals(expected, actual));
+    }
+
+    private static String toString(ColumnSlice[] slices)
+    {
+        StringBuilder sb = new StringBuilder().append("[");
+        for (int i = 0; i < slices.length; i++)
+        {
+            if (i > 0)
+                sb.append(", ");
+
+            ColumnSlice slice = slices[i];
+            sb.append("(");
+            sb.append(slice.start.isEmpty() ? "-1" : simpleIntType.getString(slice.start));
+            sb.append(", ");
+            sb.append(slice.finish.isEmpty() ? "-1" : simpleIntType.getString(slice.finish));
+            sb.append(")");
+        }
+        return sb.append("]").toString();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java b/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java
index b983e6e..6aeee67 100644
--- a/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java
@@ -21,21 +21,30 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
 import java.util.Set;
 
 import org.junit.Before;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.filter.ExtendedFilter;
 import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 public class PerRowSecondaryIndexTest extends SchemaLoader
 {
@@ -53,65 +62,87 @@
     }
 
     @Test
-    public void testIndexInsertAndUpdate() throws IOException
+    public void testIndexInsertAndUpdate()
     {
         // create a row then test that the configured index instance was able to read the row
-        RowMutation rm;
-        rm = new RowMutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("indexed"), ByteBufferUtil.bytes("foo"), 1);
+        Mutation rm;
+        rm = new Mutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("foo"), 1);
         rm.apply();
 
         ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
         assertNotNull(indexedRow);
-        assertEquals(ByteBufferUtil.bytes("foo"), indexedRow.getColumn(ByteBufferUtil.bytes("indexed")).value());
+        assertEquals(ByteBufferUtil.bytes("foo"), indexedRow.getColumn(Util.cellname("indexed")).value());
 
         // update the row and verify what was indexed
-        rm = new RowMutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", ByteBufferUtil.bytes("indexed"), ByteBufferUtil.bytes("bar"), 2);
+        rm = new Mutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k1"));
+        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("bar"), 2);
         rm.apply();
 
         indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
         assertNotNull(indexedRow);
-        assertEquals(ByteBufferUtil.bytes("bar"), indexedRow.getColumn(ByteBufferUtil.bytes("indexed")).value());
+        assertEquals(ByteBufferUtil.bytes("bar"), indexedRow.getColumn(Util.cellname("indexed")).value());
         assertTrue(Arrays.equals("k1".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
     }
 
     @Test
-    public void testColumnDelete() throws IOException
+    public void testColumnDelete()
     {
         // issue a column delete and test that the configured index instance was notified to update
-        RowMutation rm;
-        rm = new RowMutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k2"));
-        rm.delete("Indexed1", ByteBufferUtil.bytes("indexed"), 1);
+        Mutation rm;
+        rm = new Mutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k2"));
+        rm.delete("Indexed1", Util.cellname("indexed"), 1);
         rm.apply();
 
         ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
         assertNotNull(indexedRow);
 
-        for (Column column : indexedRow.getSortedColumns())
-        {
-            assertTrue(column.isMarkedForDelete(System.currentTimeMillis()));
-        }
+        for (Cell cell : indexedRow.getSortedColumns())
+            assertFalse(cell.isLive());
+
         assertTrue(Arrays.equals("k2".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
     }
 
     @Test
-    public void testRowDelete() throws IOException
+    public void testRowDelete()
     {
         // issue a row level delete and test that the configured index instance was notified to update
-        RowMutation rm;
-        rm = new RowMutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k3"));
+        Mutation rm;
+        rm = new Mutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k3"));
         rm.delete("Indexed1", 1);
         rm.apply();
 
         ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
         assertNotNull(indexedRow);
-        for (Column column : indexedRow.getSortedColumns())
-        {
-            assertTrue(column.isMarkedForDelete(System.currentTimeMillis()));
-        }
+        for (Cell cell : indexedRow.getSortedColumns())
+            assertFalse(cell.isLive());
+
         assertTrue(Arrays.equals("k3".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
     }
+    
+    @Test
+    public void testInvalidSearch() throws IOException
+    {
+        Mutation rm;
+        rm = new Mutation("PerRowSecondaryIndex", ByteBufferUtil.bytes("k4"));
+        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("foo"), 1);
+        rm.apply();
+        
+        // test we can search:
+        UntypedResultSet result = QueryProcessor.executeInternal("SELECT * FROM \"PerRowSecondaryIndex\".\"Indexed1\" WHERE indexed = 'foo'");
+        assertEquals(1, result.size());
+
+        // test we can't search if the searcher doesn't validate the expression:
+        try
+        {
+            QueryProcessor.executeInternal("SELECT * FROM \"PerRowSecondaryIndex\".\"Indexed1\" WHERE indexed = 'invalid'");
+            fail("Query should have been invalid!");
+        }
+        catch (Exception e)
+        {
+            assertTrue(e instanceof InvalidRequestException || (e.getCause() != null && (e.getCause() instanceof InvalidRequestException)));
+        }
+    }
 
     public static class TestIndex extends PerRowSecondaryIndex
     {
@@ -130,12 +161,12 @@
             QueryFilter filter = QueryFilter.getIdentityFilter(DatabaseDescriptor.getPartitioner().decorateKey(rowKey),
                                                                baseCfs.getColumnFamilyName(),
                                                                System.currentTimeMillis());
-            LAST_INDEXED_ROW = baseCfs.getColumnFamily(filter);
+            LAST_INDEXED_ROW = cf;
             LAST_INDEXED_KEY = rowKey;
         }
 
         @Override
-        public void delete(DecoratedKey key)
+        public void delete(DecoratedKey key, OpOrder.Group opGroup)
         {
         }
 
@@ -163,7 +194,23 @@
         @Override
         protected SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
         {
-            return null;
+            return new SecondaryIndexSearcher(baseCfs.indexManager, columns)
+            {
+                
+                @Override
+                public List<Row> search(ExtendedFilter filter)
+                {
+                    return Arrays.asList(new Row(LAST_INDEXED_KEY, LAST_INDEXED_ROW));
+                }
+
+                @Override
+                public void validate(IndexExpression indexExpression) throws InvalidRequestException
+                {
+                    if (indexExpression.value.equals(ByteBufferUtil.bytes("invalid")))
+                        throw new InvalidRequestException("Invalid search!");
+                }
+                
+            };
         }
 
         @Override
@@ -172,15 +219,15 @@
         }
 
         @Override
-        public long getLiveSize()
+        public ColumnFamilyStore getIndexCfs()
         {
-            return 0;
+            return baseCfs;
         }
 
         @Override
-        public ColumnFamilyStore getIndexCfs()
+        public boolean indexes(CellName name)
         {
-            return null;
+            return true;
         }
 
         @Override
@@ -197,5 +244,10 @@
         public void truncateBlocking(long truncatedAt)
         {
         }
+
+        @Override
+        public long estimateResultRows()
+        {
+            return 0;
+        }
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/marshal/CollectionTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/CollectionTypeTest.java
index fba4742..18156c3 100644
--- a/test/unit/org/apache/cassandra/db/marshal/CollectionTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/CollectionTypeTest.java
@@ -26,8 +26,10 @@
 
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.serializers.*;
 
 public class CollectionTypeTest
 {
@@ -52,8 +54,8 @@
         {
             for (int j = i+1; j < lists.length; j++)
             {
-                assertEquals(lt.compare(lists[i], lists[j]), -1);
-                assertEquals(lt.compare(lists[j], lists[i]), 1);
+                assertEquals(String.format("compare(lists[%d], lists[%d])", i, j), -1, lt.compare(lists[i], lists[j]));
+                assertEquals(String.format("compare(lists[%d], lists[%d])", j, i),  1, lt.compare(lists[j], lists[i]));
             }
         }
     }
@@ -79,8 +81,8 @@
         {
             for (int j = i+1; j < sets.length; j++)
             {
-                assertEquals(st.compare(sets[i], sets[j]), -1);
-                assertEquals(st.compare(sets[j], sets[i]), 1);
+                assertEquals(String.format("compare(sets[%d], sets[%d])", i, j), -1, st.compare(sets[i], sets[j]));
+                assertEquals(String.format("compare(sets[%d], sets[%d])", j, i),  1, st.compare(sets[j], sets[i]));
             }
         }
     }
@@ -108,9 +110,99 @@
         {
             for (int j = i+1; j < maps.length; j++)
             {
-                assertEquals(mt.compare(maps[i], maps[j]), -1);
-                assertEquals(mt.compare(maps[j], maps[i]), 1);
+                assertEquals(String.format("compare(maps[%d], maps[%d])", i, j), -1, mt.compare(maps[i], maps[j]));
+                assertEquals(String.format("compare(maps[%d], maps[%d])", j, i),  1, mt.compare(maps[j], maps[i]));
             }
         }
     }
+
+    @Test
+    public void listSerDerTest()
+    {
+        ListSerializer<String> sls = ListType.getInstance(UTF8Type.instance).getSerializer();
+        ListSerializer<Integer> ils = ListType.getInstance(Int32Type.instance).getSerializer();
+
+        List<String> sl = Arrays.asList("Foo", "Bar");
+        List<Integer> il = Arrays.asList(3, 1, 5);
+
+        ByteBuffer sb = sls.serialize(sl);
+        ByteBuffer ib = ils.serialize(il);
+
+        assertEquals(sls.deserialize(sb), sl);
+        assertEquals(ils.deserialize(ib), il);
+
+        sls.validate(sb);
+        ils.validate(ib);
+
+        // string list with integer list type
+        assertInvalid(ils, sb);
+        // non list value
+        assertInvalid(sls, UTF8Type.instance.getSerializer().serialize("foo"));
+    }
+
+    @Test
+    public void setSerDerTest()
+    {
+        SetSerializer<String> sss = SetType.getInstance(UTF8Type.instance).getSerializer();
+        SetSerializer<Integer> iss = SetType.getInstance(Int32Type.instance).getSerializer();
+
+        Set<String> ss = new HashSet(){{ add("Foo"); add("Bar"); }};
+        Set<Integer> is = new HashSet(){{ add(3); add(1); add(5); }};
+
+        ByteBuffer sb = sss.serialize(ss);
+        ByteBuffer ib = iss.serialize(is);
+
+        assertEquals(sss.deserialize(sb), ss);
+        assertEquals(iss.deserialize(ib), is);
+
+        sss.validate(sb);
+        iss.validate(ib);
+
+        // string set with integer set type
+        assertInvalid(iss, sb);
+        // non set value
+        assertInvalid(sss, UTF8Type.instance.getSerializer().serialize("foo"));
+    }
+
+    @Test
+    public void setMapDerTest()
+    {
+        MapSerializer<String, String> sms = MapType.getInstance(UTF8Type.instance, UTF8Type.instance).getSerializer();
+        MapSerializer<Integer, Integer> ims = MapType.getInstance(Int32Type.instance, Int32Type.instance).getSerializer();
+
+        Map<String, String> sm = new HashMap(){{ put("Foo", "xxx"); put("Bar", "yyy"); }};
+        Map<Integer, Integer> im = new HashMap(){{ put(3, 0); put(1, 8); put(5, 2); }};
+
+        ByteBuffer sb = sms.serialize(sm);
+        ByteBuffer ib = ims.serialize(im);
+
+        assertEquals(sms.deserialize(sb), sm);
+        assertEquals(ims.deserialize(ib), im);
+
+        sms.validate(sb);
+        ims.validate(ib);
+
+        // string map with integer map type
+        assertInvalid(ims, sb);
+        // non map value
+        assertInvalid(sms, UTF8Type.instance.getSerializer().serialize("foo"));
+
+        MapSerializer<Integer, String> sims = MapType.getInstance(Int32Type.instance, UTF8Type.instance).getSerializer();
+        MapSerializer<String, Integer> isms = MapType.getInstance(UTF8Type.instance, Int32Type.instance).getSerializer();
+
+        // only keys are invalid
+        assertInvalid(isms, sb);
+        // only values are invalid
+        assertInvalid(sims, sb);
+    }
+
+    private void assertInvalid(TypeSerializer<?> type, ByteBuffer value)
+    {
+        try
+        {
+            type.validate(value);
+            fail("Value " + ByteBufferUtil.bytesToHex(value) + " shouldn't be valid for type " + type);
+        }
+        catch (MarshalException e)
+        {
+            // ok, that's what we want
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java
index df6f5e1..f606780 100644
--- a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java
@@ -24,20 +24,17 @@
 import java.util.List;
 import java.util.UUID;
 
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
 import org.apache.cassandra.serializers.MarshalException;
 import org.junit.Test;
 import static org.junit.Assert.fail;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.utils.*;
 
@@ -177,7 +174,7 @@
         ByteBuffer cname5 = createCompositeKey("test2", uuids[1], 42, false);
 
         ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
+        Mutation rm = new Mutation("Keyspace1", key);
         addColumn(rm, cname5);
         addColumn(rm, cname1);
         addColumn(rm, cname4);
@@ -187,13 +184,13 @@
 
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("k"), cfName, System.currentTimeMillis()));
 
-        Iterator<Column> iter = cf.getSortedColumns().iterator();
+        Iterator<Cell> iter = cf.getSortedColumns().iterator();
 
-        assert iter.next().name().equals(cname1);
-        assert iter.next().name().equals(cname2);
-        assert iter.next().name().equals(cname3);
-        assert iter.next().name().equals(cname4);
-        assert iter.next().name().equals(cname5);
+        assert iter.next().name().toByteBuffer().equals(cname1);
+        assert iter.next().name().toByteBuffer().equals(cname2);
+        assert iter.next().name().toByteBuffer().equals(cname3);
+        assert iter.next().name().toByteBuffer().equals(cname4);
+        assert iter.next().name().toByteBuffer().equals(cname5);
     }
 
     @Test
@@ -259,303 +256,9 @@
         }
     }
 
-    @Test
-    public void testIntersectsSingleSlice()
+    private void addColumn(Mutation rm, ByteBuffer cname)
     {
-        CompositeType comparator = CompositeType.getInstance(Int32Type.instance, Int32Type.instance, Int32Type.instance);
-
-        // filter falls entirely before sstable
-        SliceQueryFilter filter = new SliceQueryFilter(composite(0, 0, 0), composite(1, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with empty start
-        filter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, composite(1, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for start
-        filter = new SliceQueryFilter(composite(0), composite(1, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for start and end
-        filter = new SliceQueryFilter(composite(0), composite(1, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-
-        // end of slice matches start of sstable for the first component, but not the second component
-        filter = new SliceQueryFilter(composite(0, 0, 0), composite(1, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for start
-        filter = new SliceQueryFilter(composite(0), composite(1, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for start and end
-        filter = new SliceQueryFilter(composite(0), composite(1, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), filter));
-
-        // first two components match, but not the last
-        filter = new SliceQueryFilter(composite(0, 0, 0), composite(1, 1, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), filter));
-
-        // all three components in slice end match the start of the sstable
-        filter = new SliceQueryFilter(composite(0, 0, 0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), filter));
-
-
-        // filter falls entirely after sstable
-        filter = new SliceQueryFilter(composite(4, 0, 0), composite(4, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with empty end
-        filter = new SliceQueryFilter(composite(4, 0, 0), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for end
-        filter = new SliceQueryFilter(composite(4, 0, 0), composite(1), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        // same case, but with missing components for start and end
-        filter = new SliceQueryFilter(composite(4, 0), composite(1), false, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-
-        // start of slice matches end of sstable for the first component, but not the second component
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(2, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(0, 0, 0), columnNames(1, 0, 0), filter));
-
-        // start of slice matches end of sstable for the first two components, but not the last component
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(2, 0, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(0, 0, 0), columnNames(1, 1, 0), filter));
-
-        // all three components in the slice start match the end of the sstable
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(2, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(0, 0, 0), columnNames(1, 1, 1), filter));
-
-
-        // slice covers entire sstable (with no matching edges)
-        filter = new SliceQueryFilter(composite(0, 0, 0), composite(2, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // same case, but with empty ends
-        filter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // same case, but with missing components
-        filter = new SliceQueryFilter(composite(0), composite(2, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // slice covers entire sstable (with matching start)
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(2, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // slice covers entire sstable (with matching end)
-        filter = new SliceQueryFilter(composite(0, 0, 0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // slice covers entire sstable (with matching start and end)
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-
-        // slice falls entirely within sstable (with matching start)
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(1, 1, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // same case, but with a missing end component
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // slice falls entirely within sstable (with matching end)
-        filter = new SliceQueryFilter(composite(1, 1, 0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-        // same case, but with a missing start component
-        filter = new SliceQueryFilter(composite(1, 1), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), filter));
-
-
-        // slice falls entirely within sstable
-        filter = new SliceQueryFilter(composite(1, 1, 0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), filter));
-
-        // same case, but with a missing start component
-        filter = new SliceQueryFilter(composite(1, 1), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), filter));
-
-        // same case, but with a missing start and end components
-        filter = new SliceQueryFilter(composite(1), composite(1, 2), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), filter));
-
-        // slice falls entirely within sstable (slice start and end are the same)
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), filter));
-
-
-        // slice starts within sstable, empty end
-        filter = new SliceQueryFilter(composite(1, 1, 1), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing end components
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(3), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // slice starts within sstable (matching sstable start), empty end
-        filter = new SliceQueryFilter(composite(1, 0, 0), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing end components
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(3), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // slice starts within sstable (matching sstable end), empty end
-        filter = new SliceQueryFilter(composite(2, 0, 0), ByteBufferUtil.EMPTY_BYTE_BUFFER, false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing end components
-        filter = new SliceQueryFilter(composite(2, 0, 0), composite(3), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-
-        // slice ends within sstable, empty end
-        filter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing start components
-        filter = new SliceQueryFilter(composite(0), composite(1, 1, 1), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // slice ends within sstable (matching sstable start), empty start
-        filter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, composite(1, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing start components
-        filter = new SliceQueryFilter(composite(0), composite(1, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // slice ends within sstable (matching sstable end), empty start
-        filter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER, composite(2, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-        // same case, but with missing start components
-        filter = new SliceQueryFilter(composite(0), composite(2, 0, 0), false, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), filter));
-
-
-        // the slice technically falls within the sstable range, but since the first component is restricted to
-        // a single value, we can check that the second component does not fall within its min/max
-        filter = new SliceQueryFilter(composite(1, 2, 0), composite(1, 3, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing start component
-        filter = new SliceQueryFilter(composite(1, 2), composite(1, 3, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing end component
-        filter = new SliceQueryFilter(composite(1, 2, 0), composite(1, 3), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing start and end components
-        filter = new SliceQueryFilter(composite(1, 2), composite(1, 3), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with missing start and end components and different lengths for start and end
-        filter = new SliceQueryFilter(composite(1, 2), composite(1), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-
-        // same as the previous set of tests, but the second component is equal in the slice start and end
-        filter = new SliceQueryFilter(composite(1, 2, 0), composite(1, 2, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing start component
-        filter = new SliceQueryFilter(composite(1, 2), composite(1, 2, 0), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing end component
-        filter = new SliceQueryFilter(composite(1, 2, 0), composite(1, 2), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same case, but with a missing start and end components
-        filter = new SliceQueryFilter(composite(1, 2), composite(1, 2), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), filter));
-
-        // same as the previous tests, but it's the third component that doesn't fit in its range this time
-        filter = new SliceQueryFilter(composite(1, 1, 2), composite(1, 1, 3), false, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 0), columnNames(2, 2, 1), filter));
-
-
-        // basic check on reversed slices
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(0, 0, 0), true, 1);
-        assertFalse(comparator.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), filter));
-
-        filter = new SliceQueryFilter(composite(1, 0, 0), composite(0, 0, 0), true, 1);
-        assertFalse(comparator.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), filter));
-
-        filter = new SliceQueryFilter(composite(1, 1, 1), composite(1, 1, 0), true, 1);
-        assertTrue(comparator.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), filter));
-    }
-
-    @Test
-    public void testIntersectsMultipleSlices()
-    {
-        CompositeType comparator = CompositeType.getInstance(Int32Type.instance, Int32Type.instance, Int32Type.instance);
-
-        // all slices intersect
-        SliceQueryFilter filter = new SliceQueryFilter(new ColumnSlice[]{
-            new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0)),
-            new ColumnSlice(composite(3, 0, 0), composite(4, 0, 0)),
-            new ColumnSlice(composite(5, 0, 0), composite(6, 0, 0)),
-        }, false, 1);
-
-        // first slice doesn't intersect
-        assertTrue(comparator.intersects(columnNames(0, 0, 0), columnNames(7, 0, 0), filter));
-        filter = new SliceQueryFilter(new ColumnSlice[]{
-                new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0)),
-                new ColumnSlice(composite(3, 0, 0), composite(4, 0, 0)),
-                new ColumnSlice(composite(5, 0, 0), composite(6, 0, 0)),
-        }, false, 1);
-        assertTrue(comparator.intersects(columnNames(3, 0, 0), columnNames(7, 0, 0), filter));
-
-        // first two slices don't intersect
-        assertTrue(comparator.intersects(columnNames(0, 0, 0), columnNames(7, 0, 0), filter));
-        filter = new SliceQueryFilter(new ColumnSlice[]{
-                new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0)),
-                new ColumnSlice(composite(3, 0, 0), composite(4, 0, 0)),
-                new ColumnSlice(composite(5, 0, 0), composite(6, 0, 0)),
-        }, false, 1);
-        assertTrue(comparator.intersects(columnNames(5, 0, 0), columnNames(7, 0, 0), filter));
-
-        // none of the slices intersect
-        assertTrue(comparator.intersects(columnNames(0, 0, 0), columnNames(7, 0, 0), filter));
-        filter = new SliceQueryFilter(new ColumnSlice[]{
-                new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0)),
-                new ColumnSlice(composite(3, 0, 0), composite(4, 0, 0)),
-                new ColumnSlice(composite(5, 0, 0), composite(6, 0, 0)),
-        }, false, 1);
-        assertFalse(comparator.intersects(columnNames(7, 0, 0), columnNames(8, 0, 0), filter));
-    }
-
-
-    private static ByteBuffer composite(Integer ... components)
-    {
-        CompositeType comparator = CompositeType.getInstance(Int32Type.instance, Int32Type.instance, Int32Type.instance);
-        CompositeType.Builder builder = comparator.builder();
-        for (int component : components)
-            builder.add(ByteBufferUtil.bytes(component));
-        return builder.build();
-    }
-
-    private static List<ByteBuffer> columnNames(Integer ... components)
-    {
-        List<ByteBuffer> names = new ArrayList<>(components.length);
-        for (int component : components)
-            names.add(ByteBufferUtil.bytes(component));
-        return names;
-    }
-
-    private void addColumn(RowMutation rm, ByteBuffer cname)
-    {
-        rm.add(cfName, cname, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+        rm.add(cfName, CellNames.simpleDense(cname), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
     }
 
     private ByteBuffer createCompositeKey(String s, UUID uuid, int i, boolean lastIsOne)
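
For context on the rename in the hunks above (RowMutation becomes Mutation, and column names become CellName values): a minimal, hedged sketch of the new write-side pattern, assuming the Cassandra 2.1 test classpath and the Keyspace1/Standard1 schema created by SchemaLoader. MutationSketch is an illustrative name, not part of the tree.

import java.nio.ByteBuffer;

import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.db.composites.CellNames;
import org.apache.cassandra.utils.ByteBufferUtil;

public class MutationSketch
{
    // Column names are CellName instances now; simpleDense() wraps a raw
    // ByteBuffer name for dense (non-CQL3) column families.
    static void addColumn(Mutation rm, String cfName, ByteBuffer cname)
    {
        rm.add(cfName, CellNames.simpleDense(cname), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
    }

    static Mutation exampleMutation()
    {
        Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("k"));
        addColumn(rm, "Standard1", ByteBufferUtil.bytes("c"));
        return rm;
    }
}
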
diff --git a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
index 763779d..e248eae 100644
--- a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.utils.*;
 
@@ -174,7 +175,7 @@
         ByteBuffer cname5 = createDynamicCompositeKey("test2", uuids[1], 42, false);
 
         ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
+        Mutation rm = new Mutation("Keyspace1", key);
         addColumn(rm, cname5);
         addColumn(rm, cname1);
         addColumn(rm, cname4);
@@ -184,13 +185,13 @@
 
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("k"), cfName, System.currentTimeMillis()));
 
-        Iterator<Column> iter = cf.getSortedColumns().iterator();
+        Iterator<Cell> iter = cf.getSortedColumns().iterator();
 
-        assert iter.next().name().equals(cname1);
-        assert iter.next().name().equals(cname2);
-        assert iter.next().name().equals(cname3);
-        assert iter.next().name().equals(cname4);
-        assert iter.next().name().equals(cname5);
+        assert iter.next().name().toByteBuffer().equals(cname1);
+        assert iter.next().name().toByteBuffer().equals(cname2);
+        assert iter.next().name().toByteBuffer().equals(cname3);
+        assert iter.next().name().toByteBuffer().equals(cname4);
+        assert iter.next().name().toByteBuffer().equals(cname5);
     }
 
     @Test
@@ -206,7 +207,7 @@
         ByteBuffer cname5 = createDynamicCompositeKey("test2", uuids[1], 42, false, true);
 
         ByteBuffer key = ByteBufferUtil.bytes("kr");
-        RowMutation rm = new RowMutation("Keyspace1", key);
+        Mutation rm = new Mutation("Keyspace1", key);
         addColumn(rm, cname5);
         addColumn(rm, cname1);
         addColumn(rm, cname4);
@@ -216,7 +217,7 @@
 
         ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("kr"), cfName, System.currentTimeMillis()));
 
-        Iterator<Column> iter = cf.getSortedColumns().iterator();
+        Iterator<Cell> iter = cf.getSortedColumns().iterator();
 
         assert iter.next().name().equals(cname5);
         assert iter.next().name().equals(cname4);
@@ -291,9 +292,9 @@
         assert !TypeParser.parse("DynamicCompositeType(a => BytesType)").isCompatibleWith(TypeParser.parse("DynamicCompositeType(a => BytesType, b => AsciiType)"));
     }
 
-    private void addColumn(RowMutation rm, ByteBuffer cname)
+    private void addColumn(Mutation rm, ByteBuffer cname)
     {
-        rm.add(cfName, cname, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+        rm.add(cfName, CellNames.simpleDense(cname), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
     }
 
     private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne)
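
Likewise on the read side of the hunks above: Column is now Cell, and name() returns a CellName rather than a ByteBuffer, so byte-level comparisons go through toByteBuffer(). A small sketch under the same classpath assumption; CellNameSketch is an illustrative name.

import java.nio.ByteBuffer;
import java.util.Iterator;

import org.apache.cassandra.db.Cell;
import org.apache.cassandra.db.ColumnFamily;

public class CellNameSketch
{
    // name() returns a CellName, not a ByteBuffer, so convert before comparing
    // against the ByteBuffer the test originally inserted.
    static boolean firstCellNamed(ColumnFamily cf, ByteBuffer expected)
    {
        Iterator<Cell> iter = cf.getSortedColumns().iterator();
        return iter.hasNext() && iter.next().name().toByteBuffer().equals(expected);
    }
}
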
diff --git a/test/unit/org/apache/cassandra/db/marshal/TimeUUIDTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/TimeUUIDTypeTest.java
index 703845b..4e22df8 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TimeUUIDTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TimeUUIDTypeTest.java
@@ -18,7 +18,6 @@
 */
 package org.apache.cassandra.db.marshal;
 
-import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Random;
@@ -35,7 +34,7 @@
     TimeUUIDType timeUUIDType = new TimeUUIDType();
 
     @Test
-    public void testEquality() throws UnknownHostException
+    public void testEquality()
     {
         UUID a = UUIDGen.getTimeUUID();
         UUID b = new UUID(a.getMostSignificantBits(), a.getLeastSignificantBits());
@@ -46,7 +45,7 @@
     }
 
     @Test
-    public void testSmaller() throws UnknownHostException
+    public void testSmaller()
     {
         UUID a = UUIDGen.getTimeUUID();
         UUID b = UUIDGen.getTimeUUID();
@@ -62,7 +61,7 @@
     }
 
     @Test
-    public void testBigger() throws UnknownHostException
+    public void testBigger()
     {
         UUID a = UUIDGen.getTimeUUID();
         UUID b = UUIDGen.getTimeUUID();
diff --git a/test/unit/org/apache/cassandra/db/marshal/UUIDTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/UUIDTypeTest.java
index 3d6d5aa..1ecacf3 100644
--- a/test/unit/org/apache/cassandra/db/marshal/UUIDTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/UUIDTypeTest.java
@@ -30,13 +30,14 @@
 
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDGen;
-import org.apache.log4j.Logger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.junit.Test;
 
 public class UUIDTypeTest
 {
 
-    private static final Logger logger = Logger.getLogger(UUIDTypeTest.class);
+    private static final Logger logger = LoggerFactory.getLogger(UUIDTypeTest.class);
 
     UUIDType uuidType = new UUIDType();
 
diff --git a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java
index 80021fd..e14dec8 100644
--- a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java
+++ b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java
@@ -18,17 +18,24 @@
 */
 package org.apache.cassandra.dht;
 
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 
 public class OrderPreservingPartitionerTest extends PartitionerTestCase<StringToken>
 {
+    @BeforeClass
+    public static void cleanStatesFromPreviousTest()
+    {
+        // Since OrderPreservingPartitioner#describeOwnership tries to read SSTables,
+        // we need to clean the data dir of garbage left by previous tests before running these tests.
+        SchemaLoader.cleanupAndLeaveDirs();
+    }
+
     public void initPartitioner()
     {
         partitioner = new OrderPreservingPartitioner();
-        // need to clear data dir
-        SchemaLoader.cleanupAndLeaveDirs();
     }
 
     @Test
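
The fixture change above moves the data-directory cleanup out of initPartitioner() and into a JUnit @BeforeClass method, so it runs once before the class's tests instead of on every partitioner initialization. A hedged sketch of the pattern, assuming the Cassandra test classpath; CleanupSketch is an illustrative name.

import org.junit.BeforeClass;
import org.junit.Test;

import org.apache.cassandra.SchemaLoader;

public class CleanupSketch
{
    @BeforeClass
    public static void cleanStatesFromPreviousTest()
    {
        // Remove on-disk leftovers from earlier tests run in the same JVM.
        SchemaLoader.cleanupAndLeaveDirs();
    }

    @Test
    public void runsAgainstCleanDataDirectory()
    {
        // test body elided; every test in the class now starts from clean dirs
    }
}
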
diff --git a/test/unit/org/apache/cassandra/gms/SerializationsTest.java b/test/unit/org/apache/cassandra/gms/SerializationsTest.java
index 7e0008d..6317a98 100644
--- a/test/unit/org/apache/cassandra/gms/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/gms/SerializationsTest.java
@@ -20,6 +20,7 @@
 
 import org.apache.cassandra.AbstractSerializationsTester;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
 import org.junit.Test;
@@ -38,7 +39,7 @@
 {
     private void testEndpointStateWrite() throws IOException
     {
-        DataOutputStream out = getOutput("gms.EndpointState.bin");
+        DataOutputStreamAndChannel out = getOutput("gms.EndpointState.bin");
         HeartBeatState.serializer.serialize(Statics.HeartbeatSt, out, getVersion());
         EndpointState.serializer.serialize(Statics.EndpointSt, out, getVersion());
         VersionedValue.serializer.serialize(Statics.vv0, out, getVersion());
@@ -75,7 +76,7 @@
         GossipDigestAck2 ack2 = new GossipDigestAck2(states);
         GossipDigestSyn syn = new GossipDigestSyn("Not a real cluster name", StorageService.getPartitioner().getClass().getCanonicalName(), Statics.Digests);
 
-        DataOutputStream out = getOutput("gms.Gossip.bin");
+        DataOutputStreamAndChannel out = getOutput("gms.Gossip.bin");
         for (GossipDigest gd : Statics.Digests)
             GossipDigest.serializer.serialize(gd, out, getVersion());
         GossipDigestAck.serializer.serialize(ack, out, getVersion());
diff --git a/test/unit/org/apache/cassandra/io/LazilyCompactedRowTest.java b/test/unit/org/apache/cassandra/io/LazilyCompactedRowTest.java
deleted file mode 100644
index 0cd9622..0000000
--- a/test/unit/org/apache/cassandra/io/LazilyCompactedRowTest.java
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.io;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-
-import com.google.common.base.Objects;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.compaction.*;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
-import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.io.util.MappedFileDataInput;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CloseableIterator;
-
-import static org.junit.Assert.assertEquals;
-
-
-public class LazilyCompactedRowTest extends SchemaLoader
-{
-    private static void assertBytes(ColumnFamilyStore cfs, int gcBefore) throws IOException
-    {
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
-        Collection<SSTableReader> sstables = cfs.getSSTables();
-
-        // compare eager and lazy compactions
-        AbstractCompactionIterable eager = new CompactionIterable(OperationType.UNKNOWN,
-                                                                  strategy.getScanners(sstables),
-                                                                  new PreCompactingController(cfs, sstables, gcBefore));
-        AbstractCompactionIterable lazy = new CompactionIterable(OperationType.UNKNOWN,
-                                                                 strategy.getScanners(sstables),
-                                                                 new LazilyCompactingController(cfs, sstables, gcBefore));
-        assertBytes(cfs, eager, lazy);
-
-        // compare eager and parallel-lazy compactions
-        eager = new CompactionIterable(OperationType.UNKNOWN,
-                                       strategy.getScanners(sstables),
-                                       new PreCompactingController(cfs, sstables, gcBefore));
-        AbstractCompactionIterable parallel = new ParallelCompactionIterable(OperationType.UNKNOWN,
-                                                                             strategy.getScanners(sstables),
-                                                                             new CompactionController(cfs, new HashSet<SSTableReader>(sstables), gcBefore),
-                                                                             0);
-        assertBytes(cfs, eager, parallel);
-    }
-
-    private static void assertBytes(ColumnFamilyStore cfs, AbstractCompactionIterable ci1, AbstractCompactionIterable ci2) throws IOException
-    {
-        CloseableIterator<AbstractCompactedRow> iter1 = ci1.iterator();
-        CloseableIterator<AbstractCompactedRow> iter2 = ci2.iterator();
-
-        while (true)
-        {
-            if (!iter1.hasNext())
-            {
-                assert !iter2.hasNext();
-                break;
-            }
-
-            AbstractCompactedRow row1 = iter1.next();
-            AbstractCompactedRow row2 = iter2.next();
-            DataOutputBuffer out1 = new DataOutputBuffer();
-            DataOutputBuffer out2 = new DataOutputBuffer();
-            row1.write(-1, out1);
-            row2.write(-1, out2);
-
-            File tmpFile1 = File.createTempFile("lcrt1", null);
-            File tmpFile2 = File.createTempFile("lcrt2", null);
-
-            tmpFile1.deleteOnExit();
-            tmpFile2.deleteOnExit();
-
-            new FileOutputStream(tmpFile1).write(out1.getData()); // writing data from row1
-            new FileOutputStream(tmpFile2).write(out2.getData()); // writing data from row2
-
-            MappedFileDataInput in1 = new MappedFileDataInput(new FileInputStream(tmpFile1), tmpFile1.getAbsolutePath(), 0, 0);
-            MappedFileDataInput in2 = new MappedFileDataInput(new FileInputStream(tmpFile2), tmpFile2.getAbsolutePath(), 0, 0);
-
-            // row key
-            assertEquals(ByteBufferUtil.readWithShortLength(in1), ByteBufferUtil.readWithShortLength(in2));
-
-            // cf metadata
-            ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create(cfs.metadata);
-            ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create(cfs.metadata);
-            cf1.delete(DeletionTime.serializer.deserialize(in1));
-            cf2.delete(DeletionTime.serializer.deserialize(in2));
-            assertEquals(cf1.deletionInfo(), cf2.deletionInfo());
-            // columns
-            while (true)
-            {
-                Column c1 = (Column)Column.onDiskSerializer().deserializeFromSSTable(in1, Descriptor.Version.CURRENT);
-                Column c2 = (Column)Column.onDiskSerializer().deserializeFromSSTable(in2, Descriptor.Version.CURRENT);
-                assert Objects.equal(c1, c2) : c1.getString(cfs.metadata.comparator) + " != " + c2.getString(cfs.metadata.comparator);
-                if (c1 == null)
-                    break;
-            }
-            // that should be everything
-            assert in1.available() == 0;
-            assert in2.available() == 0;
-        }
-    }
-
-    private void assertDigest(ColumnFamilyStore cfs, int gcBefore) throws NoSuchAlgorithmException
-    {
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
-        Collection<SSTableReader> sstables = cfs.getSSTables();
-        AbstractCompactionIterable ci1 = new CompactionIterable(OperationType.UNKNOWN, strategy.getScanners(sstables), new PreCompactingController(cfs, sstables, gcBefore));
-        AbstractCompactionIterable ci2 = new CompactionIterable(OperationType.UNKNOWN, strategy.getScanners(sstables), new LazilyCompactingController(cfs, sstables, gcBefore));
-        CloseableIterator<AbstractCompactedRow> iter1 = ci1.iterator();
-        CloseableIterator<AbstractCompactedRow> iter2 = ci2.iterator();
-
-        while (true)
-        {
-            if (!iter1.hasNext())
-            {
-                assert !iter2.hasNext();
-                break;
-            }
-
-            AbstractCompactedRow row1 = iter1.next();
-            AbstractCompactedRow row2 = iter2.next();
-            MessageDigest digest1 = MessageDigest.getInstance("MD5");
-            MessageDigest digest2 = MessageDigest.getInstance("MD5");
-
-            row1.update(digest1);
-            row2.update(digest2);
-
-            assert MessageDigest.isEqual(digest1.digest(), digest2.digest());
-        }
-    }
-
-    @Test
-    public void testOneRow() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Standard1", ByteBufferUtil.bytes("c"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testOneRowTwoColumns() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Standard1", ByteBufferUtil.bytes("c"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.add("Standard1", ByteBufferUtil.bytes("d"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testOneRowManyColumns() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        ByteBuffer key = ByteBuffer.wrap("k".getBytes());
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        for (int i = 0; i < 1000; i++)
-            rm.add("Standard1", ByteBufferUtil.bytes(i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.apply();
-        DataOutputBuffer out = new DataOutputBuffer();
-        RowMutation.serializer.serialize(rm, out, MessagingService.current_version);
-        assert out.getLength() > DatabaseDescriptor.getColumnIndexSize();
-        cfs.forceBlockingFlush();
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testTwoRows() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Standard1", ByteBufferUtil.bytes("c"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testTwoRowsTwoColumns() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        ByteBuffer key = ByteBufferUtil.bytes("k");
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Standard1", ByteBufferUtil.bytes("c"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.add("Standard1", ByteBufferUtil.bytes("d"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testManyRows() throws IOException, NoSuchAlgorithmException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-
-        Keyspace keyspace = Keyspace.open("Keyspace1");
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
-
-        final int ROWS_PER_SSTABLE = 10;
-        for (int j = 0; j < (cfs.metadata.getIndexInterval() * 3) / ROWS_PER_SSTABLE; j++)
-        {
-            for (int i = 0; i < ROWS_PER_SSTABLE; i++)
-            {
-                ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(i % 2));
-                RowMutation rm = new RowMutation("Keyspace1", key);
-                rm.add("Standard1", ByteBufferUtil.bytes(String.valueOf(i / 2)), ByteBufferUtil.EMPTY_BYTE_BUFFER, j * ROWS_PER_SSTABLE + i);
-                rm.apply();
-            }
-            cfs.forceBlockingFlush();
-        }
-
-        assertBytes(cfs, Integer.MAX_VALUE);
-        assertDigest(cfs, Integer.MAX_VALUE);
-    }
-
-    private static class LazilyCompactingController extends CompactionController
-    {
-        public LazilyCompactingController(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, int gcBefore)
-        {
-            super(cfs, new HashSet<SSTableReader>(sstables), gcBefore);
-        }
-
-        @Override
-        public AbstractCompactedRow getCompactedRow(List<SSTableIdentityIterator> rows)
-        {
-            return new LazilyCompactedRow(this, rows);
-        }
-    }
-
-    private static class PreCompactingController extends CompactionController
-    {
-        public PreCompactingController(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, int gcBefore)
-        {
-            super(cfs, new HashSet<SSTableReader>(sstables), gcBefore);
-        }
-
-        @Override
-        public AbstractCompactedRow getCompactedRow(List<SSTableIdentityIterator> rows)
-        {
-            return new PrecompactedRow(this, rows);
-        }
-    }
-}
diff --git a/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java
index 3c9dfe5..900abd8 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java
@@ -18,17 +18,22 @@
  */
 package org.apache.cassandra.io.compress;
 
-import java.io.*;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.util.Collections;
 import java.util.Random;
 
 import org.junit.Test;
 
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
-import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.util.FileMark;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SequentialWriter;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
@@ -58,8 +63,8 @@
         try
         {
 
-            SSTableMetadata.Collector sstableMetadataCollector = SSTableMetadata.createCollector(BytesType.instance).replayPosition(null);
-            CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", false, new CompressionParameters(SnappyCompressor.instance, 32, Collections.<String, String>emptyMap()), sstableMetadataCollector);
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
+            CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", new CompressionParameters(SnappyCompressor.instance, 32, Collections.<String, String>emptyMap()), sstableMetadataCollector);
 
             for (int i = 0; i < 20; i++)
                 writer.write("x".getBytes());
@@ -97,10 +102,10 @@
 
         try
         {
-            SSTableMetadata.Collector sstableMetadataCollector = SSTableMetadata.createCollector(BytesType.instance).replayPosition(null);
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance)).replayPosition(null);
             SequentialWriter writer = compressed
-                ? new CompressedSequentialWriter(f, filename + ".metadata", false, new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector)
-                : new SequentialWriter(f, CompressionParameters.DEFAULT_CHUNK_LENGTH, false);
+                ? new CompressedSequentialWriter(f, filename + ".metadata", new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector)
+                : new SequentialWriter(f, CompressionParameters.DEFAULT_CHUNK_LENGTH);
 
             writer.write("The quick ".getBytes());
             FileMark mark = writer.mark();
@@ -148,8 +153,8 @@
         File metadata = new File(file.getPath() + ".meta");
         metadata.deleteOnExit();
 
-        SSTableMetadata.Collector sstableMetadataCollector = SSTableMetadata.createCollector(BytesType.instance).replayPosition(null);
-        SequentialWriter writer = new CompressedSequentialWriter(file, metadata.getPath(), false, new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector);
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance)).replayPosition(null);
+        SequentialWriter writer = new CompressedSequentialWriter(file, metadata.getPath(), new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector);
 
         writer.write(CONTENT.getBytes());
         writer.close();
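
The writer setup above reflects the metadata refactor: SSTableMetadata.Collector is replaced by MetadataCollector, which takes a CellNameType instead of a raw comparator, and CompressedSequentialWriter drops its third boolean argument. A minimal sketch, assuming the Cassandra 2.1 test classpath and placeholder file names; WriterSketch is an illustrative name.

import java.io.File;

import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.io.compress.CompressedSequentialWriter;
import org.apache.cassandra.io.compress.CompressionParameters;
import org.apache.cassandra.io.compress.SnappyCompressor;
import org.apache.cassandra.io.sstable.metadata.MetadataCollector;

public class WriterSketch
{
    static CompressedSequentialWriter open(File data, String metadataPath)
    {
        // MetadataCollector takes a CellNameType (here a dense wrapper around
        // BytesType) where the old collector took the comparator directly.
        MetadataCollector collector =
            new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance)).replayPosition(null);
        return new CompressedSequentialWriter(data,
                                              metadataPath,
                                              new CompressionParameters(SnappyCompressor.instance),
                                              collector);
    }
}
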
diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
index de814e1..5a09586 100644
--- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
@@ -27,27 +27,27 @@
 import com.google.common.io.Files;
 import org.junit.BeforeClass;
 import org.junit.Test;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
 public class CQLSSTableWriterTest
 {
     @BeforeClass
     public static void setup() throws Exception
     {
+        Keyspace.setInitialized();
         StorageService.instance.initServer();
     }
 
@@ -96,7 +96,7 @@
 
         loader.stream().get();
 
-        UntypedResultSet rs = QueryProcessor.processInternal("SELECT * FROM cql_keyspace.table1;");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM cql_keyspace.table1;");
         assertEquals(4, rs.size());
 
         Iterator<UntypedResultSet.Row> iter = rs.iterator();
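
Two small API points from the hunk above: standalone tests now call Keyspace.setInitialized() before StorageService.instance.initServer(), and internal CQL queries use QueryProcessor.executeInternal() rather than processInternal(). A hedged sketch under the same classpath assumption, reading the cql_keyspace.table1 table the test creates; InternalQuerySketch is an illustrative name.

import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.service.StorageService;

public class InternalQuerySketch
{
    static UntypedResultSet readAll() throws Exception
    {
        Keyspace.setInitialized();              // now required before initServer()
        StorageService.instance.initServer();
        return QueryProcessor.executeInternal("SELECT * FROM cql_keyspace.table1;");
    }
}
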
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
index 53a79f8..59ef4c4 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
@@ -25,41 +25,46 @@
 
 import org.junit.Test;
 
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.marshal.IntegerType;
 import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
 public class IndexHelperTest
 {
+    private static CellName cn(long l)
+    {
+        return Util.cellname(l);
+    }
+
     @Test
     public void testIndexHelper()
     {
         List<IndexInfo> indexes = new ArrayList<IndexInfo>();
-        indexes.add(new IndexInfo(bytes(0L), bytes(5L), 0, 0));
-        indexes.add(new IndexInfo(bytes(10L), bytes(15L), 0, 0));
-        indexes.add(new IndexInfo(bytes(20L), bytes(25L), 0, 0));
+        indexes.add(new IndexInfo(cn(0L), cn(5L), 0, 0));
+        indexes.add(new IndexInfo(cn(10L), cn(15L), 0, 0));
+        indexes.add(new IndexInfo(cn(20L), cn(25L), 0, 0));
 
-        AbstractType comp = IntegerType.instance;
+        CellNameType comp = new SimpleDenseCellNameType(IntegerType.instance);
 
-        assertEquals(0, IndexHelper.indexFor(bytes(-1L), indexes, comp, false, -1));
-        assertEquals(0, IndexHelper.indexFor(bytes(5L), indexes, comp, false, -1));
-        assertEquals(1, IndexHelper.indexFor(bytes(12L), indexes, comp, false, -1));
-        assertEquals(2, IndexHelper.indexFor(bytes(17L), indexes, comp, false, -1));
-        assertEquals(3, IndexHelper.indexFor(bytes(100L), indexes, comp, false, -1));
-        assertEquals(3, IndexHelper.indexFor(bytes(100L), indexes, comp, false, 0));
-        assertEquals(3, IndexHelper.indexFor(bytes(100L), indexes, comp, false, 1));
-        assertEquals(3, IndexHelper.indexFor(bytes(100L), indexes, comp, false, 2));
-        assertEquals(-1, IndexHelper.indexFor(bytes(100L), indexes, comp, false, 3));
+        assertEquals(0, IndexHelper.indexFor(cn(-1L), indexes, comp, false, -1));
+        assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, false, -1));
+        assertEquals(1, IndexHelper.indexFor(cn(12L), indexes, comp, false, -1));
+        assertEquals(2, IndexHelper.indexFor(cn(17L), indexes, comp, false, -1));
+        assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, -1));
+        assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 0));
+        assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 1));
+        assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 2));
+        assertEquals(-1, IndexHelper.indexFor(cn(100L), indexes, comp, false, 3));
 
-        assertEquals(-1, IndexHelper.indexFor(bytes(-1L), indexes, comp, true, -1));
-        assertEquals(0, IndexHelper.indexFor(bytes(5L), indexes, comp, true, -1));
-        assertEquals(1, IndexHelper.indexFor(bytes(17L), indexes, comp, true, -1));
-        assertEquals(2, IndexHelper.indexFor(bytes(100L), indexes, comp, true, -1));
-        assertEquals(0, IndexHelper.indexFor(bytes(100L), indexes, comp, true, 0));
-        assertEquals(1, IndexHelper.indexFor(bytes(12L), indexes, comp, true, -1));
-        assertEquals(1, IndexHelper.indexFor(bytes(100L), indexes, comp, true, 1));
-        assertEquals(2, IndexHelper.indexFor(bytes(100L), indexes, comp, true, 2));
-        assertEquals(-1, IndexHelper.indexFor(bytes(100L), indexes, comp, true, 4));
+        assertEquals(-1, IndexHelper.indexFor(cn(-1L), indexes, comp, true, -1));
+        assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, true, -1));
+        assertEquals(1, IndexHelper.indexFor(cn(17L), indexes, comp, true, -1));
+        assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, -1));
+        assertEquals(0, IndexHelper.indexFor(cn(100L), indexes, comp, true, 0));
+        assertEquals(1, IndexHelper.indexFor(cn(12L), indexes, comp, true, -1));
+        assertEquals(1, IndexHelper.indexFor(cn(100L), indexes, comp, true, 1));
+        assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, 2));
+        assertEquals(-1, IndexHelper.indexFor(cn(100L), indexes, comp, true, 4));
     }
 }
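
The IndexHelper test above shows the same naming change at the index level: IndexInfo bounds are CellName values and the comparator is a CellNameType, so names are built with Util.cellname(...) instead of raw ByteBuffers. A minimal sketch under the same classpath assumption; IndexLookupSketch is an illustrative name.

import java.util.ArrayList;
import java.util.List;

import org.apache.cassandra.Util;
import org.apache.cassandra.db.composites.CellNameType;
import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
import org.apache.cassandra.db.marshal.IntegerType;
import org.apache.cassandra.io.sstable.IndexHelper;
import org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;

public class IndexLookupSketch
{
    static int lookup(long name)
    {
        List<IndexInfo> indexes = new ArrayList<>();
        indexes.add(new IndexInfo(Util.cellname(0L), Util.cellname(5L), 0, 0));
        indexes.add(new IndexInfo(Util.cellname(10L), Util.cellname(15L), 0, 0));

        // The comparator is a CellNameType wrapping the value type.
        CellNameType comp = new SimpleDenseCellNameType(IntegerType.instance);
        // forward scan (reversed = false), no starting index hint (-1)
        return IndexHelper.indexFor(Util.cellname(name), indexes, comp, false, -1);
    }
}
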
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
new file mode 100644
index 0000000..5281449
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
@@ -0,0 +1,494 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.metrics.RestorableMeter;
+
+import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
+import static org.apache.cassandra.io.sstable.IndexSummaryManager.DOWNSAMPLE_THESHOLD;
+import static org.apache.cassandra.io.sstable.IndexSummaryManager.UPSAMPLE_THRESHOLD;
+import static org.apache.cassandra.io.sstable.IndexSummaryManager.redistributeSummaries;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class IndexSummaryManagerTest extends SchemaLoader
+{
+    private static final Logger logger = LoggerFactory.getLogger(IndexSummaryManagerTest.class);
+
+    int originalMinIndexInterval;
+    int originalMaxIndexInterval;
+    long originalCapacity;
+
+    @Before
+    public void beforeTest()
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        originalMinIndexInterval = cfs.metadata.getMinIndexInterval();
+        originalMaxIndexInterval = cfs.metadata.getMaxIndexInterval();
+        originalCapacity = IndexSummaryManager.instance.getMemoryPoolCapacityInMB();
+    }
+
+    @After
+    public void afterTest()
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        cfs.metadata.minIndexInterval(originalMinIndexInterval);
+        cfs.metadata.maxIndexInterval(originalMaxIndexInterval);
+        IndexSummaryManager.instance.setMemoryPoolCapacityInMB(originalCapacity);
+    }
+
+    private static long totalOffHeapSize(List<SSTableReader> sstables)
+    {
+        long total = 0;
+        for (SSTableReader sstable : sstables)
+            total += sstable.getIndexSummaryOffHeapSize();
+
+        return total;
+    }
+
+    private static List<SSTableReader> resetSummaries(List<SSTableReader> sstables, long originalOffHeapSize) throws IOException
+    {
+        for (SSTableReader sstable : sstables)
+            sstable.readMeter = new RestorableMeter(100.0, 100.0);
+
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, originalOffHeapSize * sstables.size());
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL, sstable.getIndexSummarySamplingLevel());
+
+        return sstables;
+    }
+
+    private void validateData(ColumnFamilyStore cfs, int numRows)
+    {
+        for (int i = 0; i < numRows; i++)
+        {
+            DecoratedKey key = Util.dk(String.format("%3d", i));
+            QueryFilter filter = QueryFilter.getIdentityFilter(key, cfs.getColumnFamilyName(), System.currentTimeMillis());
+            ColumnFamily row = cfs.getColumnFamily(filter);
+            assertNotNull(row);
+            Cell cell = row.getColumn(Util.cellname("column"));
+            assertNotNull(cell);
+            assertEquals(100, cell.value().array().length);
+        }
+    }
+
+    private Comparator<SSTableReader> hotnessComparator = new Comparator<SSTableReader>()
+    {
+        public int compare(SSTableReader o1, SSTableReader o2)
+        {
+            return Double.compare(o1.readMeter.fifteenMinuteRate(), o2.readMeter.fifteenMinuteRate());
+        }
+    };
+
+    private void createSSTables(String ksname, String cfname, int numSSTables, int numRows)
+    {
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ArrayList<Future> futures = new ArrayList<>(numSSTables);
+        ByteBuffer value = ByteBuffer.wrap(new byte[100]);
+        for (int sstable = 0; sstable < numSSTables; sstable++)
+        {
+            for (int row = 0; row < numRows; row++)
+            {
+                DecoratedKey key = Util.dk(String.format("%3d", row));
+                Mutation rm = new Mutation(ksname, key.getKey());
+                rm.add(cfname, Util.cellname("column"), value, 0);
+                rm.applyUnsafe();
+            }
+            futures.add(cfs.forceFlush());
+        }
+        for (Future future : futures)
+        {
+            try
+            {
+                future.get();
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+            catch (ExecutionException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        assertEquals(numSSTables, cfs.getSSTables().size());
+        validateData(cfs, numRows);
+    }
+
+    @Test
+    public void testChangeMinIndexInterval() throws IOException
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        int numSSTables = 1;
+        int numRows = 256;
+        createSSTables(ksname, cfname, numSSTables, numRows);
+
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        for (SSTableReader sstable : sstables)
+            sstable.readMeter = new RestorableMeter(100.0, 100.0);
+
+        for (SSTableReader sstable : sstables)
+            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+
+        // double the min_index_interval
+        cfs.metadata.minIndexInterval(originalMinIndexInterval * 2);
+        IndexSummaryManager.instance.redistributeSummaries();
+        for (SSTableReader sstable : cfs.getSSTables())
+        {
+            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(numRows / cfs.metadata.getMinIndexInterval(), sstable.getIndexSummarySize());
+        }
+
+        // return min_index_interval to its original value
+        cfs.metadata.minIndexInterval(originalMinIndexInterval);
+        IndexSummaryManager.instance.redistributeSummaries();
+        for (SSTableReader sstable : cfs.getSSTables())
+        {
+            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(numRows / cfs.metadata.getMinIndexInterval(), sstable.getIndexSummarySize());
+        }
+
+        // halve the min_index_interval, but constrain the available space to exactly what we have now; as a result,
+        // the summary shouldn't change
+        cfs.metadata.minIndexInterval(originalMinIndexInterval / 2);
+        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        long summarySpace = sstable.getIndexSummaryOffHeapSize();
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, Arrays.asList(sstable), summarySpace);
+        sstable = cfs.getSSTables().iterator().next();
+        assertEquals(originalMinIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
+        assertEquals(numRows / originalMinIndexInterval, sstable.getIndexSummarySize());
+
+        // keep the min_index_interval the same, but now give the summary enough space to grow by 50%
+        double previousInterval = sstable.getEffectiveIndexInterval();
+        int previousSize = sstable.getIndexSummarySize();
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, Arrays.asList(sstable), (long) Math.ceil(summarySpace * 1.5));
+        sstable = cfs.getSSTables().iterator().next();
+        assertEquals(previousSize * 1.5, (double) sstable.getIndexSummarySize(), 1);
+        assertEquals(previousInterval * (1.0 / 1.5), sstable.getEffectiveIndexInterval(), 0.001);
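+        // (summary size and effective interval are inversely related: 1.5x as many entries means each
+        // entry now covers 1/1.5 of the keys it did before, which is exactly what the two asserts check.)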
+
+        // return min_index_interval to its original value (double it), but only give the summary enough space
+        // to have an effective index interval of twice the new min
+        cfs.metadata.minIndexInterval(originalMinIndexInterval);
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, Arrays.asList(sstable), (long) Math.ceil(summarySpace / 2.0));
+        sstable = cfs.getSSTables().iterator().next();
+        assertEquals(originalMinIndexInterval * 2, sstable.getEffectiveIndexInterval(), 0.001);
+        assertEquals(numRows / (originalMinIndexInterval * 2), sstable.getIndexSummarySize());
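+        // (in the same concrete numbers: summarySpace / 2 fits roughly half of the 32 original entries,
+        // so the summary is rebuilt with 16 entries and the effective interval becomes 256 / 16 = 16,
+        // i.e. twice the restored min_index_interval of 8.)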
+
+        // raise the min_index_interval above our current effective interval, but set the max_index_interval lower
+        // than what we actually have space for (meaning the index summary would ideally be smaller, but this would
+        // result in an effective interval above the new max)
+        cfs.metadata.minIndexInterval(originalMinIndexInterval * 4);
+        cfs.metadata.maxIndexInterval(originalMinIndexInterval * 4);
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, Arrays.asList(sstable), 10);
+        sstable = cfs.getSSTables().iterator().next();
+        assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
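+        // (with only 10 bytes available the summary would ideally shrink much further, but max_index_interval
+        // caps the effective interval, so it is pinned at the new min == max value instead.)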
+    }
+
+    @Test
+    public void testChangeMaxIndexInterval() throws IOException
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        int numSSTables = 1;
+        int numRows = 256;
+        createSSTables(ksname, cfname, numSSTables, numRows);
+
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        for (SSTableReader sstable : sstables)
+            sstable.readMeter = new RestorableMeter(100.0, 100.0);
+
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, sstables, 1);
+        sstables = new ArrayList<>(cfs.getSSTables());
+        for (SSTableReader sstable : sstables)
+            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
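+        // (giving the pool a single byte forces maximum downsampling, so every sstable lands exactly at
+        // max_index_interval; the steps below then halve and restore that ceiling.)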
+
+        // halve the max_index_interval
+        cfs.metadata.maxIndexInterval(cfs.metadata.getMaxIndexInterval() / 2);
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, sstables, 1);
+        sstables = new ArrayList<>(cfs.getSSTables());
+        for (SSTableReader sstable : sstables)
+        {
+            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
+            assertEquals(numRows / cfs.metadata.getMaxIndexInterval(), sstable.getIndexSummarySize());
+        }
+
+        // return max_index_interval to its original value
+        cfs.metadata.maxIndexInterval(cfs.metadata.getMaxIndexInterval() * 2);
+        IndexSummaryManager.redistributeSummaries(Collections.EMPTY_LIST, sstables, 1);
+        for (SSTableReader sstable : cfs.getSSTables())
+        {
+            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
+            assertEquals(numRows / cfs.metadata.getMaxIndexInterval(), sstable.getIndexSummarySize());
+        }
+    }
+
+    @Test(timeout = 10000)
+    public void testRedistributeSummaries() throws IOException
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        int numSSTables = 4;
+        int numRows = 256;
+        createSSTables(ksname, cfname, numSSTables, numRows);
+
+        int minSamplingLevel = (BASE_SAMPLING_LEVEL * cfs.metadata.getMinIndexInterval()) / cfs.metadata.getMaxIndexInterval();
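+        // (the lowest sampling level whose effective interval, minIndexInterval * BASE_SAMPLING_LEVEL / level,
+        // still fits within maxIndexInterval)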
+
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        for (SSTableReader sstable : sstables)
+            sstable.readMeter = new RestorableMeter(100.0, 100.0);
+
+        long singleSummaryOffHeapSpace = sstables.get(0).getIndexSummaryOffHeapSize();
+
+        // there should be enough space to not downsample anything
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * numSSTables));
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL, sstable.getIndexSummarySamplingLevel());
+        assertEquals(singleSummaryOffHeapSpace * numSSTables, totalOffHeapSize(sstables));
+        validateData(cfs, numRows);
+
+        // everything should get cut in half
+        assert sstables.size() == 4;
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * (numSSTables / 2)));
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL / 2, sstable.getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // everything should get cut to a quarter
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * (numSSTables / 4)));
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL / 4, sstable.getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // upsample back up to half
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * (numSSTables / 2) + 4));
+        assert sstables.size() == 4;
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL / 2, sstable.getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // upsample back up to the original index summary
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * numSSTables));
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL, sstable.getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // make two of the four sstables cold, only leave enough space for three full index summaries,
+        // so the two cold sstables should get downsampled to be half of their original size
+        sstables.get(0).readMeter = new RestorableMeter(50.0, 50.0);
+        sstables.get(1).readMeter = new RestorableMeter(50.0, 50.0);
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * 3));
+        Collections.sort(sstables, hotnessComparator);
+        assertEquals(BASE_SAMPLING_LEVEL / 2, sstables.get(0).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL / 2, sstables.get(1).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(2).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(3).getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // small increases or decreases in the read rate don't result in downsampling or upsampling
+        double lowerRate = 50.0 * (DOWNSAMPLE_THESHOLD + (DOWNSAMPLE_THESHOLD * 0.10));
+        double higherRate = 50.0 * (UPSAMPLE_THRESHOLD - (UPSAMPLE_THRESHOLD * 0.10));
+        sstables.get(0).readMeter = new RestorableMeter(lowerRate, lowerRate);
+        sstables.get(1).readMeter = new RestorableMeter(higherRate, higherRate);
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * 3));
+        Collections.sort(sstables, hotnessComparator);
+        assertEquals(BASE_SAMPLING_LEVEL / 2, sstables.get(0).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL / 2, sstables.get(1).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(2).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(3).getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // reset, and then this time, leave enough space for one of the cold sstables to not get downsampled
+        sstables = resetSummaries(sstables, singleSummaryOffHeapSpace);
+        sstables.get(0).readMeter = new RestorableMeter(1.0, 1.0);
+        sstables.get(1).readMeter = new RestorableMeter(2.0, 2.0);
+        sstables.get(2).readMeter = new RestorableMeter(1000.0, 1000.0);
+        sstables.get(3).readMeter = new RestorableMeter(1000.0, 1000.0);
+
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (singleSummaryOffHeapSpace * 3) + 50);
+        Collections.sort(sstables, hotnessComparator);
+
+        if (sstables.get(0).getIndexSummarySamplingLevel() == minSamplingLevel)
+            assertEquals(BASE_SAMPLING_LEVEL, sstables.get(1).getIndexSummarySamplingLevel());
+        else
+            assertEquals(BASE_SAMPLING_LEVEL, sstables.get(0).getIndexSummarySamplingLevel());
+
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(2).getIndexSummarySamplingLevel());
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(3).getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // Cause a mix of upsampling and downsampling. We'll leave enough space for two full index summaries. The two
+        // coldest sstables will get downsampled to 4/128 of their size, leaving us with 1 and 92/128th index
+        // summaries worth of space.  The hottest sstable should get a full index summary, and the one in the middle
+        // should get the remainder.
+        sstables.get(0).readMeter = new RestorableMeter(0.0, 0.0);
+        sstables.get(1).readMeter = new RestorableMeter(0.0, 0.0);
+        sstables.get(2).readMeter = new RestorableMeter(92, 92);
+        sstables.get(3).readMeter = new RestorableMeter(128.0, 128.0);
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, (long) (singleSummaryOffHeapSpace + (singleSummaryOffHeapSpace * (92.0 / BASE_SAMPLING_LEVEL))));
+        Collections.sort(sstables, hotnessComparator);
+        assertEquals(1, sstables.get(0).getIndexSummarySize());  // at the min sampling level
+        assertEquals(1, sstables.get(1).getIndexSummarySize());  // at the min sampling level
+        assertTrue(sstables.get(2).getIndexSummarySamplingLevel() > minSamplingLevel);
+        assertTrue(sstables.get(2).getIndexSummarySamplingLevel() < BASE_SAMPLING_LEVEL);
+        assertEquals(BASE_SAMPLING_LEVEL, sstables.get(3).getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+
+        // Don't leave enough space for even the minimal index summaries
+        sstables = redistributeSummaries(Collections.EMPTY_LIST, sstables, 10);
+        for (SSTableReader sstable : sstables)
+            assertEquals(1, sstable.getIndexSummarySize());  // at the min sampling level
+        validateData(cfs, numRows);
+    }
+
+    @Test
+    public void testRebuildAtSamplingLevel() throws IOException
+    {
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval";
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ByteBuffer value = ByteBuffer.wrap(new byte[100]);
+
+        int numRows = 256;
+        for (int row = 0; row < numRows; row++)
+        {
+            DecoratedKey key = Util.dk(String.valueOf(row));
+            Mutation rm = new Mutation(ksname, key.getKey());
+            rm.add(cfname, Util.cellname("column"), value, 0);
+            rm.apply();
+        }
+        cfs.forceBlockingFlush();
+
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        assertEquals(1, sstables.size());
+        SSTableReader original = sstables.get(0);
+
+        SSTableReader sstable = original;
+        for (int samplingLevel = 1; samplingLevel < BASE_SAMPLING_LEVEL; samplingLevel++)
+        {
+            sstable = sstable.cloneWithNewSummarySamplingLevel(cfs, samplingLevel);
+            assertEquals(samplingLevel, sstable.getIndexSummarySamplingLevel());
+            int expectedSize = (numRows * samplingLevel) / (sstable.metadata.getMinIndexInterval() * BASE_SAMPLING_LEVEL);
+            assertEquals(expectedSize, sstable.getIndexSummarySize(), 1);
+        }
+
+        // don't leave replaced SSTRs around to break other tests
+        cfs.getDataTracker().replaceReaders(Collections.singleton(original), Collections.singleton(sstable));
+    }
+
+    @Test
+    public void testJMXFunctions() throws IOException
+    {
+        IndexSummaryManager manager = IndexSummaryManager.instance;
+
+        // resize interval
+        manager.setResizeIntervalInMinutes(-1);
+        assertNull(manager.getTimeToNextResize(TimeUnit.MINUTES));
+
+        manager.setResizeIntervalInMinutes(10);
+        assertEquals(10, manager.getResizeIntervalInMinutes());
+        assertEquals(10, manager.getTimeToNextResize(TimeUnit.MINUTES), 1);
+        manager.setResizeIntervalInMinutes(15);
+        assertEquals(15, manager.getResizeIntervalInMinutes());
+        assertEquals(15, manager.getTimeToNextResize(TimeUnit.MINUTES), 2);
+
+        // memory pool capacity
+        assertTrue(manager.getMemoryPoolCapacityInMB() >= 0);
+        manager.setMemoryPoolCapacityInMB(10);
+        assertEquals(10, manager.getMemoryPoolCapacityInMB());
+
+        String ksname = "Keyspace1";
+        String cfname = "StandardLowIndexInterval"; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ByteBuffer value = ByteBuffer.wrap(new byte[100]);
+
+        int numSSTables = 2;
+        int numRows = 10;
+        for (int sstable = 0; sstable < numSSTables; sstable++)
+        {
+            for (int row = 0; row < numRows; row++)
+            {
+                DecoratedKey key = Util.dk(String.valueOf(row));
+                Mutation rm = new Mutation(ksname, key.getKey());
+                rm.add(cfname, Util.cellname("column"), value, 0);
+                rm.apply();
+            }
+            cfs.forceBlockingFlush();
+        }
+
+        assertTrue(manager.getAverageIndexInterval() >= cfs.metadata.getMinIndexInterval());
+        Map<String, Integer> intervals = manager.getIndexIntervals();
+        for (Map.Entry<String, Integer> entry : intervals.entrySet())
+            if (entry.getKey().contains("StandardLowIndexInterval"))
+                assertEquals(cfs.metadata.getMinIndexInterval(), entry.getValue(), 0.001);
+
+        manager.setMemoryPoolCapacityInMB(0);
+        manager.redistributeSummaries();
+        assertTrue(manager.getAverageIndexInterval() > cfs.metadata.getMinIndexInterval());
+        intervals = manager.getIndexIntervals();
+        for (Map.Entry<String, Integer> entry : intervals.entrySet())
+        {
+            if (entry.getKey().contains("StandardLowIndexInterval"))
+                assertTrue(entry.getValue() >= cfs.metadata.getMinIndexInterval());
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java
index 8e73161..9aca66d 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java
@@ -18,28 +18,28 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-import java.util.UUID;
+import java.util.*;
 
 import com.google.common.collect.Lists;
 import org.junit.Test;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
+import static org.apache.cassandra.io.sstable.IndexSummaryBuilder.downsample;
+import static org.apache.cassandra.io.sstable.IndexSummaryBuilder.entriesAtSamplingLevel;
+import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
+
+import static org.junit.Assert.*;
 
 public class IndexSummaryTest
 {
@@ -48,7 +48,7 @@
     {
         Pair<List<DecoratedKey>, IndexSummary> random = generateRandomIndex(100, 1);
         for (int i = 0; i < 100; i++)
-            assertEquals(random.left.get(i).key, ByteBuffer.wrap(random.right.getKey(i)));
+            assertEquals(random.left.get(i).getKey(), ByteBuffer.wrap(random.right.getKey(i)));
     }
 
     @Test
@@ -71,15 +71,14 @@
     public void testSerialization() throws IOException
     {
         Pair<List<DecoratedKey>, IndexSummary> random = generateRandomIndex(100, 1);
-        ByteArrayOutputStream aos = new ByteArrayOutputStream();
-        DataOutputStream dos = new DataOutputStream(aos);
-        IndexSummary.serializer.serialize(random.right, dos);
+        DataOutputBuffer dos = new DataOutputBuffer();
+        IndexSummary.serializer.serialize(random.right, dos, false);
         // write junk
         dos.writeUTF("JUNK");
         dos.writeUTF("JUNK");
         FileUtils.closeQuietly(dos);
-        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(aos.toByteArray()));
-        IndexSummary is = IndexSummary.serializer.deserialize(dis, DatabaseDescriptor.getPartitioner());
+        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dos.toByteArray()));
+        IndexSummary is = IndexSummary.serializer.deserialize(dis, DatabaseDescriptor.getPartitioner(), false, 1, 1);
         for (int i = 0; i < 100; i++)
             assertEquals(i, is.binarySearch(random.left.get(i)));
         // read the junk
@@ -92,18 +91,17 @@
     public void testAddEmptyKey() throws Exception
     {
         IPartitioner p = new RandomPartitioner();
-        IndexSummaryBuilder builder = new IndexSummaryBuilder(1, 1);
+        IndexSummaryBuilder builder = new IndexSummaryBuilder(1, 1, BASE_SAMPLING_LEVEL);
         builder.maybeAddEntry(p.decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER), 0);
         IndexSummary summary = builder.build(p);
         assertEquals(1, summary.size());
         assertEquals(0, summary.getPosition(0));
         assertArrayEquals(new byte[0], summary.getKey(0));
 
-        ByteArrayOutputStream aos = new ByteArrayOutputStream();
-        DataOutputStream dos = new DataOutputStream(aos);
-        IndexSummary.serializer.serialize(summary, dos);
-        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(aos.toByteArray()));
-        IndexSummary loaded = IndexSummary.serializer.deserialize(dis, p);
+        DataOutputBuffer dos = new DataOutputBuffer();
+        IndexSummary.serializer.serialize(summary, dos, false);
+        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dos.toByteArray()));
+        IndexSummary loaded = IndexSummary.serializer.deserialize(dis, p, false, 1, 1);
 
         assertEquals(1, loaded.size());
         assertEquals(summary.getPosition(0), loaded.getPosition(0));
@@ -113,7 +111,7 @@
     private Pair<List<DecoratedKey>, IndexSummary> generateRandomIndex(int size, int interval)
     {
         List<DecoratedKey> list = Lists.newArrayList();
-        IndexSummaryBuilder builder = new IndexSummaryBuilder(list.size(), interval);
+        IndexSummaryBuilder builder = new IndexSummaryBuilder(list.size(), interval, BASE_SAMPLING_LEVEL);
         for (int i = 0; i < size; i++)
         {
             UUID uuid = UUID.randomUUID();
@@ -126,4 +124,128 @@
         IndexSummary summary = builder.build(DatabaseDescriptor.getPartitioner());
         return Pair.create(list, summary);
     }
+
+    @Test
+    public void testDownsamplePatterns()
+    {
+        assertEquals(Arrays.asList(0), Downsampling.getSamplingPattern(0));
+        assertEquals(Arrays.asList(0), Downsampling.getSamplingPattern(1));
+
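+        // the pattern lists, in order, which offsets get dropped at each successive downsampling round
+        // (testDownsample below consumes it that way via shouldSkip()); e.g. at a granularity of 8 the
+        // entries at offset 7 go first, then 3, then 5, ..., so dropped entries stay spread out rather
+        // than clustered.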
+        assertEquals(Arrays.asList(1, 0), Downsampling.getSamplingPattern(2));
+        assertEquals(Arrays.asList(3, 1, 2, 0), Downsampling.getSamplingPattern(4));
+        assertEquals(Arrays.asList(7, 3, 5, 1, 6, 2, 4, 0), Downsampling.getSamplingPattern(8));
+        assertEquals(Arrays.asList(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0), Downsampling.getSamplingPattern(16));
+    }
+
+    private static boolean shouldSkip(int index, List<Integer> startPoints)
+    {
+        for (int start : startPoints)
+        {
+            if ((index - start) % BASE_SAMPLING_LEVEL == 0)
+                return true;
+        }
+        return false;
+    }
+
+    @Test
+    public void testDownsample()
+    {
+        final int NUM_KEYS = 4096;
+        final int INDEX_INTERVAL = 128;
+        final int ORIGINAL_NUM_ENTRIES = NUM_KEYS / INDEX_INTERVAL;
+
+        Pair<List<DecoratedKey>, IndexSummary> random = generateRandomIndex(NUM_KEYS, INDEX_INTERVAL);
+        List<DecoratedKey> keys = random.left;
+        IndexSummary original = random.right;
+
+        // sanity check on the original index summary
+        for (int i = 0; i < ORIGINAL_NUM_ENTRIES; i++)
+            assertEquals(keys.get(i * INDEX_INTERVAL).getKey(), ByteBuffer.wrap(original.getKey(i)));
+
+        List<Integer> samplePattern = Downsampling.getSamplingPattern(BASE_SAMPLING_LEVEL);
+
+        // downsample by one level, then two levels, then three levels...
+        int downsamplingRound = 1;
+        for (int samplingLevel = BASE_SAMPLING_LEVEL - 1; samplingLevel >= 1; samplingLevel--)
+        {
+            IndexSummary downsampled = downsample(original, samplingLevel, 128, DatabaseDescriptor.getPartitioner());
+            assertEquals(entriesAtSamplingLevel(samplingLevel, original.getMaxNumberOfEntries()), downsampled.size());
+
+            int sampledCount = 0;
+            List<Integer> skipStartPoints = samplePattern.subList(0, downsamplingRound);
+            for (int i = 0; i < ORIGINAL_NUM_ENTRIES; i++)
+            {
+                if (!shouldSkip(i, skipStartPoints))
+                {
+                    assertEquals(keys.get(i * INDEX_INTERVAL).getKey(), ByteBuffer.wrap(downsampled.getKey(sampledCount)));
+                    sampledCount++;
+                }
+            }
+            downsamplingRound++;
+        }
+
+        // downsample one level each time
+        IndexSummary previous = original;
+        downsamplingRound = 1;
+        for (int downsampleLevel = BASE_SAMPLING_LEVEL - 1; downsampleLevel >= 1; downsampleLevel--)
+        {
+            IndexSummary downsampled = downsample(previous, downsampleLevel, 128, DatabaseDescriptor.getPartitioner());
+            assertEquals(entriesAtSamplingLevel(downsampleLevel, original.getMaxNumberOfEntries()), downsampled.size());
+
+            int sampledCount = 0;
+            List<Integer> skipStartPoints = samplePattern.subList(0, downsamplingRound);
+            for (int i = 0; i < ORIGINAL_NUM_ENTRIES; i++)
+            {
+                if (!shouldSkip(i, skipStartPoints))
+                {
+                    assertEquals(keys.get(i * INDEX_INTERVAL).getKey(), ByteBuffer.wrap(downsampled.getKey(sampledCount)));
+                    sampledCount++;
+                }
+            }
+
+            previous = downsampled;
+            downsamplingRound++;
+        }
+    }
+
+    @Test
+    public void testOriginalIndexLookup()
+    {
+        for (int i = BASE_SAMPLING_LEVEL; i >= 1; i--)
+            assertEquals(i, Downsampling.getOriginalIndexes(i).size());
+
+        ArrayList<Integer> full = new ArrayList<>();
+        for (int i = 0; i < BASE_SAMPLING_LEVEL; i++)
+            full.add(i);
+
+        assertEquals(full, Downsampling.getOriginalIndexes(BASE_SAMPLING_LEVEL));
+        // the entry at index 127 is the first to go
+        assertEquals(full.subList(0, full.size() - 1), Downsampling.getOriginalIndexes(BASE_SAMPLING_LEVEL - 1));
+
+        // spot check a few values (these depend on BASE_SAMPLING_LEVEL being 128)
+        assertEquals(128, BASE_SAMPLING_LEVEL);
+        assertEquals(Arrays.asList(0, 32, 64, 96), Downsampling.getOriginalIndexes(4));
+        assertEquals(Arrays.asList(0, 64), Downsampling.getOriginalIndexes(2));
+        assertEquals(Arrays.asList(), Downsampling.getOriginalIndexes(0));
+    }
+
+    @Test
+    public void testGetNumberOfSkippedEntriesAfterIndex()
+    {
+        int indexInterval = 128;
+        for (int i = 0; i < BASE_SAMPLING_LEVEL; i++)
+            assertEquals(indexInterval, Downsampling.getEffectiveIndexIntervalAfterIndex(i, BASE_SAMPLING_LEVEL, indexInterval));
+
+        // with one round of downsampling, only the last summary entry has been removed, so only the last index will have
+        // double the gap until the next sample
+        for (int i = 0; i < BASE_SAMPLING_LEVEL - 2; i++)
+            assertEquals(indexInterval, Downsampling.getEffectiveIndexIntervalAfterIndex(i, BASE_SAMPLING_LEVEL - 1, indexInterval));
+        assertEquals(indexInterval * 2, Downsampling.getEffectiveIndexIntervalAfterIndex(BASE_SAMPLING_LEVEL - 2, BASE_SAMPLING_LEVEL - 1, indexInterval));
+
+        // at samplingLevel=2, the retained summary points are [0, 64] (assumes BASE_SAMPLING_LEVEL is 128)
+        assertEquals(128, BASE_SAMPLING_LEVEL);
+        assertEquals(64 * indexInterval, Downsampling.getEffectiveIndexIntervalAfterIndex(0, 2, indexInterval));
+        assertEquals(64 * indexInterval, Downsampling.getEffectiveIndexIntervalAfterIndex(1, 2, indexInterval));
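+        // (worked example: only original indexes 0 and 64 remain at samplingLevel=2, so after either
+        // retained entry the next sample is 64 summary slots away, i.e. 64 * 128 = 8192 keys.)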
+    }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
index c4ab8ab..5341a4b 100644
--- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
@@ -20,24 +20,32 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
-import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.columniterator.SSTableNamesIterator;
+import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.StreamPlan;
 import org.apache.cassandra.streaming.StreamSession;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.junit.BeforeClass;
-import org.junit.Test;
+import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Tests backwards compatibility for SSTables
@@ -54,6 +62,7 @@
     @BeforeClass
     public static void beforeClass()
     {
+        Keyspace.setInitialized();
         String scp = System.getProperty(LEGACY_SSTABLE_PROP);
         assert scp != null;
         LEGACY_SSTABLE_ROOT = new File(scp).getAbsoluteFile();
@@ -67,10 +76,10 @@
     /**
      * Get a descriptor for the legacy sstable at the given version.
      */
-    protected Descriptor getDescriptor(String ver) throws IOException
+    protected Descriptor getDescriptor(String ver)
     {
         File directory = new File(LEGACY_SSTABLE_ROOT + File.separator + ver + File.separator + KSNAME);
-        return new Descriptor(ver, directory, KSNAME, CFNAME, 0, false);
+        return new Descriptor(ver, directory, KSNAME, CFNAME, 0, Descriptor.Type.FINAL);
     }
 
     /**
@@ -97,7 +106,7 @@
         StorageService.instance.initServer();
 
         for (File version : LEGACY_SSTABLE_ROOT.listFiles())
-            if (Descriptor.Version.validate(version.getName()))
+            if (Descriptor.Version.validate(version.getName()) && new Descriptor.Version(version.getName()).isCompatible())
                 testStreaming(version.getName());
     }
 
@@ -111,32 +120,41 @@
         ArrayList<StreamSession.SSTableStreamingSections> details = new ArrayList<>();
         details.add(new StreamSession.SSTableStreamingSections(sstable,
                                                                sstable.getPositionsForRanges(ranges),
-                                                               sstable.estimatedKeysForRanges(ranges)));
+                                                               sstable.estimatedKeysForRanges(ranges), sstable.getSSTableMetadata().repairedAt));
         new StreamPlan("LegacyStreamingTest").transferFiles(FBUtilities.getBroadcastAddress(), details)
                                              .execute().get();
-        sstable.close();
 
         ColumnFamilyStore cfs = Keyspace.open(KSNAME).getColumnFamilyStore(CFNAME);
         assert cfs.getSSTables().size() == 1;
         sstable = cfs.getSSTables().iterator().next();
+        CellNameType type = sstable.metadata.comparator;
         for (String keystring : TEST_DATA)
         {
             ByteBuffer key = ByteBufferUtil.bytes(keystring);
-            SSTableNamesIterator iter = new SSTableNamesIterator(sstable, Util.dk(key), FBUtilities.singleton(key, sstable.metadata.comparator));
+            SSTableNamesIterator iter = new SSTableNamesIterator(sstable, Util.dk(key), FBUtilities.singleton(Util.cellname(key), type));
             ColumnFamily cf = iter.getColumnFamily();
 
             // check not deleted (CASSANDRA-6527)
             assert cf.deletionInfo().equals(DeletionInfo.live());
-            assert iter.next().name().equals(key);
+            assert iter.next().name().toByteBuffer().equals(key);
         }
     }
 
     @Test
     public void testVersions() throws Throwable
     {
+        boolean notSkipped = false;
+
         for (File version : LEGACY_SSTABLE_ROOT.listFiles())
-            if (Descriptor.Version.validate(version.getName()))
+        {
+            if (Descriptor.Version.validate(version.getName()) && new Descriptor.Version(version.getName()).isCompatible())
+            {
+                notSkipped = true;
                 testVersion(version.getName());
+            }
+        }
+
+        assert notSkipped;
     }
 
     public void testVersion(String version) throws Throwable
@@ -144,13 +162,14 @@
         try
         {
             SSTableReader reader = SSTableReader.open(getDescriptor(version));
+            CellNameType type = reader.metadata.comparator;
             for (String keystring : TEST_DATA)
             {
                 ByteBuffer key = ByteBufferUtil.bytes(keystring);
                 // confirm that the bloom filter does not reject any keys/names
                 DecoratedKey dk = reader.partitioner.decorateKey(key);
-                SSTableNamesIterator iter = new SSTableNamesIterator(reader, dk, FBUtilities.singleton(key, reader.metadata.comparator));
-                assert iter.next().name().equals(key);
+                SSTableNamesIterator iter = new SSTableNamesIterator(reader, dk, FBUtilities.singleton(Util.cellname(key), type));
+                assert iter.next().name().toByteBuffer().equals(key);
             }
 
             // TODO actually test some reads
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
index 45a69e2..39beb94 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
@@ -45,6 +45,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
+        Keyspace.setInitialized();
         StorageService.instance.initServer();
     }
 
@@ -60,7 +61,7 @@
                                                                              StorageService.getPartitioner(),
                                                                              1);
         DecoratedKey key = Util.dk("key1");
-        writer.newRow(key.key);
+        writer.newRow(key.getKey());
         writer.addColumn(ByteBufferUtil.bytes("col1"), ByteBufferUtil.bytes(100), 1);
         writer.close();
 
@@ -84,6 +85,6 @@
         List<Row> rows = Util.getRangeSlice(Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1"));
         assertEquals(1, rows.size());
         assertEquals(key, rows.get(0).key);
-        assertEquals(ByteBufferUtil.bytes(100), rows.get(0).cf.getColumn(ByteBufferUtil.bytes("col1")).value());
+        assertEquals(ByteBufferUtil.bytes(100), rows.get(0).cf.getColumn(Util.cellname("col1")).value());
     }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataSerializerTest.java
deleted file mode 100644
index 13b2f26..0000000
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataSerializerTest.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.io.sstable;
-
-import java.io.ByteArrayOutputStream;
-import java.io.ByteArrayInputStream;
-import java.io.DataOutputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.junit.Test;
-
-import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.dht.RandomPartitioner;
-import org.apache.cassandra.utils.EstimatedHistogram;
-import org.apache.cassandra.utils.Pair;
-
-public class SSTableMetadataSerializerTest
-{
-    @Test
-    public void testSerialization() throws IOException
-    {
-        EstimatedHistogram rowSizes = new EstimatedHistogram(
-            new long[] { 1L, 2L },
-            new long[] { 3L, 4L, 5L });
-        EstimatedHistogram columnCounts = new EstimatedHistogram(
-            new long[] { 6L, 7L },
-            new long[] { 8L, 9L, 10L });
-        ReplayPosition rp = new ReplayPosition(11L, 12);
-        long minTimestamp = 2162517136L;
-        long maxTimestamp = 4162517136L;
-
-        SSTableMetadata.Collector collector = SSTableMetadata.createCollector(BytesType.instance)
-                                                             .estimatedRowSize(rowSizes)
-                                                             .estimatedColumnCount(columnCounts)
-                                                             .replayPosition(rp);
-        collector.updateMinTimestamp(minTimestamp);
-        collector.updateMaxTimestamp(maxTimestamp);
-        SSTableMetadata originalMetadata = collector.finalizeMetadata(RandomPartitioner.class.getCanonicalName(), 0.1);
-
-        ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
-        DataOutputStream out = new DataOutputStream(byteOutput);
-        
-        Set<Integer> ancestors = new HashSet<Integer>();
-        ancestors.addAll(Arrays.asList(1,2,3,4));
-
-        SSTableMetadata.serializer.serialize(originalMetadata, ancestors, out);
-
-        ByteArrayInputStream byteInput = new ByteArrayInputStream(byteOutput.toByteArray());
-        DataInputStream in = new DataInputStream(byteInput);
-        Descriptor desc = new Descriptor(Descriptor.Version.CURRENT, new File("."), "", "", 0, false);
-        Pair<SSTableMetadata, Set<Integer>> statsPair = SSTableMetadata.serializer.deserialize(in, desc);
-        SSTableMetadata stats = statsPair.left;
-
-        assert stats.estimatedRowSize.equals(originalMetadata.estimatedRowSize);
-        assert stats.estimatedRowSize.equals(rowSizes);
-        assert stats.estimatedColumnCount.equals(originalMetadata.estimatedColumnCount);
-        assert stats.estimatedColumnCount.equals(columnCounts);
-        assert stats.replayPosition.equals(originalMetadata.replayPosition);
-        assert stats.replayPosition.equals(rp);
-        assert stats.minTimestamp == minTimestamp;
-        assert stats.maxTimestamp == maxTimestamp;
-        assert stats.minTimestamp == originalMetadata.minTimestamp;
-        assert stats.maxTimestamp == originalMetadata.maxTimestamp;
-        assert stats.bloomFilterFPChance == originalMetadata.bloomFilterFPChance;
-        assert RandomPartitioner.class.getCanonicalName().equals(stats.partitioner);
-        assert ancestors.equals(statsPair.right);
-    }
-}
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
index 6bd5f56..72307c5 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
@@ -1,4 +1,3 @@
-package org.apache.cassandra.io.sstable;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -19,7 +18,7 @@
  * under the License.
  * 
  */
-
+package org.apache.cassandra.io.sstable;
 
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
@@ -30,22 +29,22 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowMutation;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.CounterId;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+import static org.apache.cassandra.Util.cellname;
+
 public class SSTableMetadataTest extends SchemaLoader
 {
     @Test
-    public void testTrackMaxDeletionTime() throws ExecutionException, InterruptedException
+    public void testTrackMaxDeletionTime()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
@@ -53,16 +52,16 @@
         for(int i = 0; i < 10; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            RowMutation rm = new RowMutation("Keyspace1", key.key);
+            Mutation rm = new Mutation("Keyspace1", key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add("Standard1", ByteBufferUtil.bytes(Integer.toString(j)),
+                rm.add("Standard1", cellname(Integer.toString(j)),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp,
                        10 + j);
             rm.apply();
         }
-        RowMutation rm = new RowMutation("Keyspace1", Util.dk("longttl").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col"),
+        Mutation rm = new Mutation("Keyspace1", Util.dk("longttl").getKey());
+        rm.add("Standard1", cellname("col"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                10000);
@@ -77,8 +76,8 @@
             assertEquals(ttltimestamp + 10000, firstDelTime, 10);
 
         }
-        rm = new RowMutation("Keyspace1", Util.dk("longttl2").key);
-        rm.add("Standard1", ByteBufferUtil.bytes("col"),
+        rm = new Mutation("Keyspace1", Util.dk("longttl2").getKey());
+        rm.add("Standard1", cellname("col"),
                ByteBufferUtil.EMPTY_BYTE_BUFFER,
                timestamp,
                20000);
@@ -86,7 +85,7 @@
         ttltimestamp = (int) (System.currentTimeMillis()/1000);
         store.forceBlockingFlush();
         assertEquals(2, store.getSSTables().size());
-        List<SSTableReader> sstables = new ArrayList<SSTableReader>(store.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(store.getSSTables());
         if(sstables.get(0).getSSTableMetadata().maxLocalDeletionTime < sstables.get(1).getSSTableMetadata().maxLocalDeletionTime)
         {
             assertEquals(sstables.get(0).getSSTableMetadata().maxLocalDeletionTime, firstDelTime);
@@ -124,13 +123,13 @@
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
         long timestamp = System.currentTimeMillis();
         DecoratedKey key = Util.dk("deletetest");
-        RowMutation rm = new RowMutation("Keyspace1", key.key);
+        Mutation rm = new Mutation("Keyspace1", key.getKey());
         for (int i = 0; i<5; i++)
-            rm.add("Standard2", ByteBufferUtil.bytes("deletecolumn"+i),
+            rm.add("Standard2", cellname("deletecolumn" + i),
                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
                        timestamp,
                        100);
-        rm.add("Standard2", ByteBufferUtil.bytes("todelete"),
+        rm.add("Standard2", cellname("todelete"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    timestamp,
                    1000);
@@ -144,8 +143,8 @@
             firstMaxDelTime = sstable.getSSTableMetadata().maxLocalDeletionTime;
             assertEquals(ttltimestamp + 1000, firstMaxDelTime, 10);
         }
-        rm = new RowMutation("Keyspace1", key.key);
-        rm.delete("Standard2", ByteBufferUtil.bytes("todelete"), timestamp + 1);
+        rm = new Mutation("Keyspace1", key.getKey());
+        rm.delete("Standard2", cellname("todelete"), timestamp + 1);
         rm.apply();
         store.forceBlockingFlush();
         assertEquals(2,store.getSSTables().size());
@@ -176,12 +175,10 @@
         for (int j = 0; j < 8; j++)
         {
             DecoratedKey key = Util.dk("row"+j);
-            RowMutation rm = new RowMutation("Keyspace1", key.key);
+            Mutation rm = new Mutation("Keyspace1", key.getKey());
             for (int i = 100; i<150; i++)
             {
-                rm.add("Standard3", ByteBufferUtil.bytes(j+"col"+i),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                   System.currentTimeMillis());
+                rm.add("Standard3", cellname(j + "col" + i), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
             }
             rm.apply();
         }
@@ -193,12 +190,10 @@
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxColumnNames.get(0)), "7col149");
         }
         DecoratedKey key = Util.dk("row2");
-        RowMutation rm = new RowMutation("Keyspace1", key.key);
+        Mutation rm = new Mutation("Keyspace1", key.getKey());
         for (int i = 101; i<299; i++)
         {
-            rm.add("Standard3", ByteBufferUtil.bytes(9+"col"+i),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               System.currentTimeMillis());
+            rm.add("Standard3", cellname(9 + "col" + i), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
         }
         rm.apply();
 
@@ -211,6 +206,7 @@
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxColumnNames.get(0)), "9col298");
         }
     }
+
     @Test
     public void testMaxMinComposites() throws CharacterCodingException, ExecutionException, InterruptedException
     {
@@ -229,13 +225,13 @@
 
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardComposite2");
 
-        CompositeType ct = CompositeType.getInstance(BytesType.instance, IntegerType.instance);
+        CellNameType type = cfs.getComparator();
 
         ByteBuffer key = ByteBufferUtil.bytes("k");
         for (int i = 0; i < 10; i++)
         {
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            ByteBuffer colName = ct.builder().add(ByteBufferUtil.bytes("a"+(9-i))).add(ByteBufferUtil.bytes(i)).build();
+            Mutation rm = new Mutation("Keyspace1", key);
+            CellName colName = type.makeCellName(ByteBufferUtil.bytes("a"+(9-i)), ByteBufferUtil.bytes(i));
             rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
             rm.apply();
         }
@@ -244,8 +240,8 @@
         key = ByteBufferUtil.bytes("k2");
         for (int i = 0; i < 10; i++)
         {
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            ByteBuffer colName = ct.builder().add(ByteBufferUtil.bytes("b"+(9-i))).add(ByteBufferUtil.bytes(i)).build();
+            Mutation rm = new Mutation("Keyspace1", key);
+            CellName colName = type.makeCellName(ByteBufferUtil.bytes("b"+(9-i)), ByteBufferUtil.bytes(i));
             rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
             rm.apply();
         }
@@ -260,4 +256,54 @@
             assertEquals(0, ByteBufferUtil.toInt(sstable.getSSTableMetadata().minColumnNames.get(1)));
         }
     }
+
+    @Test
+    public void testLegacyCounterShardTracking()
+    {
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Counter1");
+
+        // A cell with all shards
+        CounterContext.ContextState state = CounterContext.ContextState.allocate(1, 1, 1);
+        state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
+        state.writeLocal(CounterId.fromInt(2), 1L, 1L);
+        state.writeRemote(CounterId.fromInt(3), 1L, 1L);
+        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
+        new Mutation(Util.dk("k").getKey(), cells).apply();
+        cfs.forceBlockingFlush();
+        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        cfs.truncateBlocking();
+
+        // A cell with global and remote shards
+        state = CounterContext.ContextState.allocate(0, 1, 1);
+        state.writeLocal(CounterId.fromInt(2), 1L, 1L);
+        state.writeRemote(CounterId.fromInt(3), 1L, 1L);
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
+        new Mutation(Util.dk("k").getKey(), cells).apply();
+        cfs.forceBlockingFlush();
+        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        cfs.truncateBlocking();
+
+        // A cell with global and local shards
+        state = CounterContext.ContextState.allocate(1, 1, 0);
+        state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
+        state.writeLocal(CounterId.fromInt(2), 1L, 1L);
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
+        new Mutation(Util.dk("k").getKey(), cells).apply();
+        cfs.forceBlockingFlush();
+        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        cfs.truncateBlocking();
+
+        // A cell with global only
+        state = CounterContext.ContextState.allocate(1, 0, 0);
+        state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
+        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+        cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
+        new Mutation(Util.dk("k").getKey(), cells).apply();
+        cfs.forceBlockingFlush();
+        assertFalse(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        cfs.truncateBlocking();
+    }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 427d089..91f5341 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -25,49 +25,67 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.ThreadPoolExecutor;
 
-import org.junit.Assert;
 import com.google.common.collect.Sets;
+import org.junit.Assert;
 import org.junit.Test;
 import org.junit.runner.RunWith;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IndexExpression;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.RowPosition;
 import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.ICompactionScanner;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.composites.Composites;
 import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.LocalToken;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.MmappedSegmentedFile;
 import org.apache.cassandra.io.util.SegmentedFile;
+import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.Util.cellname;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class SSTableReaderTest extends SchemaLoader
 {
+    private static final Logger logger = LoggerFactory.getLogger(SSTableReaderTest.class);
+
     static Token t(int i)
     {
         return StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(String.valueOf(i)));
     }
 
     @Test
-    public void testGetPositionsForRanges() throws IOException, ExecutionException, InterruptedException
+    public void testGetPositionsForRanges() throws ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
@@ -77,8 +95,8 @@
         for (int j = 0; j < 10; j++)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Standard2", ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
         }
         store.forceBlockingFlush();
@@ -118,8 +136,8 @@
         for (int j = 0; j < 100; j += 2)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Standard1", ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Standard1", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
         }
         store.forceBlockingFlush();
@@ -144,7 +162,7 @@
     }
 
     @Test
-    public void testPersistentStatistics() throws IOException, ExecutionException, InterruptedException
+    public void testPersistentStatistics()
     {
 
         Keyspace keyspace = Keyspace.open("Keyspace1");
@@ -153,8 +171,8 @@
         for (int j = 0; j < 100; j += 2)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Standard1", ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Standard1", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
         }
         store.forceBlockingFlush();
@@ -163,14 +181,14 @@
         assert store.getMaxRowSize() != 0;
     }
 
-    private void clearAndLoad(ColumnFamilyStore cfs) throws IOException
+    private void clearAndLoad(ColumnFamilyStore cfs)
     {
         cfs.clearUnsafe();
         cfs.loadNewSSTables();
     }
 
     @Test
-    public void testGetPositionsForRangesWithKeyCache() throws IOException, ExecutionException, InterruptedException
+    public void testGetPositionsForRangesWithKeyCache() throws ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
@@ -181,8 +199,8 @@
         for (int j = 0; j < 10; j++)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Standard2", ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
         }
         store.forceBlockingFlush();
@@ -204,14 +222,14 @@
     }
 
     @Test
-    public void testPersistentStatisticsWithSecondaryIndex() throws IOException, ExecutionException, InterruptedException
+    public void testPersistentStatisticsWithSecondaryIndex()
     {
         // Create secondary index and flush to disk
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
         ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
+        Mutation rm = new Mutation("Keyspace1", key);
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
         rm.apply();
         store.forceBlockingFlush();
 
@@ -233,17 +251,17 @@
 
         DecoratedKey firstKey = null, lastKey = null;
         long timestamp = System.currentTimeMillis();
-        for (int i = 0; i < store.metadata.getIndexInterval(); i++)
+        for (int i = 0; i < store.metadata.getMinIndexInterval(); i++)
         {
             DecoratedKey key = Util.dk(String.valueOf(i));
             if (firstKey == null)
                 firstKey = key;
             if (lastKey == null)
                 lastKey = key;
-            if (store.metadata.getKeyValidator().compare(lastKey.key, key.key) < 0)
+            if (store.metadata.getKeyValidator().compare(lastKey.getKey(), key.getKey()) < 0)
                 lastKey = key;
-            RowMutation rm = new RowMutation(ks, key.key);
-            rm.add(cf, ByteBufferUtil.bytes("col"),
+            Mutation rm = new Mutation(ks, key.getKey());
+            rm.add(cf, cellname("col"),
                    ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp);
             rm.apply();
         }
@@ -254,8 +272,8 @@
 
         // test to see if sstable can be opened as expected
         SSTableReader target = SSTableReader.open(desc);
-        Assert.assertEquals(target.getKeySampleSize(), 1);
-        Assert.assertArrayEquals(ByteBufferUtil.getArray(firstKey.key), target.getKeySample(0));
+        Assert.assertEquals(target.getIndexSummarySize(), 1);
+        Assert.assertArrayEquals(ByteBufferUtil.getArray(firstKey.getKey()), target.getIndexSummaryKey(0));
         assert target.first.equals(firstKey);
         assert target.last.equals(lastKey);
     }
@@ -266,24 +284,24 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
         ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Indexed1", ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
+        Mutation rm = new Mutation("Keyspace1", key);
+        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
         rm.apply();
         store.forceBlockingFlush();
 
         ColumnFamilyStore indexCfs = store.indexManager.getIndexForColumn(ByteBufferUtil.bytes("birthdate")).getIndexCfs();
         assert indexCfs.partitioner instanceof LocalPartitioner;
         SSTableReader sstable = indexCfs.getSSTables().iterator().next();
-        assert sstable.first.token instanceof LocalToken;
+        assert sstable.first.getToken() instanceof LocalToken;
 
         SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
         SegmentedFile.Builder dbuilder = sstable.compression
                                           ? SegmentedFile.getCompressedBuilder()
                                           : SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
-        SSTableReader.saveSummary(sstable, ibuilder, dbuilder);
+        sstable.saveSummary(ibuilder, dbuilder);
 
         SSTableReader reopened = SSTableReader.open(sstable.descriptor);
-        assert reopened.first.token instanceof LocalToken;
+        assert reopened.first.getToken() instanceof LocalToken;
     }
 
     /** see CASSANDRA-5407 */
@@ -293,8 +311,8 @@
         Keyspace keyspace = Keyspace.open("Keyspace1");
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
         ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        RowMutation rm = new RowMutation("Keyspace1", key);
-        rm.add("Standard1", ByteBufferUtil.bytes("xyz"), ByteBufferUtil.bytes("abc"), 0);
+        Mutation rm = new Mutation("Keyspace1", key);
+        rm.add("Standard1", cellname("xyz"), ByteBufferUtil.bytes("abc"), 0);
         rm.apply();
         store.forceBlockingFlush();
         boolean foundScanner = false;
@@ -320,8 +338,8 @@
         for (int j = 0; j < 130; j++)
         {
             ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            RowMutation rm = new RowMutation("Keyspace1", key);
-            rm.add("Standard2", ByteBufferUtil.bytes("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
             rm.apply();
         }
         store.forceBlockingFlush();
@@ -343,7 +361,65 @@
         assert sections.size() == 1 : "Expected to find range in sstable opened for bulk loading";
     }
 
-    private void assertIndexQueryWorks(ColumnFamilyStore indexedCFS) throws IOException
+    @Test
+    public void testIndexSummaryReplacement() throws IOException, ExecutionException, InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open("Keyspace1");
+        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
+        CompactionManager.instance.disableAutoCompaction();
+
+        final int NUM_ROWS = 512;
+        for (int j = 0; j < NUM_ROWS; j++)
+        {
+            ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", j));
+            Mutation rm = new Mutation("Keyspace1", key);
+            rm.add("StandardLowIndexInterval", Util.cellname("0"), ByteBufferUtil.bytes(String.format("%3d", j)), j);
+            rm.apply();
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store);
+
+        Collection<SSTableReader> sstables = store.getSSTables();
+        assert sstables.size() == 1;
+        final SSTableReader sstable = sstables.iterator().next();
+
+        ThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(5);
+        List<Future> futures = new ArrayList<>(NUM_ROWS * 2);
+        for (int i = 0; i < NUM_ROWS; i++)
+        {
+            final ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", i));
+            final int index = i;
+
+            futures.add(executor.submit(new Runnable()
+            {
+                public void run()
+                {
+                    ColumnFamily result = store.getColumnFamily(sstable.partitioner.decorateKey(key), Composites.EMPTY, Composites.EMPTY, false, 100, 100);
+                    assertFalse(result.isEmpty());
+                    assertEquals(0, ByteBufferUtil.compare(String.format("%3d", index).getBytes(), result.getColumn(Util.cellname("0")).value()));
+                }
+            }));
+
+            futures.add(executor.submit(new Runnable()
+            {
+                public void run()
+                {
+                    Iterable<DecoratedKey> results = store.keySamples(
+                            new Range<>(sstable.partitioner.getMinimumToken(), sstable.partitioner.getToken(key)));
+                    assertTrue(results.iterator().hasNext());
+                }
+            }));
+        }
+
+        SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(store, 1);
+        store.getDataTracker().replaceReaders(Arrays.asList(sstable), Arrays.asList(replacement));
+        for (Future future : futures)
+            future.get();
+
+        assertEquals(sstable.estimatedKeys(), replacement.estimatedKeys(), 1);
+    }
+
+    private void assertIndexQueryWorks(ColumnFamilyStore indexedCFS)
     {
         assert "Indexed1".equals(indexedCFS.name);
 
@@ -352,9 +428,8 @@
             clearAndLoad(cfs);
 
         // query using index to see if sstable for secondary index opens
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexOperator.EQ, ByteBufferUtil.bytes(1L));
+        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), IndexExpression.Operator.EQ, ByteBufferUtil.bytes(1L));
         List<IndexExpression> clause = Arrays.asList(expr);
-        IPartitioner p = StorageService.getPartitioner();
         Range<RowPosition> range = Util.range("", "");
         List<Row> rows = indexedCFS.search(range, clause, new IdentityQueryFilter(), 100);
         assert rows.size() == 1;
@@ -362,11 +437,11 @@
 
     private List<Range<Token>> makeRanges(Token left, Token right)
     {
-        return Arrays.<Range<Token>>asList(new Range[]{ new Range<Token>(left, right) });
+        return Arrays.asList(new Range<>(left, right));
     }
 
     private DecoratedKey k(int i)
     {
-        return new DecoratedKey(t(i), ByteBufferUtil.bytes(String.valueOf(i)));
+        return new BufferDecoratedKey(t(i), ByteBufferUtil.bytes(String.valueOf(i)));
     }
 }
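
The SSTableReaderTest hunks above track the 2.1 storage API rename: RowMutation becomes Mutation, raw ByteBuffer column names become CellNames built with Util.cellname(), and DecoratedKey exposes getKey()/getToken() accessors. A minimal sketch of the write-then-flush idiom these tests repeat, assuming the Keyspace1/Standard1 fixture from SchemaLoader and the test's existing imports:

    // Sketch of the 2.1 test idiom used above; not part of the patch itself.
    Keyspace keyspace = Keyspace.open("Keyspace1");
    ByteBuffer key = ByteBufferUtil.bytes("k1");
    Mutation rm = new Mutation("Keyspace1", key);
    rm.add("Standard1", Util.cellname("col"), ByteBufferUtil.bytes("value"), System.currentTimeMillis());
    rm.apply();
    keyspace.getColumnFamilyStore("Standard1").forceBlockingFlush();
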
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
index 6dca637..ff1a305 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
@@ -71,8 +71,8 @@
     {
         long timestamp = System.currentTimeMillis();
         DecoratedKey decoratedKey = Util.dk(toKey(key));
-        RowMutation rm = new RowMutation(KEYSPACE, decoratedKey.key);
-        rm.add(TABLE, ByteBufferUtil.bytes("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
+        Mutation rm = new Mutation(KEYSPACE, decoratedKey.getKey());
+        rm.add(TABLE, Util.cellname("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
         rm.apply();
     }
 
@@ -80,7 +80,7 @@
     {
         SSTableScanner scanner = sstable.getScanner(new DataRange(boundsFor(scanStart, scanEnd), new IdentityQueryFilter()));
         for (int i = expectedStart; i <= expectedEnd; i++)
-            assertEquals(toKey(i), new String(scanner.next().getKey().key.array()));
+            assertEquals(toKey(i), new String(scanner.next().getKey().getKey().array()));
         assertFalse(scanner.hasNext());
     }
 
@@ -110,7 +110,7 @@
         // full range scan
         SSTableScanner scanner = sstable.getScanner();
         for (int i = 2; i < 10; i++)
-            assertEquals(toKey(i), new String(scanner.next().getKey().key.array()));
+            assertEquals(toKey(i), new String(scanner.next().getKey().getKey().array()));
 
         // a simple read of a chunk in the middle
         assertScanMatches(sstable, 3, 6, 3, 6);
@@ -147,7 +147,7 @@
             for (int expected = rangeStart; expected <= rangeEnd; expected++)
             {
                 assertTrue(String.format("Expected to see key %03d", expected), scanner.hasNext());
-                assertEquals(toKey(expected), new String(scanner.next().getKey().key.array()));
+                assertEquals(toKey(expected), new String(scanner.next().getKey().getKey().array()));
             }
         }
         assertFalse(scanner.hasNext());
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java
index 9e7aa16..15980a4 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java
@@ -20,6 +20,7 @@
 
 import java.io.File;
 
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.dht.IPartitioner;
 import org.junit.Test;
 
@@ -43,7 +44,7 @@
         String cfname = "StandardInteger1";
 
         Keyspace t = Keyspace.open(keyspaceName); // make sure we create the directory
-        File dir = Directories.create(keyspaceName, cfname).getDirectoryForNewSSTables();
+        File dir = new Directories(Schema.instance.getCFMetaData(keyspaceName, cfname)).getDirectoryForNewSSTables();
         assert dir.exists();
 
         IPartitioner partitioner = StorageService.getPartitioner();
@@ -92,9 +93,9 @@
         ColumnFamily cf = Util.getColumnFamily(t, Util.dk("Key10"), cfname);
         assert cf.getColumnCount() == INC * NBCOL : "expecting " + (INC * NBCOL) + " columns, got " + cf.getColumnCount();
         int i = 0;
-        for (Column c : cf)
+        for (Cell c : cf)
         {
-            assert toInt(c.name()) == i : "Column name should be " + i + ", got " + toInt(c.name());
+            assert toInt(c.name().toByteBuffer()) == i : "Cell name should be " + i + ", got " + toInt(c.name().toByteBuffer());
             assert c.value().equals(bytes("v"));
             assert c.timestamp() == 1;
             ++i;
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java b/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java
index feeac7c..d39f968 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java
@@ -25,6 +25,7 @@
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import org.apache.cassandra.Util;
@@ -36,11 +37,11 @@
     public static String KEYSPACENAME = "Keyspace1";
     public static String CFNAME = "Standard1";
 
-    public static ColumnFamily createCF(long mfda, int ldt, Column... cols)
+    public static ColumnFamily createCF(long mfda, int ldt, Cell... cols)
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(KEYSPACENAME, CFNAME);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACENAME, CFNAME);
         cf.delete(new DeletionInfo(mfda, ldt));
-        for (Column col : cols)
+        for (Cell col : cols)
             cf.addColumn(col);
         return cf;
     }
@@ -59,14 +60,14 @@
         File keyspaceDir = new File(tempdir, keyspaceName);
         keyspaceDir.mkdir();
         keyspaceDir.deleteOnExit();
-        File datafile = new File(new Descriptor(keyspaceDir, keyspaceName, cfname, generation, false).filenameFor("Data.db"));
+        File datafile = new File(new Descriptor(keyspaceDir, keyspaceName, cfname, generation, Descriptor.Type.FINAL).filenameFor("Data.db"));
         if (!datafile.createNewFile())
             throw new IOException("unable to create file " + datafile);
         datafile.deleteOnExit();
         return datafile;
     }
 
-    public static void assertContentEquals(SSTableReader lhs, SSTableReader rhs) throws IOException
+    public static void assertContentEquals(SSTableReader lhs, SSTableReader rhs)
     {
         SSTableScanner slhs = lhs.getScanner();
         SSTableScanner srhs = rhs.getScanner();
@@ -80,7 +81,7 @@
         assert !srhs.hasNext() : "RHS contained more rows than LHS";
     }
 
-    public static void assertContentEquals(OnDiskAtomIterator lhs, OnDiskAtomIterator rhs) throws IOException
+    public static void assertContentEquals(OnDiskAtomIterator lhs, OnDiskAtomIterator rhs)
     {
         assertEquals(lhs.getKey(), rhs.getKey());
         // check metadata
@@ -98,9 +99,9 @@
         // iterate columns
         while (lhs.hasNext())
         {
-            Column clhs = (Column)lhs.next();
+            Cell clhs = (Cell)lhs.next();
             assert rhs.hasNext() : "LHS contained more columns than RHS for " + lhs.getKey();
-            Column crhs = (Column)rhs.next();
+            Cell crhs = (Cell)rhs.next();
 
             assertEquals("Mismatched columns for " + lhs.getKey(), clhs, crhs);
         }
@@ -162,8 +163,8 @@
             Map<String, ColumnFamily> map = new HashMap<String, ColumnFamily>();
             for (String key : keys)
             {
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(ksname, cfname);
-                cf.addColumn(new Column(ByteBufferUtil.bytes(key), ByteBufferUtil.bytes(key), 0));
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(ksname, cfname);
+                cf.addColumn(new BufferCell(Util.cellname(key), ByteBufferUtil.bytes(key), 0));
                 map.put(key, cf);
             }
             return write(map);
@@ -198,7 +199,7 @@
         public SSTableReader write(int expectedSize, Appender appender) throws IOException
         {
             File datafile = (dest == null) ? tempSSTableFile(ksname, cfname, generation) : new File(dest.filenameFor(Component.DATA));
-            SSTableWriter writer = new SSTableWriter(datafile.getAbsolutePath(), expectedSize);
+            SSTableWriter writer = new SSTableWriter(datafile.getAbsolutePath(), expectedSize, ActiveRepairService.UNREPAIRED_SSTABLE);
             while (appender.append(writer)) { /* pass */ }
             SSTableReader reader = writer.closeAndOpenReader();
             // mark all components for removal
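
SSTableUtils now threads a repairedAt value through SSTableWriter; ActiveRepairService.UNREPAIRED_SSTABLE marks the written sstable as never having been repaired. A sketch restating the changed call from the hunk above:

    // Write an sstable carrying no repair timestamp (sketch mirroring the hunk above).
    SSTableWriter writer = new SSTableWriter(datafile.getAbsolutePath(),
                                             expectedSize,
                                             ActiveRepairService.UNREPAIRED_SSTABLE);
    while (appender.append(writer)) { /* keep appending rows */ }
    SSTableReader reader = writer.closeAndOpenReader();
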
diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
new file mode 100644
index 0000000..7751a51
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.metadata;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import org.junit.Test;
+
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.EstimatedHistogram;
+
+import static org.junit.Assert.assertEquals;
+
+public class MetadataSerializerTest
+{
+    @Test
+    public void testSerialization() throws IOException
+    {
+        EstimatedHistogram rowSizes = new EstimatedHistogram(new long[] { 1L, 2L },
+                                                             new long[] { 3L, 4L, 5L });
+        EstimatedHistogram columnCounts = new EstimatedHistogram(new long[] { 6L, 7L },
+                                                                 new long[] { 8L, 9L, 10L });
+        ReplayPosition rp = new ReplayPosition(11L, 12);
+        long minTimestamp = 2162517136L;
+        long maxTimestamp = 4162517136L;
+
+        MetadataCollector collector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance))
+                                                      .estimatedRowSize(rowSizes)
+                                                      .estimatedColumnCount(columnCounts)
+                                                      .replayPosition(rp);
+        collector.updateMinTimestamp(minTimestamp);
+        collector.updateMaxTimestamp(maxTimestamp);
+
+        Set<Integer> ancestors = Sets.newHashSet(1, 2, 3, 4);
+        for (int i : ancestors)
+            collector.addAncestor(i);
+
+        String partitioner = RandomPartitioner.class.getCanonicalName();
+        double bfFpChance = 0.1;
+        Map<MetadataType, MetadataComponent> originalMetadata = collector.finalizeMetadata(partitioner, bfFpChance, 0);
+
+        MetadataSerializer serializer = new MetadataSerializer();
+        // Serialize to tmp file
+        File statsFile = File.createTempFile(Component.STATS.name, null);
+        try (DataOutputStreamAndChannel out = new DataOutputStreamAndChannel(new FileOutputStream(statsFile)))
+        {
+            serializer.serialize(originalMetadata, out);
+        }
+
+        Descriptor desc = new Descriptor(Descriptor.Version.CURRENT, statsFile.getParentFile(), "", "", 0, Descriptor.Type.FINAL);
+        try (RandomAccessReader in = RandomAccessReader.open(statsFile))
+        {
+            Map<MetadataType, MetadataComponent> deserialized = serializer.deserialize(desc, in, EnumSet.allOf(MetadataType.class));
+
+            for (MetadataType type : MetadataType.values())
+            {
+                assertEquals(originalMetadata.get(type), deserialized.get(type));
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java b/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java
index a16b291..7dbbdc2 100644
--- a/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java
+++ b/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java
@@ -344,30 +344,34 @@
             for (final int offset : Arrays.asList(0, 8))
             {
                 File file1 = writeTemporaryFile(new byte[16]);
-                final RandomAccessReader file = RandomAccessReader.open(file1, bufferSize, null);
-                expectEOF(new Callable<Object>()
+                try (final RandomAccessReader file = RandomAccessReader.open(file1, bufferSize, null))
                 {
-                    public Object call() throws IOException
+                    expectEOF(new Callable<Object>()
                     {
-                        file.readFully(target, offset, 17);
-                        return null;
-                    }
-                });
+                        public Object call() throws IOException
+                        {
+                            file.readFully(target, offset, 17);
+                            return null;
+                        }
+                    });
+                }
             }
 
             // first read is ok but eventually EOFs
             for (final int n : Arrays.asList(1, 2, 4, 8))
             {
                 File file1 = writeTemporaryFile(new byte[16]);
-                final RandomAccessReader file = RandomAccessReader.open(file1, bufferSize, null);
-                expectEOF(new Callable<Object>()
+                try (final RandomAccessReader file = RandomAccessReader.open(file1, bufferSize, null))
                 {
-                    public Object call() throws IOException
+                    expectEOF(new Callable<Object>()
                     {
-                        while (true)
-                            file.readFully(target, 0, n);
-                    }
-                });
+                        public Object call() throws IOException
+                        {
+                            while (true)
+                                file.readFully(target, 0, n);
+                        }
+                    });
+                }
             }
         }
     }
@@ -414,15 +418,17 @@
         tmpFile.deleteOnExit();
 
         // Create the BRAF by filename instead of by file.
-        final RandomAccessReader r = RandomAccessReader.open(new File(tmpFile.getPath()));
-        assert tmpFile.getPath().equals(r.getPath());
-
-        // Create a mark and move the rw there.
-        final FileMark mark = r.mark();
-        r.reset(mark);
-
-        // Expect this call to succeed.
-        r.bytesPastMark(mark);
+        try (final RandomAccessReader r = RandomAccessReader.open(new File(tmpFile.getPath())))
+        {
+            assert tmpFile.getPath().equals(r.getPath());
+
+            // Create a mark and move the reader there.
+            final FileMark mark = r.mark();
+            r.reset(mark);
+
+            // Expect this call to succeed.
+            r.bytesPastMark(mark);
+        }
     }
 
     @Test
@@ -456,11 +462,13 @@
             }
         }, ClosedChannelException.class);
 
-        RandomAccessReader copy = RandomAccessReader.open(new File(r.getPath()));
-        ByteBuffer contents = copy.readBytes((int) copy.length());
-
-        assertEquals(contents.limit(), data.length);
-        assertEquals(ByteBufferUtil.compare(contents, data), 0);
+        try (RandomAccessReader copy = RandomAccessReader.open(new File(r.getPath())))
+        {
+            ByteBuffer contents = copy.readBytes((int) copy.length());
+
+            assertEquals(contents.limit(), data.length);
+            assertEquals(ByteBufferUtil.compare(contents, data), 0);
+        }
     }
 
     @Test
@@ -501,16 +509,20 @@
     @Test (expected = AssertionError.class)
     public void testAssertionErrorWhenBytesPastMarkIsNegative() throws IOException
     {
-        SequentialWriter w = createTempFile("brafAssertionErrorWhenBytesPastMarkIsNegative");
-        w.write(new byte[30]);
-        w.close();
-
-        RandomAccessReader r = RandomAccessReader.open(w);
-        r.seek(10);
-        r.mark();
-
-        r.seek(0);
-        r.bytesPastMark();
+        try (SequentialWriter w = createTempFile("brafAssertionErrorWhenBytesPastMarkIsNegative"))
+        {
+            w.write(new byte[30]);
+            w.flush();
+
+            try (RandomAccessReader r = RandomAccessReader.open(w))
+            {
+                r.seek(10);
+                r.mark();
+
+                r.seek(0);
+                r.bytesPastMark();
+            }
+        }
     }
 
     @Test
@@ -518,6 +530,8 @@
     {
         //see https://issues.apache.org/jira/browse/CASSANDRA-7756
 
+        final FileCacheService.CacheKey cacheKey = new FileCacheService.CacheKey();
+
         final int THREAD_COUNT = 40;
         ExecutorService executorService = Executors.newFixedThreadPool(THREAD_COUNT);
 
@@ -538,8 +552,8 @@
             RandomAccessReader r2 = RandomAccessReader.open(w2);
 
 
-            FileCacheService.instance.put(r1);
-            FileCacheService.instance.put(r2);
+            FileCacheService.instance.put(cacheKey, r1);
+            FileCacheService.instance.put(cacheKey, r2);
 
             final CountDownLatch finished = new CountDownLatch(THREAD_COUNT);
             final AtomicBoolean hadError = new AtomicBoolean(false);
@@ -671,16 +685,20 @@
     public void testSetNegativeLength() throws IOException, IllegalArgumentException
     {
         File tmpFile = File.createTempFile("set_negative_length", "bin");
-        SequentialWriter file = SequentialWriter.open(tmpFile);
-        file.truncate(-8L);
+        try (SequentialWriter file = SequentialWriter.open(tmpFile))
+        {
+            file.truncate(-8L);
+        }
     }
 
     @Test (expected=IOException.class)
     public void testSetLengthDuringReadMode() throws IOException
     {
         File tmpFile = File.createTempFile("set_length_during_read_mode", "bin");
-        RandomAccessReader file = RandomAccessReader.open(tmpFile);
-        file.setLength(4L);
+        try (RandomAccessReader file = RandomAccessReader.open(tmpFile))
+        {
+            file.setLength(4L);
+        }
     }
 
     private SequentialWriter createTempFile(String name) throws IOException
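
The BufferedRandomAccessFileTest changes above wrap RandomAccessReader and SequentialWriter usage in try-with-resources so file handles are released even when a test deliberately provokes an exception. A minimal sketch of the pattern, assuming an empty temporary file:

    // The reader is closed automatically, even if an assertion inside the block throws.
    File tmp = File.createTempFile("braf", "bin");
    tmp.deleteOnExit();
    try (RandomAccessReader reader = RandomAccessReader.open(tmp))
    {
        ByteBuffer contents = reader.readBytes((int) reader.length());
        assert contents.remaining() == 0;
    }
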
diff --git a/test/unit/org/apache/cassandra/io/util/DataOutputTest.java b/test/unit/org/apache/cassandra/io/util/DataOutputTest.java
new file mode 100644
index 0000000..76f3304
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/DataOutputTest.java
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class DataOutputTest
+{
+
+    @Test
+    public void testDataOutputStreamPlus() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStreamPlus write = new DataOutputStreamPlus(bos);
+        DataInput canon = testWrite(write);
+        DataInput test = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
+        testRead(test, canon);
+    }
+
+    @Test
+    public void testDataOutputChannelAndChannel() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStreamPlus write = new DataOutputStreamAndChannel(Channels.newChannel(bos));
+        DataInput canon = testWrite(write);
+        DataInput test = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
+        testRead(test, canon);
+    }
+
+    @Test
+    public void testDataOutputBuffer() throws IOException
+    {
+        DataOutputBuffer write = new DataOutputBuffer();
+        DataInput canon = testWrite(write);
+        DataInput test = new DataInputStream(new ByteArrayInputStream(write.toByteArray()));
+        testRead(test, canon);
+    }
+
+    @Test
+    public void testDataOutputDirectByteBuffer() throws IOException
+    {
+        ByteBuffer buf = wrap(new byte[345], true);
+        DataOutputByteBuffer write = new DataOutputByteBuffer(buf.duplicate());
+        DataInput canon = testWrite(write);
+        DataInput test = new DataInputStream(new ByteArrayInputStream(ByteBufferUtil.getArray(buf)));
+        testRead(test, canon);
+    }
+
+    @Test
+    public void testDataOutputHeapByteBuffer() throws IOException
+    {
+        ByteBuffer buf = wrap(new byte[345], false);
+        DataOutputByteBuffer write = new DataOutputByteBuffer(buf.duplicate());
+        DataInput canon = testWrite(write);
+        DataInput test = new DataInputStream(new ByteArrayInputStream(ByteBufferUtil.getArray(buf)));
+        testRead(test, canon);
+    }
+
+    @Test
+    public void testFileOutputStream() throws IOException
+    {
+        File file = FileUtils.createTempFile("dataoutput", "test");
+        try
+        {
+            DataOutputStreamAndChannel write = new DataOutputStreamAndChannel(new FileOutputStream(file));
+            DataInput canon = testWrite(write);
+            write.close();
+            DataInputStream test = new DataInputStream(new FileInputStream(file));
+            testRead(test, canon);
+            test.close();
+        }
+        finally
+        {
+            Assert.assertTrue(file.delete());
+        }
+    }
+
+    @Test
+    public void testRandomAccessFile() throws IOException
+    {
+        File file = FileUtils.createTempFile("dataoutput", "test");
+        try
+        {
+            final RandomAccessFile raf = new RandomAccessFile(file, "rw");
+            DataOutputStreamAndChannel write = new DataOutputStreamAndChannel(Channels.newOutputStream(raf.getChannel()), raf.getChannel());
+            DataInput canon = testWrite(write);
+            write.close();
+            DataInputStream test = new DataInputStream(new FileInputStream(file));
+            testRead(test, canon);
+            test.close();
+        }
+        finally
+        {
+            Assert.assertTrue(file.delete());
+        }
+    }
+
+    @Test
+    public void testSequentialWriter() throws IOException
+    {
+        File file = FileUtils.createTempFile("dataoutput", "test");
+        final SequentialWriter writer = new SequentialWriter(file, 32);
+        DataOutputStreamAndChannel write = new DataOutputStreamAndChannel(writer, writer);
+        DataInput canon = testWrite(write);
+        write.flush();
+        write.close();
+        DataInputStream test = new DataInputStream(new FileInputStream(file));
+        testRead(test, canon);
+        test.close();
+        Assert.assertTrue(file.delete());
+    }
+
+    private DataInput testWrite(DataOutputPlus test) throws IOException
+    {
+        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        final DataOutput canon = new DataOutputStream(bos);
+        Random rnd = ThreadLocalRandom.current();
+
+        byte[] bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithLength(bytes, test);
+        ByteBufferUtil.writeWithLength(bytes, canon);
+
+        bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithLength(wrap(bytes, false), test);
+        ByteBufferUtil.writeWithLength(bytes, canon);
+
+        bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithLength(wrap(bytes, true), test);
+        ByteBufferUtil.writeWithLength(bytes, canon);
+
+        bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithShortLength(bytes, test);
+        ByteBufferUtil.writeWithShortLength(bytes, canon);
+
+        bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithShortLength(wrap(bytes, false), test);
+        ByteBufferUtil.writeWithShortLength(bytes, canon);
+
+        bytes = new byte[50];
+        rnd.nextBytes(bytes);
+        ByteBufferUtil.writeWithShortLength(wrap(bytes, true), test);
+        ByteBufferUtil.writeWithShortLength(bytes, canon);
+        // 318 bytes written so far: 3 x (4 + 50) length-prefixed plus 3 x (2 + 50) short-length-prefixed
+
+        {
+            long v = rnd.nextLong();
+            test.writeLong(v);
+            canon.writeLong(v);
+        }
+        {
+            int v = rnd.nextInt();
+            test.writeInt(v);
+            canon.writeInt(v);
+        }
+        {
+            short v = (short) rnd.nextInt();
+            test.writeShort(v);
+            canon.writeShort(v);
+        }
+        {
+            byte v = (byte) rnd.nextInt();
+            test.write(v);
+            canon.write(v);
+        }
+        {
+            double v = rnd.nextDouble();
+            test.writeDouble(v);
+            canon.writeDouble(v);
+        }
+        {
+            float v = (float) rnd.nextDouble();
+            test.writeFloat(v);
+            canon.writeFloat(v);
+        }
+
+        // 27 more bytes from the primitive writes (8 + 4 + 2 + 1 + 8 + 4)
+        return new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
+    }
+
+    private void testRead(DataInput test, DataInput canon) throws IOException
+    {
+        Assert.assertEquals(ByteBufferUtil.readWithLength(canon), ByteBufferUtil.readWithLength(test));
+        Assert.assertEquals(ByteBufferUtil.readWithLength(canon), ByteBufferUtil.readWithLength(test));
+        Assert.assertEquals(ByteBufferUtil.readWithLength(canon), ByteBufferUtil.readWithLength(test));
+        Assert.assertEquals(ByteBufferUtil.readWithShortLength(canon), ByteBufferUtil.readWithShortLength(test));
+        Assert.assertEquals(ByteBufferUtil.readWithShortLength(canon), ByteBufferUtil.readWithShortLength(test));
+        Assert.assertEquals(ByteBufferUtil.readWithShortLength(canon), ByteBufferUtil.readWithShortLength(test));
+        assert test.readLong() == canon.readLong();
+        assert test.readInt() == canon.readInt();
+        assert test.readShort() == canon.readShort();
+        assert test.readByte() == canon.readByte();
+        assert test.readDouble() == canon.readDouble();
+        assert test.readFloat() == canon.readFloat();
+        try
+        {
+            test.readInt();
+            assert false;
+        }
+        catch (EOFException _)
+        {
+        }
+    }
+
+    private static ByteBuffer wrap(byte[] bytes, boolean direct)
+    {
+        ByteBuffer buf = direct ? ByteBuffer.allocateDirect(bytes.length + 20) : ByteBuffer.allocate(bytes.length + 20);
+        buf.position(10);
+        buf.limit(bytes.length + 10);
+        buf.duplicate().put(bytes);
+        return buf;
+    }
+}
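
The new DataOutputTest checks each DataOutputPlus implementation against a canonical java.io.DataOutputStream by writing the same values through both and comparing what is read back. A short sketch of that round-trip idiom, using the DataOutputBuffer variant and the imports already present in the test:

    // Write through the implementation under test, then read back and verify (sketch).
    DataOutputBuffer out = new DataOutputBuffer();
    ByteBufferUtil.writeWithShortLength(ByteBufferUtil.bytes("k1"), out);
    out.writeLong(42L);
    DataInput in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()));
    assert ByteBufferUtil.bytes("k1").equals(ByteBufferUtil.readWithShortLength(in));
    assert in.readLong() == 42L;
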
diff --git a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
index 7e483a9..714520e 100644
--- a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
@@ -35,6 +35,7 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.net.OutboundTcpConnectionPool;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.db.Keyspace;
 
 import static org.junit.Assert.assertEquals;
 
@@ -47,8 +48,8 @@
     {
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();
+        Keyspace.setInitialized();
         StorageService.instance.initServer(0);
-
     }
 
     private class TestCloudstackSnitch extends CloudstackSnitch
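
The snitch tests in this merge share the same setup change: Keyspace.setInitialized() is now called before StorageService.instance.initServer(0), and the same line is added to the EC2 and GoogleCloud snitch tests below. A sketch of the shared @BeforeClass setup, as it appears in these hunks:

    @BeforeClass
    public static void setup() throws Exception
    {
        SchemaLoader.mkdirs();
        SchemaLoader.cleanup();
        Keyspace.setInitialized();            // added in these hunks: mark keyspaces initialized before the server starts
        StorageService.instance.initServer(0);
    }
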
diff --git a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
index fe48ce5..6015adf 100644
--- a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
@@ -21,16 +21,18 @@
  */
 
 
-import static org.junit.Assert.assertEquals;
-
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.Map;
 
+import org.junit.AfterClass;
 import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
@@ -38,9 +40,8 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.net.OutboundTcpConnectionPool;
 import org.apache.cassandra.service.StorageService;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
 
 public class EC2SnitchTest
 {
@@ -51,6 +52,7 @@
     {
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();
+        Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }
 
diff --git a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
index 09f96db..70080a8 100644
--- a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
@@ -52,6 +52,7 @@
     {
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();
+        Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }
 
diff --git a/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java b/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java
new file mode 100644
index 0000000..9026ebf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.locator;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.cassandra.utils.FBUtilities;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link GossipingPropertyFileSnitch}.
+ */
+public class GossipingPropertyFileSnitchTest
+{
+    @Test
+    public void testAutoReloadConfig() throws Exception
+    {
+        String confFile = FBUtilities.resourceToFile(SnitchProperties.RACKDC_PROPERTY_FILENAME);
+
+        final GossipingPropertyFileSnitch snitch = new GossipingPropertyFileSnitch(/*refreshPeriodInSeconds*/1);
+        YamlFileNetworkTopologySnitchTest.checkEndpoint(snitch, FBUtilities.getBroadcastAddress().getHostAddress(), "DC1", "RAC1");
+
+        final Path effectiveFile = Paths.get(confFile);
+        final Path backupFile = Paths.get(confFile + ".bak");
+        final Path modifiedFile = Paths.get(confFile + ".mod");
+
+        try
+        {
+            Files.copy(effectiveFile, backupFile);
+            Files.copy(modifiedFile, effectiveFile, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+
+            Thread.sleep(1500);
+
+            YamlFileNetworkTopologySnitchTest.checkEndpoint(snitch, FBUtilities.getBroadcastAddress().getHostAddress(), "DC2", "RAC2");
+        }
+        finally
+        {
+            Files.copy(backupFile, effectiveFile, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            Files.delete(backupFile);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java b/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java
index a11a128..14cb54d 100644
--- a/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java
@@ -154,7 +154,7 @@
         tmd.updateNormalToken(endpointToken, ep);
     }
 
-    private void testGetEndpoints(AbstractReplicationStrategy strategy, Token[] keyTokens) throws UnknownHostException
+    private void testGetEndpoints(AbstractReplicationStrategy strategy, Token[] keyTokens)
     {
         for (Token keyToken : keyTokens)
         {
@@ -181,7 +181,7 @@
         BigIntegerToken newToken = new BigIntegerToken("21267647932558653966460912964485513216");
         BigIntegerToken[] tokens = initTokens();
         BigIntegerToken[] tokensAfterMove = initTokensAfterMove(tokens, movingNodeIdx, newToken);
-        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx, newToken);
+        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx);
 
         assertEquals(ranges.left.iterator().next().left, tokensAfterMove[movingNodeIdx]);
         assertEquals(ranges.left.iterator().next().right, tokens[movingNodeIdx]);
@@ -198,7 +198,7 @@
         BigIntegerToken newToken = new BigIntegerToken("35267647932558653966460912964485513216");
         BigIntegerToken[] tokens = initTokens();
         BigIntegerToken[] tokensAfterMove = initTokensAfterMove(tokens, movingNodeIdx, newToken);
-        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx, newToken);
+        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx);
 
         assertEquals("No data should be streamed", ranges.left.size(), 0);
         assertEquals(ranges.right.iterator().next().left, tokens[movingNodeIdx]);
@@ -216,7 +216,7 @@
         BigIntegerToken newToken = new BigIntegerToken("90070591730234615865843651857942052864");
         BigIntegerToken[] tokens = initTokens();
         BigIntegerToken[] tokensAfterMove = initTokensAfterMove(tokens, movingNodeIdx, newToken);
-        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx, newToken);
+        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx);
 
         // sort the results, so they can be compared
         Range[] toStream = ranges.left.toArray(new Range[0]);
@@ -248,7 +248,7 @@
         BigIntegerToken newToken = new BigIntegerToken("52535295865117307932921825928971026432");
         BigIntegerToken[] tokens = initTokens();
         BigIntegerToken[] tokensAfterMove = initTokensAfterMove(tokens, movingNodeIdx, newToken);
-        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx, newToken);
+        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx);
 
 
         // sort the results, so they can be compared
@@ -280,7 +280,7 @@
         BigIntegerToken newToken = new BigIntegerToken("158873535527910577765226390751398592512");
         BigIntegerToken[] tokens = initTokens();
         BigIntegerToken[] tokensAfterMove = initTokensAfterMove(tokens, movingNodeIdx, newToken);
-        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx, newToken);
+        Pair<Set<Range<Token>>, Set<Range<Token>>> ranges = calculateStreamAndFetchRanges(tokens, tokensAfterMove, movingNodeIdx);
 
         Range[] toStream = ranges.left.toArray(new Range[0]);
         Range[] toFetch = ranges.right.toArray(new Range[0]);
@@ -350,7 +350,7 @@
 
     }
 
-    private Pair<Set<Range<Token>>, Set<Range<Token>>> calculateStreamAndFetchRanges(BigIntegerToken[] tokens, BigIntegerToken[] tokensAfterMove, int movingNodeIdx, BigIntegerToken newToken) throws UnknownHostException
+    private Pair<Set<Range<Token>>, Set<Range<Token>>> calculateStreamAndFetchRanges(BigIntegerToken[] tokens, BigIntegerToken[] tokensAfterMove, int movingNodeIdx) throws UnknownHostException
     {
         RackInferringSnitch endpointSnitch = new RackInferringSnitch();
 
diff --git a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
index 67b80f2..d4978ae 100644
--- a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
@@ -31,11 +31,9 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.*;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.StorageServiceAccessor;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -50,7 +48,7 @@
     }
 
     @Test
-    public void testBigIntegerEndpoints() throws UnknownHostException, ConfigurationException
+    public void testBigIntegerEndpoints() throws UnknownHostException
     {
         List<Token> endpointTokens = new ArrayList<Token>();
         List<Token> keyTokens = new ArrayList<Token>();
@@ -62,7 +60,7 @@
     }
 
     @Test
-    public void testStringEndpoints() throws UnknownHostException, ConfigurationException
+    public void testStringEndpoints() throws UnknownHostException
     {
         IPartitioner partitioner = new OrderPreservingPartitioner();
 
@@ -77,7 +75,7 @@
 
     // given a list of endpoint tokens, and a set of key tokens falling between the endpoint tokens,
     // make sure that the Strategy picks the right endpoints for the keys.
-    private void verifyGetNaturalEndpoints(Token[] endpointTokens, Token[] keyTokens) throws UnknownHostException, ConfigurationException
+    private void verifyGetNaturalEndpoints(Token[] endpointTokens, Token[] keyTokens) throws UnknownHostException
     {
         TokenMetadata tmd;
         AbstractReplicationStrategy strategy;
@@ -106,7 +104,7 @@
     }
 
     @Test
-    public void testGetEndpointsDuringBootstrap() throws UnknownHostException, ConfigurationException
+    public void testGetEndpointsDuringBootstrap() throws UnknownHostException
     {
         // the token difference will be RING_SIZE * 2.
         final int RING_SIZE = 10;
@@ -166,7 +164,7 @@
         StorageServiceAccessor.setTokenMetadata(oldTmd);
     }
 
-    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd) throws ConfigurationException
+    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
         KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
diff --git a/test/unit/org/apache/cassandra/locator/YamlFileNetworkTopologySnitchTest.java b/test/unit/org/apache/cassandra/locator/YamlFileNetworkTopologySnitchTest.java
index be1c24b..af1a7e9 100644
--- a/test/unit/org/apache/cassandra/locator/YamlFileNetworkTopologySnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/YamlFileNetworkTopologySnitchTest.java
@@ -88,7 +88,7 @@
      * @param expectedRack
      *            expected rack
      */
-    private void checkEndpoint(final AbstractNetworkTopologySnitch snitch,
+    public static void checkEndpoint(final AbstractNetworkTopologySnitch snitch,
             final String endpointString, final String expectedDatacenter,
             final String expectedRack)
     {
diff --git a/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java
new file mode 100644
index 0000000..88104a6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import java.io.IOException;
+
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.Session;
+
+import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+
+import static junit.framework.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(OrderedJUnit4ClassRunner.class)
+public class CQLMetricsTest extends SchemaLoader
+{
+    private static EmbeddedCassandraService cassandra;
+
+    private static Cluster cluster;
+    private static Session session;
+    private static PreparedStatement metricsStatement;
+
+    @BeforeClass()
+    public static void setup() throws ConfigurationException, IOException
+    {
+        Schema.instance.clear();
+
+        cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build();
+        session = cluster.connect();
+
+        session.execute("CREATE KEYSPACE IF NOT EXISTS junit WITH replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };");
+        session.execute("CREATE TABLE IF NOT EXISTS junit.metricstest (id int PRIMARY KEY, val text);");
+    }
+
+    @Test
+    public void testPreparedStatementsCount()
+    {
+        assertEquals(0, (int) QueryProcessor.metrics.preparedStatementsCount.value());
+        metricsStatement = session.prepare("INSERT INTO junit.metricstest (id, val) VALUES (?, ?)");
+        assertEquals(1, (int) QueryProcessor.metrics.preparedStatementsCount.value());
+    }
+
+    @Test
+    public void testRegularStatementsExecuted()
+    {
+        clearMetrics();
+
+        assertEquals(0, QueryProcessor.metrics.preparedStatementsExecuted.count());
+        assertEquals(0, QueryProcessor.metrics.regularStatementsExecuted.count());
+
+        for (int i = 0; i < 10; i++)
+            session.execute(String.format("INSERT INTO junit.metricstest (id, val) VALUES (%d, '%s')", i, "val" + i));
+
+        assertEquals(0, QueryProcessor.metrics.preparedStatementsExecuted.count());
+        assertEquals(10, QueryProcessor.metrics.regularStatementsExecuted.count());
+    }
+
+    @Test
+    public void testPreparedStatementsExecuted()
+    {
+        clearMetrics();
+
+        assertEquals(0, QueryProcessor.metrics.preparedStatementsExecuted.count());
+        assertEquals(0, QueryProcessor.metrics.regularStatementsExecuted.count());
+
+        for (int i = 0; i < 10; i++)
+            session.execute(metricsStatement.bind(i, "val" + i));
+
+        assertEquals(10, QueryProcessor.metrics.preparedStatementsExecuted.count());
+        assertEquals(0, QueryProcessor.metrics.regularStatementsExecuted.count());
+    }
+
+    @Test
+    public void testPreparedStatementsRatio()
+    {
+        clearMetrics();
+
+        assertEquals(Double.NaN, QueryProcessor.metrics.preparedStatementsRatio.value());
+
+        for (int i = 0; i < 10; i++)
+            session.execute(metricsStatement.bind(i, "val" + i));
+        assertEquals(1.0, QueryProcessor.metrics.preparedStatementsRatio.value());
+
+        for (int i = 0; i < 10; i++)
+            session.execute(String.format("INSERT INTO junit.metricstest (id, val) VALUES (%d, '%s')", i, "val" + i));
+        assertEquals(0.5, QueryProcessor.metrics.preparedStatementsRatio.value());
+    }
+
+    private void clearMetrics()
+    {
+        QueryProcessor.metrics.preparedStatementsExecuted.clear();
+        QueryProcessor.metrics.regularStatementsExecuted.clear();
+        QueryProcessor.metrics.preparedStatementsEvicted.clear();
+    }
+}
+
diff --git a/test/unit/org/apache/cassandra/repair/DifferencerTest.java b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
index b6dce40..bc0f0de 100644
--- a/test/unit/org/apache/cassandra/repair/DifferencerTest.java
+++ b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.repair;
 
 import java.net.InetAddress;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.UUID;
@@ -26,6 +27,8 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Range;
@@ -33,6 +36,7 @@
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.sink.IMessageSink;
 import org.apache.cassandra.sink.SinkManager;
 import org.apache.cassandra.repair.messages.RepairMessage;
@@ -82,7 +86,7 @@
             }
         });
         Range<Token> range = new Range<>(partirioner.getMinimumToken(), partirioner.getRandomToken());
-        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), "Keyspace1", "Standard1", range);
+        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), "Keyspace1", "Standard1", range);
 
         MerkleTree tree1 = createInitialTree(desc);
         MerkleTree tree2 = createInitialTree(desc);
@@ -101,7 +105,13 @@
     public void testDifference() throws Throwable
     {
         Range<Token> range = new Range<>(partirioner.getMinimumToken(), partirioner.getRandomToken());
-        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), "Keyspace1", "Standard1", range);
+        UUID parentRepairSession = UUID.randomUUID();
+        Keyspace keyspace = Keyspace.open("Keyspace1");
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
+
+        ActiveRepairService.instance.registerParentRepairSession(parentRepairSession, Arrays.asList(cfs), Arrays.asList(range));
+
+        RepairJobDesc desc = new RepairJobDesc(parentRepairSession, UUID.randomUUID(), "Keyspace1", "Standard1", range);
 
         MerkleTree tree1 = createInitialTree(desc);
         MerkleTree tree2 = createInitialTree(desc);
diff --git a/test/unit/org/apache/cassandra/repair/ValidatorTest.java b/test/unit/org/apache/cassandra/repair/ValidatorTest.java
index 9fa5d89..4d65cdb 100644
--- a/test/unit/org/apache/cassandra/repair/ValidatorTest.java
+++ b/test/unit/org/apache/cassandra/repair/ValidatorTest.java
@@ -17,21 +17,26 @@
  */
 package org.apache.cassandra.repair;
 
+import java.io.IOException;
 import java.net.InetAddress;
+import java.security.MessageDigest;
 import java.util.UUID;
 
 import org.junit.After;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
-import org.apache.cassandra.db.compaction.PrecompactedRow;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.compaction.AbstractCompactedRow;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.ColumnStats;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -41,13 +46,10 @@
 import org.apache.cassandra.repair.messages.ValidationComplete;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.SimpleCondition;
+import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 public class ValidatorTest extends SchemaLoader
 {
@@ -65,7 +67,7 @@
     public void testValidatorComplete() throws Throwable
     {
         Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getRandomToken());
-        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), keyspace, columnFamily, range);
+        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, range);
 
         final SimpleCondition lock = new SimpleCondition();
         SinkManager.add(new IMessageSink()
@@ -102,30 +104,52 @@
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
 
         Validator validator = new Validator(desc, remote, 0);
-        validator.prepare(cfs);
+        MerkleTree tree = new MerkleTree(cfs.partitioner, validator.desc.range, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, 15));
+        validator.prepare(cfs, tree);
 
         // and confirm that the tree was split
-        assertTrue(validator.tree.size() > 1);
+        assertTrue(tree.size() > 1);
 
         // add a row
         Token mid = partitioner.midpoint(range.left, range.right);
-        validator.add(new PrecompactedRow(new DecoratedKey(mid, ByteBufferUtil.bytes("inconceivable!")),
-                                                 TreeMapBackedSortedColumns.factory.create(cfs.metadata)));
+        validator.add(new CompactedRowStub(new BufferDecoratedKey(mid, ByteBufferUtil.bytes("inconceivable!"))));
         validator.complete();
 
         // confirm that the tree was validated
-        Token min = validator.tree.partitioner().getMinimumToken();
-        assertNotNull(validator.tree.hash(new Range<>(min, min)));
+        Token min = tree.partitioner().getMinimumToken();
+        assertNotNull(tree.hash(new Range<>(min, min)));
 
         if (!lock.isSignaled())
             lock.await();
     }
 
+    private static class CompactedRowStub extends AbstractCompactedRow
+    {
+        private CompactedRowStub(DecoratedKey key)
+        {
+            super(key);
+        }
+
+        public RowIndexEntry write(long currentPosition, DataOutputPlus out) throws IOException
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void update(MessageDigest digest) { }
+
+        public ColumnStats columnStats()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void close() throws IOException { }
+    }
+
     @Test
     public void testValidatorFailed() throws Throwable
     {
         Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getRandomToken());
-        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), keyspace, columnFamily, range);
+        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, range);
 
         final SimpleCondition lock = new SimpleCondition();
         SinkManager.add(new IMessageSink()
diff --git a/test/unit/org/apache/cassandra/serializers/ClientUtilsTest.java b/test/unit/org/apache/cassandra/serializers/ClientUtilsTest.java
index 67bef2e..563d6cb 100644
--- a/test/unit/org/apache/cassandra/serializers/ClientUtilsTest.java
+++ b/test/unit/org/apache/cassandra/serializers/ClientUtilsTest.java
@@ -23,7 +23,6 @@
 
 import java.math.BigDecimal;
 import java.math.BigInteger;
-import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.sql.Date;
 import java.util.UUID;
@@ -35,7 +34,7 @@
 {
     /** Exercises the classes in the clientutil jar to expose missing dependencies. */
     @Test
-    public void test() throws UnknownHostException
+    public void test()
     {
         AsciiSerializer.instance.deserialize(AsciiSerializer.instance.serialize("string"));
         BooleanSerializer.instance.deserialize(BooleanSerializer.instance.serialize(true));
diff --git a/test/unit/org/apache/cassandra/service/AntiEntropyServiceCounterTest.java b/test/unit/org/apache/cassandra/service/AntiEntropyServiceCounterTest.java
index 0950f1d..4a82183 100644
--- a/test/unit/org/apache/cassandra/service/AntiEntropyServiceCounterTest.java
+++ b/test/unit/org/apache/cassandra/service/AntiEntropyServiceCounterTest.java
@@ -24,6 +24,7 @@
 import java.util.LinkedList;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.db.ConsistencyLevel;
 
@@ -38,8 +39,8 @@
     public List<IMutation> getWriteData()
     {
         List<IMutation> rms = new LinkedList<IMutation>();
-        RowMutation rm = new RowMutation(keyspaceName, ByteBufferUtil.bytes("key1"));
-        rm.addCounter(cfname, ByteBufferUtil.bytes("Column1"), 42);
+        Mutation rm = new Mutation(keyspaceName, ByteBufferUtil.bytes("key1"));
+        rm.addCounter(cfname, CellNames.simpleDense(ByteBufferUtil.bytes("Column1")), 42);
         rms.add(new CounterMutation(rm, ConsistencyLevel.ONE));
         return rms;
     }
diff --git a/test/unit/org/apache/cassandra/service/AntiEntropyServiceStandardTest.java b/test/unit/org/apache/cassandra/service/AntiEntropyServiceStandardTest.java
index 169be2d..f4025cc 100644
--- a/test/unit/org/apache/cassandra/service/AntiEntropyServiceStandardTest.java
+++ b/test/unit/org/apache/cassandra/service/AntiEntropyServiceStandardTest.java
@@ -23,6 +23,7 @@
 import java.util.List;
 import java.util.LinkedList;
 
+import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -37,9 +38,9 @@
     public List<IMutation> getWriteData()
     {
         List<IMutation> rms = new LinkedList<IMutation>();
-        RowMutation rm;
-        rm = new RowMutation(keyspaceName, ByteBufferUtil.bytes("key1"));
-        rm.add(cfname, ByteBufferUtil.bytes("Column1"), ByteBufferUtil.bytes("asdfasdf"), 0);
+        Mutation rm;
+        rm = new Mutation(keyspaceName, ByteBufferUtil.bytes("key1"));
+        rm.add(cfname, Util.cellname("Column1"), ByteBufferUtil.bytes("asdfasdf"), 0);
         rms.add(rm);
         return rms;
     }
diff --git a/test/unit/org/apache/cassandra/service/AntiEntropyServiceTestAbstract.java b/test/unit/org/apache/cassandra/service/AntiEntropyServiceTestAbstract.java
index eeb297a..ac39de6 100644
--- a/test/unit/org/apache/cassandra/service/AntiEntropyServiceTestAbstract.java
+++ b/test/unit/org/apache/cassandra/service/AntiEntropyServiceTestAbstract.java
@@ -45,7 +45,6 @@
 import org.apache.cassandra.repair.RepairJobDesc;
 import org.apache.cassandra.utils.FBUtilities;
 
-import static org.apache.cassandra.service.ActiveRepairService.*;
 import static org.junit.Assert.assertEquals;
 
 public abstract class AntiEntropyServiceTestAbstract extends SchemaLoader
@@ -103,7 +102,7 @@
 
         local_range = StorageService.instance.getPrimaryRangesForEndpoint(keyspaceName, LOCAL).iterator().next();
 
-        desc = new RepairJobDesc(UUID.randomUUID(), keyspaceName, cfname, local_range);
+        desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspaceName, cfname, local_range);
         // Set a fake session corresponding to this fake request
         ActiveRepairService.instance.submitArtificialRepairSession(desc);
     }
diff --git a/test/unit/org/apache/cassandra/service/BatchlogEndpointSelectorTest.java b/test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java
similarity index 78%
rename from test/unit/org/apache/cassandra/service/BatchlogEndpointSelectorTest.java
rename to test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java
index 293078d..72e8df5 100644
--- a/test/unit/org/apache/cassandra/service/BatchlogEndpointSelectorTest.java
+++ b/test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java
@@ -17,45 +17,24 @@
  */
 package org.apache.cassandra.service;
 
-import static org.hamcrest.CoreMatchers.is;
-import static org.junit.Assert.assertThat;
-
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.Collection;
-
 import org.junit.Test;
 import org.junit.matchers.JUnitMatchers;
 
 import com.google.common.collect.ImmutableMultimap;
 import com.google.common.collect.Multimap;
 
-public class BatchlogEndpointSelectorTest
+import org.apache.cassandra.db.BatchlogManager;
+
+import static org.junit.Assert.assertThat;
+import static org.hamcrest.CoreMatchers.is;
+
+public class BatchlogEndpointFilterTest
 {
-    private final BatchlogEndpointSelector target;
     private static final String LOCAL = "local";
-    
 
-    public BatchlogEndpointSelectorTest() throws UnknownHostException
-    {
-        target = new BatchlogEndpointSelector(LOCAL)
-        {
-            @Override
-            protected boolean isValid(InetAddress input)
-            {   
-                //we will use always alive non-localhost endpoints
-                return true;
-            }
-
-            @Override
-            protected int getRandomInt(int bound)
-            {
-                //we don't need a random behavior here
-                return bound - 1;
-            }
-        };
-    }
-    
     @Test
     public void shouldSelect2hostsFromNonLocalRacks() throws UnknownHostException
     {
@@ -67,12 +46,12 @@
                 .put("2", InetAddress.getByName("2"))
                 .put("2", InetAddress.getByName("22"))
                 .build();
-        Collection<InetAddress> result = target.chooseEndpoints(endpoints);
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
         assertThat(result.size(), is(2));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("11")));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("22")));
     }
-    
+
     @Test
     public void shouldSelectHostFromLocal() throws UnknownHostException
     {
@@ -81,23 +60,23 @@
                 .put(LOCAL, InetAddress.getByName("00"))
                 .put("1", InetAddress.getByName("1"))
                 .build();
-        Collection<InetAddress> result = target.chooseEndpoints(endpoints);
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
         assertThat(result.size(), is(2));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("1")));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
     }
-    
+
     @Test
     public void shouldReturnAsIsIfNoEnoughEndpoints() throws UnknownHostException
     {
         Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
                 .put(LOCAL, InetAddress.getByName("0"))
                 .build();
-        Collection<InetAddress> result = target.chooseEndpoints(endpoints);
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
         assertThat(result.size(), is(1));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
     }
-    
+
     @Test
     public void shouldSelectTwoFirstHostsFromSingleOtherRack() throws UnknownHostException
     {
@@ -108,9 +87,31 @@
                 .put("1", InetAddress.getByName("11"))
                 .put("1", InetAddress.getByName("111"))
                 .build();
-        Collection<InetAddress> result = target.chooseEndpoints(endpoints);
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
         assertThat(result.size(), is(2));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("1")));
         assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("11")));
     }
+
+    private static class TestEndpointFilter extends BatchlogManager.EndpointFilter
+    {
+        public TestEndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
+        {
+            super(localRack, endpoints);
+        }
+
+        @Override
+        protected boolean isValid(InetAddress input)
+        {
+            // Treat every endpoint as valid (i.e., alive and non-localhost)
+            return true;
+        }
+
+        @Override
+        protected int getRandomInt(int bound)
+        {
+            // We don't need random behavior here
+            return bound - 1;
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java b/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java
index a70c7c0..24b5a74 100644
--- a/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java
+++ b/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java
@@ -26,7 +26,6 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.thrift.*;
@@ -61,7 +60,7 @@
      * @throws InterruptedException
      */
     @BeforeClass
-    public static void setup() throws TTransportException, IOException, InterruptedException, ConfigurationException
+    public static void setup() throws TTransportException, IOException, InterruptedException
     {
         Schema.instance.clear(); // Schema are now written on disk and will be reloaded
         cassandra = new EmbeddedCassandraService();
diff --git a/test/unit/org/apache/cassandra/service/InitClientTest.java b/test/unit/org/apache/cassandra/service/InitClientTest.java
index 7d44cd8..4ce0678 100644
--- a/test/unit/org/apache/cassandra/service/InitClientTest.java
+++ b/test/unit/org/apache/cassandra/service/InitClientTest.java
@@ -2,8 +2,6 @@
 
 import org.junit.Test;
 
-import java.io.IOException;
-
 import org.apache.cassandra.exceptions.ConfigurationException;
 
 /**
@@ -28,7 +26,7 @@
 public class InitClientTest // extends CleanupHelper
 {
     @Test
-    public void testInitClientStartup() throws IOException, ConfigurationException
+    public void testInitClientStartup() throws ConfigurationException
     {
         StorageService.instance.initClient(0);
     }
diff --git a/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java b/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java
index 0317017..eef8c86 100644
--- a/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java
+++ b/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java
@@ -19,7 +19,6 @@
 
 package org.apache.cassandra.service;
 
-import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.*;
@@ -51,7 +50,7 @@
     private static IPartitioner oldPartitioner;
 
     @BeforeClass
-    public static void setup() throws IOException, ConfigurationException
+    public static void setup() throws ConfigurationException
     {
         oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner);
         SchemaLoader.loadSchema();
@@ -142,7 +141,7 @@
      * simultaneously
      */
     @Test
-    public void testSimultaneousMove() throws UnknownHostException, ConfigurationException
+    public void testSimultaneousMove() throws UnknownHostException
     {
         StorageService ss = StorageService.instance;
         final int RING_SIZE = 10;
@@ -246,7 +245,7 @@
             for (int i = 0; i < keyTokens.size(); i++)
             {
                 endpoints = tmd.getWriteEndpoints(keyTokens.get(i), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(i)));
-                assertTrue(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).size() == endpoints.size());
+                assertEquals(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).size(), endpoints.size());
                 assertTrue(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).containsAll(endpoints));
             }
 
@@ -257,7 +256,7 @@
             for (int i=0; i<3; ++i)
             {
                 endpoints = tmd.getWriteEndpoints(keyTokens.get(i), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(i)));
-                assertTrue(endpoints.size() == 3);
+                assertEquals(3, endpoints.size());
                 assertTrue(endpoints.contains(hosts.get(i+1)));
                 assertTrue(endpoints.contains(hosts.get(i+2)));
                 assertTrue(endpoints.contains(hosts.get(i+3)));
@@ -265,7 +264,7 @@
 
             // token 35 should go to nodes 4, 5, 6, 7 and boot1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(3), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(3)));
-            assertTrue(endpoints.size() == 5);
+            assertEquals(5, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(4)));
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(hosts.get(6)));
@@ -274,7 +273,7 @@
 
             // token 45 should go to nodes 5, 6, 7, 0, boot1 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(4), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(4)));
-            assertTrue(endpoints.size() == 6);
+            assertEquals(6, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(hosts.get(6)));
             assertTrue(endpoints.contains(hosts.get(7)));
@@ -284,7 +283,7 @@
 
             // token 55 should go to nodes 6, 7, 8, 0, 1, boot1 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(5), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(5)));
-            assertTrue(endpoints.size() == 7);
+            assertEquals(7, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(6)));
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(hosts.get(8)));
@@ -295,7 +294,7 @@
 
             // token 65 should go to nodes 7, 8, 9, 0, 1 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(6), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(6)));
-            assertTrue(endpoints.size() == 6);
+            assertEquals(6, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(hosts.get(8)));
             assertTrue(endpoints.contains(hosts.get(9)));
@@ -305,7 +304,7 @@
 
             // token 75 should to go nodes 8, 9, 0, 1, 2 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(7), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(7)));
-            assertTrue(endpoints.size() == 6);
+            assertEquals(6, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(8)));
             assertTrue(endpoints.contains(hosts.get(9)));
             assertTrue(endpoints.contains(hosts.get(0)));
@@ -315,7 +314,7 @@
 
             // token 85 should go to nodes 9, 0, 1 and 2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(8), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(8)));
-            assertTrue(endpoints.size() == 4);
+            assertEquals(4, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(9)));
             assertTrue(endpoints.contains(hosts.get(0)));
             assertTrue(endpoints.contains(hosts.get(1)));
@@ -323,7 +322,7 @@
 
             // token 95 should go to nodes 0, 1 and 2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(9), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(9)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(0)));
             assertTrue(endpoints.contains(hosts.get(1)));
             assertTrue(endpoints.contains(hosts.get(2)));
@@ -368,7 +367,7 @@
             for (int i = 0; i < keyTokens.size(); i++)
             {
                 endpoints = tmd.getWriteEndpoints(keyTokens.get(i), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(i)));
-                assertTrue(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).size() == endpoints.size());
+                assertEquals(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).size(), endpoints.size());
                 assertTrue(expectedEndpoints.get(keyspaceName).get(keyTokens.get(i)).containsAll(endpoints));
             }
 
@@ -379,7 +378,7 @@
             for (int i=0; i<3; ++i)
             {
                 endpoints = tmd.getWriteEndpoints(keyTokens.get(i), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(i)));
-                assertTrue(endpoints.size() == 3);
+                assertEquals(3, endpoints.size());
                 assertTrue(endpoints.contains(hosts.get(i+1)));
                 assertTrue(endpoints.contains(hosts.get(i+2)));
                 assertTrue(endpoints.contains(hosts.get(i+3)));
@@ -387,21 +386,21 @@
 
             // token 35 goes to nodes 4, 5 and boot1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(3), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(3)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(4)));
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(boot1));
 
             // token 45 goes to nodes 5, boot1 and node7
             endpoints = tmd.getWriteEndpoints(keyTokens.get(4), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(4)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(boot1));
             assertTrue(endpoints.contains(hosts.get(7)));
 
             // token 55 goes to boot1, 7, boot2, 8 and 0
             endpoints = tmd.getWriteEndpoints(keyTokens.get(5), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(5)));
-            assertTrue(endpoints.size() == 5);
+            assertEquals(5, endpoints.size());
             assertTrue(endpoints.contains(boot1));
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(boot2));
@@ -410,7 +409,7 @@
 
             // token 65 goes to nodes 7, boot2, 8, 0 and 1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(6), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(6)));
-            assertTrue(endpoints.size() == 5);
+            assertEquals(5, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(boot2));
             assertTrue(endpoints.contains(hosts.get(8)));
@@ -419,7 +418,7 @@
 
             // token 75 goes to nodes boot2, 8, 0, 1 and 2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(7), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(7)));
-            assertTrue(endpoints.size() == 5);
+            assertEquals(5, endpoints.size());
             assertTrue(endpoints.contains(boot2));
             assertTrue(endpoints.contains(hosts.get(8)));
             assertTrue(endpoints.contains(hosts.get(0)));
@@ -428,14 +427,14 @@
 
             // token 85 goes to nodes 0, 1 and 2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(8), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(8)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(0)));
             assertTrue(endpoints.contains(hosts.get(1)));
             assertTrue(endpoints.contains(hosts.get(2)));
 
             // token 95 goes to nodes 0, 1 and 2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(9), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(9)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(0)));
             assertTrue(endpoints.contains(hosts.get(1)));
             assertTrue(endpoints.contains(hosts.get(2)));
@@ -478,7 +477,7 @@
 
         assertFalse(tmd.isMember(hosts.get(2)));
         assertFalse(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(4)).equals(hosts.get(2)));
+        assertEquals(hosts.get(2), tmd.getBootstrapTokens().get(keyTokens.get(4)));
 
         // Bootstrap node hosts.get(3) to keyTokens.get(1)
         Gossiper.instance.injectApplicationState(hosts.get(3), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(1))));
@@ -488,8 +487,8 @@
 
         assertFalse(tmd.isMember(hosts.get(3)));
         assertFalse(tmd.isLeaving(hosts.get(3)));
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(4)).equals(hosts.get(2)));
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(1)).equals(hosts.get(3)));
+        assertEquals(hosts.get(2), tmd.getBootstrapTokens().get(keyTokens.get(4)));
+        assertEquals(hosts.get(3), tmd.getBootstrapTokens().get(keyTokens.get(1)));
 
         // Bootstrap node hosts.get(2) further to keyTokens.get(3)
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(3))));
@@ -499,9 +498,9 @@
 
         assertFalse(tmd.isMember(hosts.get(2)));
         assertFalse(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(3)).equals(hosts.get(2)));
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(4)) == null);
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(1)).equals(hosts.get(3)));
+        assertEquals(hosts.get(2), tmd.getBootstrapTokens().get(keyTokens.get(3)));
+        assertNull(tmd.getBootstrapTokens().get(keyTokens.get(4)));
+        assertEquals(hosts.get(3), tmd.getBootstrapTokens().get(keyTokens.get(1)));
 
         // Go to normal again for both nodes
         Gossiper.instance.injectApplicationState(hosts.get(3), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(2))));
@@ -511,10 +510,10 @@
 
         assertTrue(tmd.isMember(hosts.get(2)));
         assertFalse(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getToken(hosts.get(2)).equals(keyTokens.get(3)));
+        assertEquals(keyTokens.get(3), tmd.getToken(hosts.get(2)));
         assertTrue(tmd.isMember(hosts.get(3)));
         assertFalse(tmd.isLeaving(hosts.get(3)));
-        assertTrue(tmd.getToken(hosts.get(3)).equals(keyTokens.get(2)));
+        assertEquals(keyTokens.get(2), tmd.getToken(hosts.get(3)));
 
         assertTrue(tmd.getBootstrapTokens().isEmpty());
     }
@@ -540,14 +539,14 @@
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.leaving(Collections.singleton(endpointTokens.get(2))));
 
         assertTrue(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getToken(hosts.get(2)).equals(endpointTokens.get(2)));
+        assertEquals(endpointTokens.get(2), tmd.getToken(hosts.get(2)));
 
         // back to normal
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(2))));
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.normal(Collections.singleton(keyTokens.get(2))));
 
         assertTrue(tmd.getLeavingEndpoints().isEmpty());
-        assertTrue(tmd.getToken(hosts.get(2)).equals(keyTokens.get(2)));
+        assertEquals(keyTokens.get(2), tmd.getToken(hosts.get(2)));
 
         // node 3 goes through leave and left and then jumps to normal at its new token
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.leaving(Collections.singleton(keyTokens.get(2))));
@@ -558,7 +557,7 @@
 
         assertTrue(tmd.getBootstrapTokens().isEmpty());
         assertTrue(tmd.getLeavingEndpoints().isEmpty());
-        assertTrue(tmd.getToken(hosts.get(2)).equals(keyTokens.get(4)));
+        assertEquals(keyTokens.get(4), tmd.getToken(hosts.get(2)));
     }
 
     @Test
@@ -582,9 +581,9 @@
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(0))));
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.leaving(Collections.singleton(keyTokens.get(0))));
 
-        assertTrue(tmd.getToken(hosts.get(2)).equals(keyTokens.get(0)));
+        assertEquals(keyTokens.get(0), tmd.getToken(hosts.get(2)));
         assertTrue(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getEndpoint(endpointTokens.get(2)) == null);
+        assertNull(tmd.getEndpoint(endpointTokens.get(2)));
 
         // go to boostrap
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(1))));
@@ -593,13 +592,13 @@
                     valueFactory.bootstrapping(Collections.<Token>singleton(keyTokens.get(1))));
 
         assertFalse(tmd.isLeaving(hosts.get(2)));
-        assertTrue(tmd.getBootstrapTokens().size() == 1);
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(1)).equals(hosts.get(2)));
+        assertEquals(1, tmd.getBootstrapTokens().size());
+        assertEquals(hosts.get(2), tmd.getBootstrapTokens().get(keyTokens.get(1)));
 
         // jump to leaving again
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.leaving(Collections.singleton(keyTokens.get(1))));
 
-        assertTrue(tmd.getEndpoint(keyTokens.get(1)).equals(hosts.get(2)));
+        assertEquals(hosts.get(2), tmd.getEndpoint(keyTokens.get(1)));
         assertTrue(tmd.isLeaving(hosts.get(2)));
         assertTrue(tmd.getBootstrapTokens().isEmpty());
 
@@ -639,8 +638,8 @@
         ss.onChange(hosts.get(3), ApplicationState.STATUS, valueFactory.bootstrapping(Collections.<Token>singleton(keyTokens.get(1))));
 
         assertFalse(tmd.isMember(hosts.get(3)));
-        assertTrue(tmd.getBootstrapTokens().size() == 1);
-        assertTrue(tmd.getBootstrapTokens().get(keyTokens.get(1)).equals(hosts.get(3)));
+        assertEquals(1, tmd.getBootstrapTokens().size());
+        assertEquals(hosts.get(3), tmd.getBootstrapTokens().get(keyTokens.get(1)));
 
         // and then directly to 'left'
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(keyTokens.get(1))));
@@ -667,8 +666,8 @@
         Util.createInitialRing(ss, partitioner, endpointTokens, new ArrayList<Token>(), hosts, new ArrayList<UUID>(), 2);
 
         InetAddress toRemove = hosts.get(1);
-        SystemKeyspace.updatePeerInfo(toRemove, "data_center", "'dc42'");
-        SystemKeyspace.updatePeerInfo(toRemove, "rack", "'rack42'");
+        SystemKeyspace.updatePeerInfo(toRemove, "data_center", "dc42");
+        SystemKeyspace.updatePeerInfo(toRemove, "rack", "rack42");
         assertEquals("rack42", SystemKeyspace.loadDcRackInfo().get(toRemove).get("rack"));
 
         // mark the node as removed
@@ -703,7 +702,7 @@
         return addrs;
     }
 
-    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd) throws ConfigurationException
+    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
         KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
diff --git a/test/unit/org/apache/cassandra/service/MoveTest.java b/test/unit/org/apache/cassandra/service/MoveTest.java
index 834411b..821fff0 100644
--- a/test/unit/org/apache/cassandra/service/MoveTest.java
+++ b/test/unit/org/apache/cassandra/service/MoveTest.java
@@ -19,7 +19,6 @@
 
 package org.apache.cassandra.service;
 
-import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.*;
@@ -57,7 +56,7 @@
      * So instead of extending SchemaLoader, we call it's method below.
      */
     @BeforeClass
-    public static void setup() throws IOException, ConfigurationException
+    public static void setup() throws ConfigurationException
     {
         oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner);
         SchemaLoader.loadSchema();
@@ -150,7 +149,7 @@
      * Test ranges and write endpoints when multiple nodes are on the move simultaneously
      */
     @Test
-    public void testSimultaneousMove() throws UnknownHostException, ConfigurationException
+    public void testSimultaneousMove() throws UnknownHostException
     {
         StorageService ss = StorageService.instance;
         final int RING_SIZE = 10;
@@ -226,35 +225,35 @@
 
         Multimap<InetAddress, Range<Token>> keyspace1ranges = keyspaceStrategyMap.get("Keyspace1").getAddressRanges();
         Collection<Range<Token>> ranges1 = keyspace1ranges.get(InetAddress.getByName("127.0.0.1"));
-        assertEquals(collectionSize(ranges1), 1);
-        assertTrue(ranges1.iterator().next().equals(generateRange(97, 0)));
+        assertEquals(1, collectionSize(ranges1));
+        assertEquals(generateRange(97, 0), ranges1.iterator().next());
         Collection<Range<Token>> ranges2 = keyspace1ranges.get(InetAddress.getByName("127.0.0.2"));
-        assertEquals(collectionSize(ranges2), 1);
-        assertTrue(ranges2.iterator().next().equals(generateRange(0, 10)));
+        assertEquals(1, collectionSize(ranges2));
+        assertEquals(generateRange(0, 10), ranges2.iterator().next());
         Collection<Range<Token>> ranges3 = keyspace1ranges.get(InetAddress.getByName("127.0.0.3"));
-        assertEquals(collectionSize(ranges3), 1);
-        assertTrue(ranges3.iterator().next().equals(generateRange(10, 20)));
+        assertEquals(1, collectionSize(ranges3));
+        assertEquals(generateRange(10, 20), ranges3.iterator().next());
         Collection<Range<Token>> ranges4 = keyspace1ranges.get(InetAddress.getByName("127.0.0.4"));
-        assertEquals(collectionSize(ranges4), 1);
-        assertTrue(ranges4.iterator().next().equals(generateRange(20, 30)));
+        assertEquals(1, collectionSize(ranges4));
+        assertEquals(generateRange(20, 30), ranges4.iterator().next());
         Collection<Range<Token>> ranges5 = keyspace1ranges.get(InetAddress.getByName("127.0.0.5"));
-        assertEquals(collectionSize(ranges5), 1);
-        assertTrue(ranges5.iterator().next().equals(generateRange(30, 40)));
+        assertEquals(1, collectionSize(ranges5));
+        assertEquals(generateRange(30, 40), ranges5.iterator().next());
         Collection<Range<Token>> ranges6 = keyspace1ranges.get(InetAddress.getByName("127.0.0.6"));
-        assertEquals(collectionSize(ranges6), 1);
-        assertTrue(ranges6.iterator().next().equals(generateRange(40, 50)));
+        assertEquals(1, collectionSize(ranges6));
+        assertEquals(generateRange(40, 50), ranges6.iterator().next());
         Collection<Range<Token>> ranges7 = keyspace1ranges.get(InetAddress.getByName("127.0.0.7"));
-        assertEquals(collectionSize(ranges7), 1);
-        assertTrue(ranges7.iterator().next().equals(generateRange(50, 67)));
+        assertEquals(1, collectionSize(ranges7));
+        assertEquals(generateRange(50, 67), ranges7.iterator().next());
         Collection<Range<Token>> ranges8 = keyspace1ranges.get(InetAddress.getByName("127.0.0.8"));
-        assertEquals(collectionSize(ranges8), 1);
-        assertTrue(ranges8.iterator().next().equals(generateRange(67, 70)));
+        assertEquals(1, collectionSize(ranges8));
+        assertEquals(generateRange(67, 70), ranges8.iterator().next());
         Collection<Range<Token>> ranges9 = keyspace1ranges.get(InetAddress.getByName("127.0.0.9"));
-        assertEquals(collectionSize(ranges9), 1);
-        assertTrue(ranges9.iterator().next().equals(generateRange(70, 87)));
+        assertEquals(1, collectionSize(ranges9));
+        assertEquals(generateRange(70, 87), ranges9.iterator().next());
         Collection<Range<Token>> ranges10 = keyspace1ranges.get(InetAddress.getByName("127.0.0.10"));
-        assertEquals(collectionSize(ranges10), 1);
-        assertTrue(ranges10.iterator().next().equals(generateRange(87, 97)));
+        assertEquals(1, collectionSize(ranges10));
+        assertEquals(generateRange(87, 97), ranges10.iterator().next());
 
 
         /**
@@ -408,7 +407,7 @@
             for (Token token : keyTokens)
             {
                 endpoints = tmd.getWriteEndpoints(token, keyspaceName, strategy.getNaturalEndpoints(token));
-                assertTrue(expectedEndpoints.get(keyspaceName).get(token).size() == endpoints.size());
+                assertEquals(expectedEndpoints.get(keyspaceName).get(token).size(), endpoints.size());
                 assertTrue(expectedEndpoints.get(keyspaceName).get(token).containsAll(endpoints));
             }
 
@@ -420,7 +419,7 @@
             for (int i = 0; i < 3; i++)
             {
                 endpoints = tmd.getWriteEndpoints(keyTokens.get(i), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(i)));
-                assertTrue(endpoints.size() == 3);
+                assertEquals(3, endpoints.size());
                 assertTrue(endpoints.contains(hosts.get(i+1)));
                 assertTrue(endpoints.contains(hosts.get(i+2)));
                 assertTrue(endpoints.contains(hosts.get(i+3)));
@@ -428,7 +427,7 @@
 
             // token 35 should go to nodes 4, 5, 6 and boot1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(3), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(3)));
-            assertTrue(endpoints.size() == 4);
+            assertEquals(4, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(4)));
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(hosts.get(6)));
@@ -436,7 +435,7 @@
 
             // token 45 should go to nodes 5, 6, 7 boot1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(4), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(4)));
-            assertTrue(endpoints.size() == 4);
+            assertEquals(4, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(5)));
             assertTrue(endpoints.contains(hosts.get(6)));
             assertTrue(endpoints.contains(hosts.get(7)));
@@ -444,7 +443,7 @@
 
             // token 55 should go to nodes 6, 7, 8 boot1 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(5), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(5)));
-            assertTrue(endpoints.size() == 5);
+            assertEquals(5, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(6)));
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(hosts.get(8)));
@@ -453,7 +452,7 @@
 
             // token 65 should go to nodes 6, 7, 8 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(6), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(6)));
-            assertTrue(endpoints.size() == 4);
+            assertEquals(4, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(6)));
             assertTrue(endpoints.contains(hosts.get(7)));
             assertTrue(endpoints.contains(hosts.get(8)));
@@ -461,7 +460,7 @@
 
             // token 75 should to go nodes 8, 9, 0 and boot2
             endpoints = tmd.getWriteEndpoints(keyTokens.get(7), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(7)));
-            assertTrue(endpoints.size() == 4);
+            assertEquals(4, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(8)));
             assertTrue(endpoints.contains(hosts.get(9)));
             assertTrue(endpoints.contains(hosts.get(0)));
@@ -469,14 +468,14 @@
 
             // token 85 should go to nodes 8, 9 and 0
             endpoints = tmd.getWriteEndpoints(keyTokens.get(8), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(8)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(8)));
             assertTrue(endpoints.contains(hosts.get(9)));
             assertTrue(endpoints.contains(hosts.get(0)));
 
             // token 95 should go to nodes 9, 0 and 1
             endpoints = tmd.getWriteEndpoints(keyTokens.get(9), keyspaceName, strategy.getNaturalEndpoints(keyTokens.get(9)));
-            assertTrue(endpoints.size() == 3);
+            assertEquals(3, endpoints.size());
             assertTrue(endpoints.contains(hosts.get(9)));
             assertTrue(endpoints.contains(hosts.get(0)));
             assertTrue(endpoints.contains(hosts.get(1)));
@@ -513,14 +512,14 @@
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.moving(newToken));
 
         assertTrue(tmd.isMoving(hosts.get(2)));
-        assertTrue(tmd.getToken(hosts.get(2)).equals(endpointTokens.get(2)));
+        assertEquals(endpointTokens.get(2), tmd.getToken(hosts.get(2)));
 
         // back to normal
         Gossiper.instance.injectApplicationState(hosts.get(2), ApplicationState.TOKENS, valueFactory.tokens(Collections.singleton(newToken)));
         ss.onChange(hosts.get(2), ApplicationState.STATUS, valueFactory.normal(Collections.singleton(newToken)));
 
         assertTrue(tmd.getMovingEndpoints().isEmpty());
-        assertTrue(tmd.getToken(hosts.get(2)).equals(newToken));
+        assertEquals(newToken, tmd.getToken(hosts.get(2)));
 
         newToken = positionToken(8);
         // node 2 goes through leave and left and then jumps to normal at its new token
@@ -530,7 +529,7 @@
 
         assertTrue(tmd.getBootstrapTokens().isEmpty());
         assertTrue(tmd.getMovingEndpoints().isEmpty());
-        assertTrue(tmd.getToken(hosts.get(2)).equals(newToken));
+        assertEquals(newToken, tmd.getToken(hosts.get(2)));
     }
 
     private static Collection<InetAddress> makeAddrs(String... hosts) throws UnknownHostException
@@ -541,7 +540,7 @@
         return addrs;
     }
 
-    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd) throws ConfigurationException
+    private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
         KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
diff --git a/test/unit/org/apache/cassandra/service/QueryPagerTest.java b/test/unit/org/apache/cassandra/service/QueryPagerTest.java
index 0645433..e71e97a 100644
--- a/test/unit/org/apache/cassandra/service/QueryPagerTest.java
+++ b/test/unit/org/apache/cassandra/service/QueryPagerTest.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.*;
@@ -37,7 +38,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.*;
-import static org.apache.cassandra.cql3.QueryProcessor.processInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.apache.cassandra.Util.range;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
@@ -47,6 +48,11 @@
     private static final String KS = "Keyspace1";
     private static final String CF = "Standard1";
 
+    private static String string(CellName name)
+    {
+        return string(name.toByteBuffer());
+    }
+
     private static String string(ByteBuffer bb)
     {
         try
@@ -75,7 +81,7 @@
          */
         for (int i = 0; i < nbKeys; i++)
         {
-            RowMutation rm = new RowMutation(KS, bytes("k" + i));
+            Mutation rm = new Mutation(KS, bytes("k" + i));
             ColumnFamily cf = rm.addOrGet(CF);
 
             for (int j = 0; j < nbCols; j++)
@@ -94,7 +100,7 @@
     {
         StringBuilder sb = new StringBuilder();
         for (Row row : rows)
-            sb.append(string(row.key.key)).append(":").append(toString(row.cf)).append("\n");
+            sb.append(string(row.key.getKey())).append(":").append(toString(row.cf)).append("\n");
         return sb.toString();
     }
 
@@ -104,16 +110,16 @@
             return "";
 
         StringBuilder sb = new StringBuilder();
-        for (Column c : cf)
+        for (Cell c : cf)
             sb.append(" ").append(string(c.name()));
         return sb.toString();
     }
 
     private static ReadCommand namesQuery(String key, String... names)
     {
-        SortedSet<ByteBuffer> s = new TreeSet<ByteBuffer>(cfs().metadata.comparator);
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs().metadata.comparator);
         for (String name : names)
-            s.add(bytes(name));
+            s.add(CellNames.simpleDense(bytes(name)));
         return new SliceByNamesReadCommand(KS, bytes(key), CF, System.currentTimeMillis(), new NamesQueryFilter(s, true));
     }
 
@@ -124,22 +130,22 @@
 
     private static ReadCommand sliceQuery(String key, String start, String end, boolean reversed, int count)
     {
-        SliceQueryFilter filter = new SliceQueryFilter(bytes(start), bytes(end), reversed, count);
+        SliceQueryFilter filter = new SliceQueryFilter(CellNames.simpleDense(bytes(start)), CellNames.simpleDense(bytes(end)), reversed, count);
         // Note: for MultiQueryTest, we need the same timestamp/expireBefore for all queries, so we just use 0 as it doesn't matter here.
         return new SliceFromReadCommand(KS, bytes(key), CF, 0, filter);
     }
 
     private static RangeSliceCommand rangeNamesQuery(AbstractBounds<RowPosition> range, int count, String... names)
     {
-        SortedSet<ByteBuffer> s = new TreeSet<ByteBuffer>(cfs().metadata.comparator);
+        SortedSet<CellName> s = new TreeSet<CellName>(cfs().metadata.comparator);
         for (String name : names)
-            s.add(bytes(name));
+            s.add(CellNames.simpleDense(bytes(name)));
         return new RangeSliceCommand(KS, CF, System.currentTimeMillis(), new NamesQueryFilter(s, true), range, count);
     }
 
     private static RangeSliceCommand rangeSliceQuery(AbstractBounds<RowPosition> range, int count, String start, String end)
     {
-        SliceQueryFilter filter = new SliceQueryFilter(bytes(start), bytes(end), false, Integer.MAX_VALUE);
+        SliceQueryFilter filter = new SliceQueryFilter(CellNames.simpleDense(bytes(start)), CellNames.simpleDense(bytes(end)), false, Integer.MAX_VALUE);
         return new RangeSliceCommand(KS, CF, System.currentTimeMillis(), filter, range, count);
     }
 
@@ -153,22 +159,22 @@
 
     private static void assertRow(Row r, String key, ByteBuffer... names)
     {
-        assertEquals(key, string(r.key.key));
+        assertEquals(key, string(r.key.getKey()));
         assertNotNull(r.cf);
         int i = 0;
-        for (Column c : r.cf)
+        for (Cell c : r.cf)
         {
             // Ignore deleted cells if we have them
-            if (!c.isLive(0))
+            if (!c.isLive())
                 continue;
 
             ByteBuffer expected = names[i++];
-            assertEquals("column " + i + " doesn't match: " + toString(r.cf), expected, c.name());
+            assertEquals("column " + i + " doesn't match: " + toString(r.cf), expected, c.name().toByteBuffer());
         }
     }
 
     @Test
-    public void NamesQueryTest() throws Exception
+    public void namesQueryTest() throws Exception
     {
         QueryPager pager = QueryPagers.localPager(namesQuery("k0", "c1", "c5", "c7", "c8"));
 
@@ -181,7 +187,7 @@
     }
 
     @Test
-    public void SliceQueryTest() throws Exception
+    public void sliceQueryTest() throws Exception
     {
         QueryPager pager = QueryPagers.localPager(sliceQuery("k0", "c1", "c8", 10));
 
@@ -231,7 +237,7 @@
     }
 
     @Test
-    public void MultiQueryTest() throws Exception
+    public void multiQueryTest() throws Exception
     {
         QueryPager pager = QueryPagers.localPager(new Pageable.ReadCommands(new ArrayList<ReadCommand>() {{
             add(sliceQuery("k1", "c2", "c6", 10));
@@ -260,7 +266,7 @@
     }
 
     @Test
-    public void RangeNamesQueryTest() throws Exception
+    public void rangeNamesQueryTest() throws Exception
     {
         QueryPager pager = QueryPagers.localPager(rangeNamesQuery(range("k0", "k5"), 100, "c1", "c4", "c8"));
 
@@ -282,7 +288,7 @@
     }
 
     @Test
-    public void RangeSliceQueryTest() throws Exception
+    public void rangeSliceQueryTest() throws Exception
     {
         QueryPager pager = QueryPagers.localPager(rangeSliceQuery(range("k1", "k5"), 100, "c1", "c7"));
 
@@ -331,11 +337,11 @@
         String keyspace = "cql_keyspace";
         String table = "table2";
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
-        CompositeType ct = (CompositeType)cfs.metadata.comparator;
+        CompositeType ct = (CompositeType)cfs.metadata.comparator.asAbstractType();
 
         // Insert rows but with a tombstone as last cell
         for (int i = 0; i < 5; i++)
-            processInternal(String.format("INSERT INTO %s.%s (k, c, v) VALUES ('k%d', 'c%d', null)", keyspace, table, 0, i));
+            executeInternal(String.format("INSERT INTO %s.%s (k, c, v) VALUES ('k%d', 'c%d', null)", keyspace, table, 0, i));
 
         SliceQueryFilter filter = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 100);
         QueryPager pager = QueryPagers.localPager(new SliceFromReadCommand(keyspace, bytes("k0"), table, 0, filter));
diff --git a/test/unit/org/apache/cassandra/service/RemoveTest.java b/test/unit/org/apache/cassandra/service/RemoveTest.java
index 62dd636..649a3f2 100644
--- a/test/unit/org/apache/cassandra/service/RemoveTest.java
+++ b/test/unit/org/apache/cassandra/service/RemoveTest.java
@@ -59,7 +59,7 @@
     UUID removalId;
 
     @BeforeClass
-    public static void setupClass() throws IOException, ConfigurationException
+    public static void setupClass() throws ConfigurationException
     {
         oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner);
         SchemaLoader.loadSchema();
diff --git a/test/unit/org/apache/cassandra/service/RowResolverTest.java b/test/unit/org/apache/cassandra/service/RowResolverTest.java
index c2d57c6..286d037 100644
--- a/test/unit/org/apache/cassandra/service/RowResolverTest.java
+++ b/test/unit/org/apache/cassandra/service/RowResolverTest.java
@@ -26,9 +26,9 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
 
 import static org.junit.Assert.*;
 import static org.apache.cassandra.Util.column;
@@ -39,10 +39,10 @@
     @Test
     public void testResolveSupersetNewer()
     {
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("c1", "v1", 0));
 
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.addColumn(column("c1", "v2", 1));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
@@ -54,10 +54,10 @@
     @Test
     public void testResolveSupersetDisjoint()
     {
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("c1", "v1", 0));
 
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.addColumn(column("c2", "v2", 1));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
@@ -69,7 +69,7 @@
     @Test
     public void testResolveSupersetNullOne()
     {
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.addColumn(column("c2", "v2", 1));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(null, cf2), System.currentTimeMillis());
@@ -81,7 +81,7 @@
     @Test
     public void testResolveSupersetNullTwo()
     {
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("c1", "v1", 0));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, null), System.currentTimeMillis());
@@ -100,10 +100,10 @@
     public void testResolveDeleted()
     {
         // one CF with columns timestamped before a delete in another cf
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.addColumn(column("one", "A", 0));
 
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.delete(new DeletionInfo(1L, (int) (System.currentTimeMillis() / 1000)));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
@@ -118,19 +118,19 @@
     {
         // deletes and columns with interleaved timestamp, with out of order return sequence
 
-        ColumnFamily cf1 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf1.delete(new DeletionInfo(0L, (int) (System.currentTimeMillis() / 1000)));
 
         // these columns created after the previous deletion
-        ColumnFamily cf2 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf2.addColumn(column("one", "A", 1));
         cf2.addColumn(column("two", "A", 1));
 
         //this column created after the next delete
-        ColumnFamily cf3 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf3 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf3.addColumn(column("two", "B", 3));
 
-        ColumnFamily cf4 = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        ColumnFamily cf4 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
         cf4.delete(new DeletionInfo(2L, (int) (System.currentTimeMillis() / 1000)));
 
         ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2, cf3, cf4), System.currentTimeMillis());
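
The RowResolverTest hunks above swap the column container to ArrayBackedSortedColumns but keep the resolve pattern intact. As a rough sketch of that pattern (assuming the Keyspace1/Standard1 schema loaded by SchemaLoader and the static Util.column helper, both already imported by this test class):

    // Two versions of the same row; per testResolveSupersetNewer above, the newer cell should win.
    ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
    cf1.addColumn(column("c1", "v1", 0));   // timestamp 0

    ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
    cf2.addColumn(column("c1", "v2", 1));   // timestamp 1

    ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());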
diff --git a/test/unit/org/apache/cassandra/service/SerializationsTest.java b/test/unit/org/apache/cassandra/service/SerializationsTest.java
index b47f4d8..49632f9 100644
--- a/test/unit/org/apache/cassandra/service/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/service/SerializationsTest.java
@@ -32,7 +32,9 @@
 import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.NodePair;
 import org.apache.cassandra.repair.RepairJobDesc;
 import org.apache.cassandra.repair.Validator;
@@ -49,11 +51,11 @@
 
     private static final UUID RANDOM_UUID = UUID.fromString("b5c3d033-75aa-4c2f-a819-947aac7a0c54");
     private static final Range<Token> FULL_RANGE = new Range<>(StorageService.getPartitioner().getMinimumToken(), StorageService.getPartitioner().getMinimumToken());
-    private static final RepairJobDesc DESC = new RepairJobDesc(RANDOM_UUID, "Keyspace1", "Standard1", FULL_RANGE);
+    private static final RepairJobDesc DESC = new RepairJobDesc(getVersion() < MessagingService.VERSION_21 ? null : RANDOM_UUID, RANDOM_UUID, "Keyspace1", "Standard1", FULL_RANGE);
 
     private void testRepairMessageWrite(String fileName, RepairMessage... messages) throws IOException
     {
-        try (DataOutputStream out = getOutput(fileName))
+        try (DataOutputStreamAndChannel out = getOutput(fileName))
         {
             for (RepairMessage message : messages)
             {
@@ -91,17 +93,18 @@
 
     private void testValidationCompleteWrite() throws IOException
     {
+        IPartitioner p = new RandomPartitioner();
         // empty validation
+        MerkleTree mt = new MerkleTree(p, FULL_RANGE, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, 15));
         Validator v0 = new Validator(DESC, FBUtilities.getBroadcastAddress(),  -1);
-        ValidationComplete c0 = new ValidationComplete(DESC, v0.tree);
+        ValidationComplete c0 = new ValidationComplete(DESC, mt);
 
         // validation with a tree
-        IPartitioner p = new RandomPartitioner();
-        MerkleTree mt = new MerkleTree(p, FULL_RANGE, MerkleTree.RECOMMENDED_DEPTH, Integer.MAX_VALUE);
+        mt = new MerkleTree(p, FULL_RANGE, MerkleTree.RECOMMENDED_DEPTH, Integer.MAX_VALUE);
         for (int i = 0; i < 10; i++)
             mt.split(p.getRandomToken());
-        Validator v1 = new Validator(DESC, FBUtilities.getBroadcastAddress(), mt, -1);
-        ValidationComplete c1 = new ValidationComplete(DESC, v1.tree);
+        Validator v1 = new Validator(DESC, FBUtilities.getBroadcastAddress(), -1);
+        ValidationComplete c1 = new ValidationComplete(DESC, mt);
 
         // validation failed
         ValidationComplete c3 = new ValidationComplete(DESC);
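
In the ValidationComplete hunk above, the Merkle tree is now built directly rather than read off the Validator. A minimal sketch of that construction, assuming the test's RandomPartitioner, a full-ring range, and the MerkleTree/ValidationComplete classes as used by the test class:

    IPartitioner p = new RandomPartitioner();
    Range<Token> fullRange = new Range<>(p.getMinimumToken(), p.getMinimumToken());

    // bounded tree: recommended depth, at most 2^15 leaf ranges
    MerkleTree mt = new MerkleTree(p, fullRange, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, 15));
    for (int i = 0; i < 10; i++)
        mt.split(p.getRandomToken());       // split a handful of ranges before serializing

    ValidationComplete complete = new ValidationComplete(DESC, mt);   // DESC as defined in the test above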
diff --git a/test/unit/org/apache/cassandra/service/StorageServiceClientTest.java b/test/unit/org/apache/cassandra/service/StorageServiceClientTest.java
index 19efe3a..f3ba754 100644
--- a/test/unit/org/apache/cassandra/service/StorageServiceClientTest.java
+++ b/test/unit/org/apache/cassandra/service/StorageServiceClientTest.java
@@ -26,12 +26,11 @@
 import static org.junit.Assert.assertFalse;
 
 import java.io.File;
-import java.io.IOException;
 
 public class StorageServiceClientTest
 {
     @Test
-    public void testClientOnlyMode() throws IOException, ConfigurationException
+    public void testClientOnlyMode() throws ConfigurationException
     {
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();
diff --git a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
index d78c7d6..dd25b35 100644
--- a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
+++ b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
@@ -35,6 +35,8 @@
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
@@ -43,8 +45,6 @@
 import org.apache.cassandra.dht.StringToken;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.locator.IEndpointSnitch;
 import org.apache.cassandra.locator.PropertyFileSnitch;
 import org.apache.cassandra.locator.TokenMetadata;
@@ -60,10 +60,11 @@
     {
         IEndpointSnitch snitch = new PropertyFileSnitch();
         DatabaseDescriptor.setEndpointSnitch(snitch);
+        Keyspace.setInitialized();
     }
 
     @Test
-    public void testRegularMode() throws IOException, InterruptedException, ConfigurationException
+    public void testRegularMode() throws ConfigurationException
     {
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();
@@ -84,14 +85,14 @@
     public void testGetAllRangesEmpty()
     {
         List<Token> toks = Collections.emptyList();
-        assertEquals(Collections.emptyList(), StorageService.instance.getAllRanges(toks));
+        assertEquals(Collections.<Range<Token>>emptyList(), StorageService.instance.getAllRanges(toks));
     }
 
     @Test
     public void testSnapshot() throws IOException
     {
         // no need to insert extra data, even an "empty" database will have a little information in the system keyspace
-        StorageService.instance.takeSnapshot("snapshot", new String[0]);
+        StorageService.instance.takeSnapshot("snapshot");
     }
 
     @Test
@@ -102,6 +103,50 @@
     }
 
     @Test
+    public void testPrimaryRangeForEndpointWithinDCWithNetworkTopologyStrategy() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        // DC1
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));
+
+        // DC2
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
+        metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
+
+        Map<String, String> configOptions = new HashMap<>();
+        configOptions.put("DC1", "1");
+        configOptions.put("DC2", "1");
+
+        Keyspace.clear("Keyspace1");
+        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
+        Schema.instance.setKeyspaceDefinition(meta);
+
+        Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name,
+                                                                                                            InetAddress.getByName("127.0.0.1"));
+        assertEquals(2, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("D"), new StringToken("A"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("D"))));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.2"));
+        assertEquals(2, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C"))));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.4"));
+        assertEquals(2, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("D"), new StringToken("A"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B"))));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.5"));
+        assertEquals(2, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("D"))));
+    }
+
+    @Test
     public void testPrimaryRangesWithNetworkTopologyStrategy() throws Exception
     {
         TokenMetadata metadata = StorageService.instance.getTokenMetadata();
@@ -113,7 +158,7 @@
         metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
         metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
 
-        Map<String, String> configOptions = new HashMap<String, String>();
+        Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC1", "1");
         configOptions.put("DC2", "1");
 
@@ -150,7 +195,7 @@
         metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
         metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
 
-        Map<String, String> configOptions = new HashMap<String, String>();
+        Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC2", "2");
 
         Keyspace.clear("Keyspace1");
@@ -177,6 +222,45 @@
     }
 
     @Test
+    public void testPrimaryRangeForEndpointWithinDCWithNetworkTopologyStrategyOneDCOnly() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+        // DC1
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));
+        // DC2
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
+        metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
+
+        Map<String, String> configOptions = new HashMap<>();
+        configOptions.put("DC2", "2");
+
+        Keyspace.clear("Keyspace1");
+        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
+        Schema.instance.setKeyspaceDefinition(meta);
+
+        // endpoints in DC1 should not have primary range
+        Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
+        assertTrue(primaryRanges.isEmpty());
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name,
+                                                                                   InetAddress.getByName("127.0.0.2"));
+        assertTrue(primaryRanges.isEmpty());
+
+        // endpoints in DC2 should have primary ranges which also cover DC1
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.4"));
+        assertTrue(primaryRanges.size() == 2);
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("D"), new StringToken("A"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B"))));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.5"));
+        assertTrue(primaryRanges.size() == 2);
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("D"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C"))));
+    }
+
+    @Test
     public void testPrimaryRangesWithVnodes() throws Exception
     {
         TokenMetadata metadata = StorageService.instance.getTokenMetadata();
@@ -200,7 +284,7 @@
         dc2.put(InetAddress.getByName("127.0.0.5"), new StringToken("K"));
         metadata.updateNormalTokens(dc2);
 
-        Map<String, String> configOptions = new HashMap<String, String>();
+        Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC2", "2");
 
         Keyspace.clear("Keyspace1");
@@ -238,6 +322,86 @@
         assert primaryRanges.contains(new Range<Token>(new StringToken("H"), new StringToken("I")));
         assert primaryRanges.contains(new Range<Token>(new StringToken("I"), new StringToken("J")));
     }
+
+    @Test
+    public void testPrimaryRangeForEndpointWithinDCWithVnodes() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        // DC1
+        Multimap<InetAddress, Token> dc1 = HashMultimap.create();
+        dc1.put(InetAddress.getByName("127.0.0.1"), new StringToken("A"));
+        dc1.put(InetAddress.getByName("127.0.0.1"), new StringToken("E"));
+        dc1.put(InetAddress.getByName("127.0.0.1"), new StringToken("H"));
+        dc1.put(InetAddress.getByName("127.0.0.2"), new StringToken("C"));
+        dc1.put(InetAddress.getByName("127.0.0.2"), new StringToken("I"));
+        dc1.put(InetAddress.getByName("127.0.0.2"), new StringToken("J"));
+        metadata.updateNormalTokens(dc1);
+
+        // DC2
+        Multimap<InetAddress, Token> dc2 = HashMultimap.create();
+        dc2.put(InetAddress.getByName("127.0.0.4"), new StringToken("B"));
+        dc2.put(InetAddress.getByName("127.0.0.4"), new StringToken("G"));
+        dc2.put(InetAddress.getByName("127.0.0.4"), new StringToken("L"));
+        dc2.put(InetAddress.getByName("127.0.0.5"), new StringToken("D"));
+        dc2.put(InetAddress.getByName("127.0.0.5"), new StringToken("F"));
+        dc2.put(InetAddress.getByName("127.0.0.5"), new StringToken("K"));
+        metadata.updateNormalTokens(dc2);
+
+        Map<String, String> configOptions = new HashMap<>();
+        configOptions.put("DC1", "1");
+        configOptions.put("DC2", "2");
+
+        Keyspace.clear("Keyspace1");
+        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
+        Schema.instance.setKeyspaceDefinition(meta);
+
+        // endpoints in DC1 should have primary ranges which also cover DC2
+        Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
+        assertEquals(8, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("J"), new StringToken("K"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("K"), new StringToken("L"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("L"), new StringToken("A"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("D"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("D"), new StringToken("E"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("E"), new StringToken("F"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("F"), new StringToken("G"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("G"), new StringToken("H"))));
+
+        // endpoints in DC1 should have primary ranges which also cover DC2
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.2"));
+        assertEquals(4, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("H"), new StringToken("I"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("I"), new StringToken("J"))));
+
+        // endpoints in DC2 should have primary ranges which also cover DC1
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.4"));
+        assertEquals(4, primaryRanges.size());
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("F"), new StringToken("G"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("K"), new StringToken("L"))));
+        // because /127.0.0.4 holds token "B", the next token after "A" from /127.0.0.1,
+        // that node also covers the range (L, A]
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("L"), new StringToken("A"))));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.5"));
+        assertTrue(primaryRanges.size() == 8);
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("D"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("E"), new StringToken("F"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("J"), new StringToken("K"))));
+        // ranges from /127.0.0.1
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("D"), new StringToken("E"))));
+        // the next token after "H" in DC2 is "K" on /127.0.0.5, so (G, H] goes to /127.0.0.5
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("G"), new StringToken("H"))));
+        // ranges from /127.0.0.2
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("H"), new StringToken("I"))));
+        assertTrue(primaryRanges.contains(new Range<Token>(new StringToken("I"), new StringToken("J"))));
+    }
+
     @Test
     public void testPrimaryRangesWithSimpleStrategy() throws Exception
     {
@@ -248,7 +412,7 @@
         metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.2"));
         metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.3"));
 
-        Map<String, String> configOptions = new HashMap<String, String>();
+        Map<String, String> configOptions = new HashMap<>();
         configOptions.put("replication_factor", "2");
 
         Keyspace.clear("Keyspace1");
@@ -268,6 +432,37 @@
         assert primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C")));
     }
 
+    /* It does not make much sense to use -local and -pr with SimpleStrategy, but test it anyway to guard against human error */
+    @Test
+    public void testPrimaryRangeForEndpointWithinDCWithSimpleStrategy() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.2"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.3"));
+
+        Map<String, String> configOptions = new HashMap<>();
+        configOptions.put("replication_factor", "2");
+
+        Keyspace.clear("Keyspace1");
+        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "SimpleStrategy", configOptions, false);
+        Schema.instance.setKeyspaceDefinition(meta);
+
+        Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
+        assert primaryRanges.size() == 1;
+        assert primaryRanges.contains(new Range<Token>(new StringToken("C"), new StringToken("A")));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.2"));
+        assert primaryRanges.size() == 1;
+        assert primaryRanges.contains(new Range<Token>(new StringToken("A"), new StringToken("B")));
+
+        primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.3"));
+        assert primaryRanges.size() == 1;
+        assert primaryRanges.contains(new Range<Token>(new StringToken("B"), new StringToken("C")));
+    }
+
     @Test
     public void testCreateRepairRangeFrom() throws Exception
     {
@@ -283,11 +478,6 @@
 
         Map<String, String> configOptions = new HashMap<String, String>();
         configOptions.put("replication_factor", "3");
-
-        Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "SimpleStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
-
         Collection<Range<Token>> repairRangeFrom = StorageService.instance.createRepairRangeFrom("1500", "3700");
         assert repairRangeFrom.size() == 3;
         assert repairRangeFrom.contains(new Range<Token>(new LongToken(1500L), new LongToken(2000L)));
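
The new getPrimaryRangeForEndpointWithinDC tests above all share one setup shape. Condensed, and assuming the snitch and Keyspace.setInitialized() work done in this class's @BeforeClass, the pattern is roughly:

    TokenMetadata metadata = StorageService.instance.getTokenMetadata();
    metadata.clearUnsafe();

    // one token per endpoint; DC membership comes from the configured snitch
    metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
    metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));

    Map<String, String> configOptions = new HashMap<>();
    configOptions.put("DC1", "1");

    Keyspace.clear("Keyspace1");
    KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
    Schema.instance.setKeyspaceDefinition(meta);

    // per-DC primary ranges for one endpoint (the repair -local -pr case these tests guard)
    Collection<Range<Token>> primaryRanges =
        StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));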
diff --git a/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java b/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java
index 5467ec0..cc03e5d 100644
--- a/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java
+++ b/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.filter.ColumnCounter;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -33,7 +34,7 @@
 public class AbstractQueryPagerTest
 {
     @Test
-    public void DiscardFirstTest()
+    public void discardFirstTest()
     {
         TestPager pager = new TestPager();
         List<Row> rows = Arrays.asList(createRow("r1", 1),
@@ -68,7 +69,7 @@
     }
 
     @Test
-    public void DiscardLastTest()
+    public void discardLastTest()
     {
         TestPager pager = new TestPager();
         List<Row> rows = Arrays.asList(createRow("r1", 2),
@@ -104,12 +105,12 @@
 
     private void assertRow(Row row, String name, int... values)
     {
-        assertEquals(row.key.key, ByteBufferUtil.bytes(name));
+        assertEquals(row.key.getKey(), ByteBufferUtil.bytes(name));
         assertEquals(values.length, row.cf.getColumnCount());
 
         int i = 0;
-        for (Column c : row.cf)
-            assertEquals(values[i++], i(c.name()));
+        for (Cell c : row.cf)
+            assertEquals(values[i++], i(c.name().toByteBuffer()));
     }
 
     private Row createRow(String name, int nbCol)
@@ -119,15 +120,15 @@
 
     private ColumnFamily createCF(int nbCol)
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(createMetadata());
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(createMetadata());
         for (int i = 0; i < nbCol; i++)
-            cf.addColumn(bb(i), bb(i), 0);
+            cf.addColumn(CellNames.simpleDense(bb(i)), bb(i), 0);
         return cf;
     }
 
     private CFMetaData createMetadata()
     {
-        return new CFMetaData("ks", "cf", ColumnFamilyType.Standard, Int32Type.instance);
+        return new CFMetaData("ks", "cf", ColumnFamilyType.Standard, CellNames.fromAbstractType(Int32Type.instance, false));
     }
 
     private static ByteBuffer bb(int i)
diff --git a/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java b/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java
index 60fbf40..f015a01 100644
--- a/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java
+++ b/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java
@@ -46,7 +46,7 @@
         }
 
         StreamSummary sending = new StreamSummary(cfId, 10, 100);
-        SessionInfo info = new SessionInfo(local, summaries, Collections.singleton(sending), StreamSession.State.PREPARING);
+        SessionInfo info = new SessionInfo(local, 0, summaries, Collections.singleton(sending), StreamSession.State.PREPARING);
 
         assert info.getTotalFilesToReceive() == 45;
         assert info.getTotalFilesToSend() == 10;
@@ -57,13 +57,13 @@
         assert info.getTotalFilesSent() == 0;
 
         // receive in progress
-        info.updateProgress(new ProgressInfo(local, "test.txt", ProgressInfo.Direction.IN, 50, 100));
+        info.updateProgress(new ProgressInfo(local, 0, "test.txt", ProgressInfo.Direction.IN, 50, 100));
         // still in progress, but not completed yet
         assert info.getTotalSizeReceived() == 50;
         assert info.getTotalSizeSent() == 0;
         assert info.getTotalFilesReceived() == 0;
         assert info.getTotalFilesSent() == 0;
-        info.updateProgress(new ProgressInfo(local, "test.txt", ProgressInfo.Direction.IN, 100, 100));
+        info.updateProgress(new ProgressInfo(local, 0, "test.txt", ProgressInfo.Direction.IN, 100, 100));
         // 1 file should be completed
         assert info.getTotalSizeReceived() == 100;
         assert info.getTotalSizeSent() == 0;
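
SessionInfo and ProgressInfo both grow an extra int argument here, passed as 0 throughout the test; it appears to be the stream session index introduced in 2.1. A sketch of the updated calls, with local, summaries and sending as set up in the test above:

    SessionInfo info = new SessionInfo(local, 0, summaries, Collections.singleton(sending), StreamSession.State.PREPARING);

    // progress events carry the same session index
    info.updateProgress(new ProgressInfo(local, 0, "test.txt", ProgressInfo.Direction.IN, 50, 100));   // 50 of 100 bytes in
    info.updateProgress(new ProgressInfo(local, 0, "test.txt", ProgressInfo.Direction.IN, 100, 100));  // file completed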
diff --git a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
index ce0f9d0..4043ac8 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
@@ -43,7 +43,7 @@
         String ks = "Keyspace1";
         String cf = "Standard1";
 
-        StreamSession session = new StreamSession(FBUtilities.getBroadcastAddress(), null);
+        StreamSession session = new StreamSession(FBUtilities.getBroadcastAddress(), null, 0);
         ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore(cf);
 
         // create two sstables
@@ -59,7 +59,7 @@
         {
             List<Range<Token>> ranges = new ArrayList<>();
             ranges.add(new Range<>(sstable.first.getToken(), sstable.last.getToken()));
-            task.addTransferFile(sstable, 1, sstable.getPositionsForRanges(ranges));
+            task.addTransferFile(sstable, 1, sstable.getPositionsForRanges(ranges), 0);
         }
         assertEquals(2, task.getTotalNumberOfFiles());
 
diff --git a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
index 4cd578d..7d3b274 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
@@ -47,15 +47,13 @@
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableUtils;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.IndexExpression;
-import org.apache.cassandra.thrift.IndexOperator;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CounterId;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.HeapAllocator;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
+import static org.apache.cassandra.Util.cellname;
 import static org.apache.cassandra.Util.column;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
@@ -169,8 +167,8 @@
             String key = "key" + offs[i];
             String col = "col" + offs[i];
             assert cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk(key), cfs.name, System.currentTimeMillis())) != null;
-            assert rows.get(i).key.key.equals(ByteBufferUtil.bytes(key));
-            assert rows.get(i).cf.getColumn(ByteBufferUtil.bytes(col)) != null;
+            assert rows.get(i).key.getKey().equals(ByteBufferUtil.bytes(key));
+            assert rows.get(i).cf.getColumn(cellname(col)) != null;
         }
 
         // and that the max timestamp for the file was rediscovered
@@ -214,7 +212,7 @@
         {
             details.add(new StreamSession.SSTableStreamingSections(sstable,
                                                                    sstable.getPositionsForRanges(ranges),
-                                                                   sstable.estimatedKeysForRanges(ranges)));
+                                                                   sstable.estimatedKeysForRanges(ranges), sstable.getSSTableMetadata().repairedAt));
         }
         return details;
     }
@@ -229,10 +227,10 @@
             public void mutate(String key, String col, long timestamp) throws Exception
             {
                 long val = key.hashCode();
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(keyspace.getName(), cfs.name);
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspace.getName(), cfs.name);
                 cf.addColumn(column(col, "v", timestamp));
-                cf.addColumn(new Column(ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(val), timestamp));
-                RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
+                cf.addColumn(new BufferCell(cellname("birthdate"), ByteBufferUtil.bytes(val), timestamp));
+                Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
                 logger.debug("Applying row to transfer " + rm);
                 rm.apply();
             }
@@ -243,14 +241,14 @@
         {
             long val = key.hashCode();
             IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"),
-                                                              IndexOperator.EQ,
-                                                              ByteBufferUtil.bytes(val));
+                                                       IndexExpression.Operator.EQ,
+                                                       ByteBufferUtil.bytes(val));
             List<IndexExpression> clause = Arrays.asList(expr);
             IDiskAtomFilter filter = new IdentityQueryFilter();
             Range<RowPosition> range = Util.range("", "");
             List<Row> rows = cfs.search(range, clause, filter, 100);
             assertEquals(1, rows.size());
-            assert rows.get(0).key.key.equals(ByteBufferUtil.bytes(key));
+            assert rows.get(0).key.getKey().equals(ByteBufferUtil.bytes(key));
         }
     }
 
@@ -266,14 +264,14 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
 
         String key = "key1";
-        RowMutation rm = new RowMutation(ks, ByteBufferUtil.bytes(key));
+        Mutation rm = new Mutation(ks, ByteBufferUtil.bytes(key));
         // add columns of size slightly less than column_index_size to force insert column index
-        rm.add(cfname, ByteBufferUtil.bytes(1), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize() - 64]), 2);
-        rm.add(cfname, ByteBufferUtil.bytes(6), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize()]), 2);
+        rm.add(cfname, cellname(1), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize() - 64]), 2);
+        rm.add(cfname, cellname(6), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize()]), 2);
         ColumnFamily cf = rm.addOrGet(cfname);
         // add RangeTombstones
-        cf.delete(new DeletionInfo(ByteBufferUtil.bytes(2), ByteBufferUtil.bytes(3), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
-        cf.delete(new DeletionInfo(ByteBufferUtil.bytes(5), ByteBufferUtil.bytes(7), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
+        cf.delete(new DeletionInfo(cellname(2), cellname(3), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
+        cf.delete(new DeletionInfo(cellname(5), cellname(7), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
         rm.apply();
         cfs.forceBlockingFlush();
 
@@ -315,15 +313,15 @@
             public void mutate(String key, String col, long timestamp) throws Exception
             {
                 Map<String, ColumnFamily> entries = new HashMap<>();
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(cfs.metadata);
-                ColumnFamily cfCleaned = TreeMapBackedSortedColumns.factory.create(cfs.metadata);
-                CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 1, 3, HeapAllocator.instance);
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+                ColumnFamily cfCleaned = ArrayBackedSortedColumns.factory.create(cfs.metadata);
+                CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 1, 3);
                 state.writeLocal(CounterId.fromInt(2), 9L, 3L);
                 state.writeRemote(CounterId.fromInt(4), 4L, 2L);
                 state.writeRemote(CounterId.fromInt(6), 3L, 3L);
                 state.writeRemote(CounterId.fromInt(8), 2L, 4L);
-                cf.addColumn(new CounterColumn(ByteBufferUtil.bytes(col), state.context, timestamp));
-                cfCleaned.addColumn(new CounterColumn(ByteBufferUtil.bytes(col), cc.clearAllLocal(state.context), timestamp));
+                cf.addColumn(new BufferCounterCell(cellname(col), state.context, timestamp));
+                cfCleaned.addColumn(new BufferCounterCell(cellname(col), cc.clearAllLocal(state.context), timestamp));
 
                 entries.put(key, cf);
                 cleanedEntries.put(key, cfCleaned);
@@ -384,8 +382,8 @@
         ColumnFamilyStore cfstore = Keyspace.open(keyspaceName).getColumnFamilyStore(cfname);
         List<Row> rows = Util.getRangeSlice(cfstore);
         assertEquals(2, rows.size());
-        assert rows.get(0).key.key.equals(ByteBufferUtil.bytes("test"));
-        assert rows.get(1).key.key.equals(ByteBufferUtil.bytes("transfer3"));
+        assert rows.get(0).key.getKey().equals(ByteBufferUtil.bytes("test"));
+        assert rows.get(1).key.getKey().equals(ByteBufferUtil.bytes("transfer3"));
         assert rows.get(0).cf.getColumnCount() == 1;
         assert rows.get(1).cf.getColumnCount() == 1;
 
@@ -424,9 +422,9 @@
         Map.Entry<DecoratedKey,String> last = keys.lastEntry();
         Map.Entry<DecoratedKey,String> secondtolast = keys.lowerEntry(last.getKey());
         List<Range<Token>> ranges = new ArrayList<>();
-        ranges.add(new Range<>(p.getMinimumToken(), first.getKey().token));
+        ranges.add(new Range<>(p.getMinimumToken(), first.getKey().getToken()));
         // the left hand side of the range is exclusive, so we transfer from the second-to-last token
-        ranges.add(new Range<>(secondtolast.getKey().token, p.getMinimumToken()));
+        ranges.add(new Range<>(secondtolast.getKey().getToken(), p.getMinimumToken()));
 
         // Acquiring references, transferSSTables needs it
         if (!SSTableReader.acquireReferences(ssTableReaders))
@@ -453,10 +451,10 @@
         {
             public void mutate(String key, String colName, long timestamp) throws Exception
             {
-                ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(keyspace.getName(), cfs.name);
+                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspace.getName(), cfs.name);
                 cf.addColumn(column(colName, "value", timestamp));
-                cf.addColumn(new Column(ByteBufferUtil.bytes("birthdate"), ByteBufferUtil.bytes(new Date(timestamp).toString()), timestamp));
-                RowMutation rm = new RowMutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
+                cf.addColumn(new BufferCell(cellname("birthdate"), ByteBufferUtil.bytes(new Date(timestamp).toString()), timestamp));
+                Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes(key), cf);
                 logger.debug("Applying row to transfer " + rm);
                 rm.apply();
             }
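
Most of the churn in StreamingTransferTest is the 2.1 renames: RowMutation becomes Mutation, Column becomes Cell (BufferCell), and cell names go through Util.cellname instead of raw ByteBuffers. A minimal write in the new style, assuming the Keyspace1/Standard1 test schema and the Util helpers imported above:

    long timestamp = System.currentTimeMillis();

    ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
    cf.addColumn(column("col1", "v", timestamp));                                        // Util.column helper
    cf.addColumn(new BufferCell(cellname("birthdate"), ByteBufferUtil.bytes(42L), timestamp));

    Mutation rm = new Mutation("Keyspace1", ByteBufferUtil.bytes("key1"), cf);
    rm.apply();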
diff --git a/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java b/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java
index 532b506..42a83a0 100644
--- a/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java
+++ b/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java
@@ -26,6 +26,7 @@
 
 import org.junit.Test;
 
+import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.io.compress.CompressionMetadata;
@@ -33,7 +34,7 @@
 import org.apache.cassandra.io.compress.SnappyCompressor;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMetadata;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -66,9 +67,9 @@
         // write compressed data file of longs
         File tmp = new File(File.createTempFile("cassandra", "unittest").getParent(), "ks-cf-ib-1-Data.db");
         Descriptor desc = Descriptor.fromFilename(tmp.getAbsolutePath());
-        SSTableMetadata.Collector collector = SSTableMetadata.createCollector(BytesType.instance);
+        MetadataCollector collector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
         CompressionParameters param = new CompressionParameters(SnappyCompressor.instance, 32, Collections.EMPTY_MAP);
-        CompressedSequentialWriter writer = new CompressedSequentialWriter(tmp, desc.filenameFor(Component.COMPRESSION_INFO), false, param, collector);
+        CompressedSequentialWriter writer = new CompressedSequentialWriter(tmp, desc.filenameFor(Component.COMPRESSION_INFO), param, collector);
         Map<Long, Long> index = new HashMap<Long, Long>();
         for (long l = 0L; l < 1000; l++)
         {
diff --git a/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java b/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java
new file mode 100644
index 0000000..50e409e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.thrift;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.nio.ByteBuffer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import junit.framework.Assert;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.thrift.TException;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class MultiSliceTest extends SchemaLoader
+{
+    private static CassandraServer server;
+    
+    @BeforeClass
+    public static void setup() throws IOException, TException 
+    {
+        Schema.instance.clear(); // Schema is now written to disk and will be reloaded
+        new EmbeddedCassandraService().start();
+        ThriftSessionManager.instance.setCurrentSocket(new InetSocketAddress(9160));        
+        server = new CassandraServer();
+        server.set_keyspace("Keyspace1");
+    }
+
+    private static MultiSliceRequest makeMultiSliceRequest(ByteBuffer key)
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        MultiSliceRequest req = new MultiSliceRequest();
+        req.setKey(key);
+        req.setCount(1000);
+        req.reversed = false;
+        req.setColumn_parent(cp);
+        return req;
+    }
+    
+    @Test
+    public void test_multi_slice_optional_column_slice() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("multi_slice".getBytes());
+        List<String> expected = new ArrayList<String>();
+        for (char a = 'a'; a <= 'z'; a++)
+            expected.add(a + "");
+
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.setColumn_slices(new ArrayList<ColumnSlice>());
+        req.getColumn_slices().add(new ColumnSlice());
+        List<ColumnOrSuperColumn> list = server.get_multi_slice(req);
+        assertColumnNameMatches(expected, list);
+    }
+    
+    @Test
+    public void test_multi_slice() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("multi_slice_two_slice".getBytes());
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("i", "n")));
+        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "i", "j", "k" , "l", "m" , "n"), server.get_multi_slice(req));
+    }
+    
+    @Test
+    public void test_with_overlap() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("overlap".getBytes());
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("d", "g")));
+        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f", "g"), server.get_multi_slice(req));
+    }
+    
+    @Test
+    public void test_with_overlap_reversed() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("overlap_reversed".getBytes());
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.reversed = true;
+        req.setColumn_slices(Arrays.asList(columnSliceFrom("e", "a"), columnSliceFrom("g", "d")));
+        assertColumnNameMatches(Arrays.asList("g", "f", "e", "d", "c", "b", "a"), server.get_multi_slice(req));
+    }
+
+    @Test(expected=InvalidRequestException.class)
+    public void test_that_column_slice_is_proper() throws TException
+    {
+      ByteBuffer key = ByteBuffer.wrap("overlap".getBytes());
+      MultiSliceRequest req = makeMultiSliceRequest(key);
+      req.reversed = true;
+      req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("g", "d")));
+      assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f", "g"), server.get_multi_slice(req));
+    }
+    
+    @Test
+    public void test_with_overlap_reversed_with_count() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("overlap_reversed_count".getBytes());
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.setCount(6);
+        req.reversed = true;
+        req.setColumn_slices(Arrays.asList(columnSliceFrom("e", "a"), columnSliceFrom("g", "d")));
+        assertColumnNameMatches(Arrays.asList("g", "f", "e", "d", "c", "b"), server.get_multi_slice(req));
+    }
+
+    @Test
+    public void test_with_overlap_with_count() throws TException
+    {
+        ColumnParent cp = new ColumnParent("Standard1");
+        ByteBuffer key = ByteBuffer.wrap("overlap_reversed_count".getBytes());
+        addTheAlphabetToRow(key, cp);
+        MultiSliceRequest req = makeMultiSliceRequest(key);
+        req.setCount(6);
+        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("d", "g"), columnSliceFrom("d", "g")));
+        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f"), server.get_multi_slice(req));
+    }
+
+    private static void addTheAlphabetToRow(ByteBuffer key, ColumnParent parent) 
+            throws InvalidRequestException, UnavailableException, TimedOutException
+    {
+        for (char a = 'a'; a <= 'z'; a++) {
+            Column c1 = new Column();
+            c1.setName(ByteBufferUtil.bytes(String.valueOf(a)));
+            c1.setValue(new byte [0]);
+            c1.setTimestamp(System.nanoTime());
+            server.insert(key, parent, c1, ConsistencyLevel.ONE); 
+         }
+    }
+    
+    private static void assertColumnNameMatches(List<String> expected , List<ColumnOrSuperColumn> actual)
+    {
+        Assert.assertEquals(actual + " " + expected + " did not have the same number of elements", actual.size(), expected.size());
+        for (int i = 0 ; i< expected.size() ; i++)
+        {
+            Assert.assertEquals(actual.get(i) +" did not equal "+ expected.get(i), 
+                    expected.get(i), new String(actual.get(i).getColumn().getName()));
+        }
+    }
+    
+    private ColumnSlice columnSliceFrom(String startInclusive, String endInclusive)
+    {
+        ColumnSlice cs = new ColumnSlice();
+        cs.setStart(ByteBufferUtil.bytes(startInclusive));
+        cs.setFinish(ByteBufferUtil.bytes(endInclusive));
+        return cs;
+    }
+}
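
For Thrift clients, the get_multi_slice call exercised by this new test takes a MultiSliceRequest carrying several inclusive ColumnSlice ranges; the overlap tests above show overlapping slices coming back merged and de-duplicated. A compact usage sketch against the same CassandraServer built in setup():

    MultiSliceRequest req = new MultiSliceRequest();
    req.setKey(ByteBufferUtil.bytes("multi_slice"));
    req.setColumn_parent(new ColumnParent("Standard1"));
    req.setCount(1000);
    req.reversed = false;

    // two disjoint inclusive slices: [a..e] and [i..n]
    ColumnSlice first = new ColumnSlice();
    first.setStart(ByteBufferUtil.bytes("a"));
    first.setFinish(ByteBufferUtil.bytes("e"));
    ColumnSlice second = new ColumnSlice();
    second.setStart(ByteBufferUtil.bytes("i"));
    second.setFinish(ByteBufferUtil.bytes("n"));
    req.setColumn_slices(Arrays.asList(first, second));

    List<ColumnOrSuperColumn> result = server.get_multi_slice(req);   // server as created in setup()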
diff --git a/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java b/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java
index 0e8bbb8..df0f98c 100644
--- a/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java
+++ b/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java
@@ -54,7 +54,7 @@
     public void testColumnNameEqualToKeyAlias() throws org.apache.cassandra.exceptions.InvalidRequestException
     {
         CFMetaData metaData = Schema.instance.getCFMetaData("Keyspace1", "Standard1");
-        CFMetaData newMetadata = metaData.clone();
+        CFMetaData newMetadata = metaData.copy();
 
         boolean gotException = false;
 
@@ -62,7 +62,7 @@
         // should not throw IRE here
         try
         {
-            newMetadata.addColumnDefinition(ColumnDefinition.partitionKeyDef(AsciiType.instance.decompose("id"), LongType.instance, null));
+            newMetadata.addColumnDefinition(ColumnDefinition.partitionKeyDef(metaData, AsciiType.instance.decompose("id"), LongType.instance, null));
             newMetadata.validate();
         }
         catch (ConfigurationException e)
@@ -78,7 +78,7 @@
         // add a column with name = "id"
         try
         {
-            newMetadata.addColumnDefinition(ColumnDefinition.regularDef(ByteBufferUtil.bytes("id"), LongType.instance, null));
+            newMetadata.addColumnDefinition(ColumnDefinition.regularDef(metaData, ByteBufferUtil.bytes("id"), LongType.instance, null));
             newMetadata.validate();
         }
         catch (ConfigurationException e)
@@ -92,7 +92,7 @@
         Column column = new Column(ByteBufferUtil.bytes("id"));
         column.setValue(ByteBufferUtil.bytes("not a long"));
         column.setTimestamp(1234);
-        ThriftValidation.validateColumnData(newMetadata, column, false);
+        ThriftValidation.validateColumnData(newMetadata, null, column);
     }
 
     @Test
@@ -101,13 +101,13 @@
         CFMetaData metaData = Schema.instance.getCFMetaData("Keyspace1", "UUIDKeys");
         ColumnDefinition definition = metaData.getColumnDefinition(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS));
         assertNotNull(definition);
-        assertEquals(ColumnDefinition.Type.PARTITION_KEY, definition.type);
+        assertEquals(ColumnDefinition.Kind.PARTITION_KEY, definition.kind);
 
         // make sure the key alias does not affect validation of columns with the same name (CASSANDRA-6892)
         Column column = new Column(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS));
         column.setValue(ByteBufferUtil.bytes("not a uuid"));
         column.setTimestamp(1234);
-        ThriftValidation.validateColumnData(metaData, column, false);
+        ThriftValidation.validateColumnData(metaData, null, column);
 
         IndexExpression expression = new IndexExpression(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS), IndexOperator.EQ, ByteBufferUtil.bytes("a"));
         ThriftValidation.validateFilterClauses(metaData, Arrays.asList(expression));
@@ -124,7 +124,7 @@
         Column column = new Column(ByteBufferUtil.bytes(CFMetaData.DEFAULT_COLUMN_ALIAS + 1));
         column.setValue(ByteBufferUtil.bytes("not a long"));
         column.setTimestamp(1234);
-        ThriftValidation.validateColumnData(metaData, column, false);
+        ThriftValidation.validateColumnData(metaData, null, column);
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/tools/SSTableExportTest.java b/test/unit/org/apache/cassandra/tools/SSTableExportTest.java
index d0ab6a2..c3f3419 100644
--- a/test/unit/org/apache/cassandra/tools/SSTableExportTest.java
+++ b/test/unit/org/apache/cassandra/tools/SSTableExportTest.java
@@ -24,27 +24,25 @@
 import static org.apache.cassandra.io.sstable.SSTableUtils.tempSSTableFile;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytesToHex;
 import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
-import static org.junit.Assert.assertTrue;
 
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
-import java.io.OutputStream;
 import java.io.PrintStream;
-import java.nio.ByteBuffer;
-import java.util.SortedSet;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.sstable.SSTableWriter;
+import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
 import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
@@ -59,45 +57,20 @@
         return bytesToHex(ByteBufferUtil.bytes(str));
     }
     
-    public SSTableWriter getDummyWriter() throws IOException
-    {
-        File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
-
-        // Add rowA
-        cfamily.addColumn(ByteBufferUtil.bytes("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-        
-        cfamily.addColumn(ByteBufferUtil.bytes("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
-        writer.append(Util.dk("rowB"), cfamily);
-        cfamily.clear();
-        
-        
-        return writer;
-
-    }
-    
-    
-    public PrintStream dummyStream = new PrintStream(new OutputStream(){
-        public void write(int b) throws IOException { throw new IOException(); }
-    });
-
     @Test
     public void testEnumeratekeys() throws IOException
     {
         File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add rowA
-        cfamily.addColumn(ByteBufferUtil.bytes("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
         writer.append(Util.dk("rowA"), cfamily);
         cfamily.clear();
 
         // Add rowB
-        cfamily.addColumn(ByteBufferUtil.bytes("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
         writer.append(Util.dk("rowB"), cfamily);
         cfamily.clear();
 
@@ -105,7 +78,9 @@
 
         // Enumerate and verify
         File temp = File.createTempFile("Standard1", ".txt");
-        SSTableExport.enumeratekeys(Descriptor.fromFilename(writer.getFilename()), new PrintStream(temp.getPath()));
+        final Descriptor descriptor = Descriptor.fromFilename(writer.getFilename());
+        SSTableExport.enumeratekeys(descriptor, new PrintStream(temp.getPath()),
+                CFMetaData.sparseCFMetaData(descriptor.ksname, descriptor.cfname, BytesType.instance));
 
 
         try (FileReader file = new FileReader(temp))
@@ -123,23 +98,23 @@
     public void testExportSimpleCf() throws IOException, ParseException
     {
         File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         int nowInSec = (int)(System.currentTimeMillis() / 1000) + 42; //live for 42 seconds
         // Add rowA
-        cfamily.addColumn(ByteBufferUtil.bytes("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
-        cfamily.addColumn(new ExpiringColumn(ByteBufferUtil.bytes("colExp"), ByteBufferUtil.bytes("valExp"), System.currentTimeMillis(), 42, nowInSec));
+        cfamily.addColumn(Util.cellname("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
+        cfamily.addColumn(new BufferExpiringCell(Util.cellname("colExp"), ByteBufferUtil.bytes("valExp"), System.currentTimeMillis(), 42, nowInSec));
         writer.append(Util.dk("rowA"), cfamily);
         cfamily.clear();
 
         // Add rowB
-        cfamily.addColumn(ByteBufferUtil.bytes("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
         writer.append(Util.dk("rowB"), cfamily);
         cfamily.clear();
 
         // Add rowExclude
-        cfamily.addColumn(ByteBufferUtil.bytes("colX"), ByteBufferUtil.bytes("valX"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colX"), ByteBufferUtil.bytes("valX"), System.currentTimeMillis());
         writer.append(Util.dk("rowExclude"), cfamily);
         cfamily.clear();
 
@@ -147,7 +122,8 @@
 
         // Export to JSON and verify
         File tempJson = File.createTempFile("Standard1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")});
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")},
+                CFMetaData.sparseCFMetaData("Keyspace1", "Standard1", BytesType.instance));
 
         JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
         assertEquals("unexpected number of rows", 2, json.size());
@@ -156,7 +132,7 @@
         assertEquals("unexpected number of keys", 2, rowA.keySet().size());
         assertEquals("unexpected row key",asHex("rowA"),rowA.get("key"));
 
-        JSONArray colsA = (JSONArray)rowA.get("columns");
+        JSONArray colsA = (JSONArray)rowA.get("cells");
         JSONArray colA = (JSONArray)colsA.get(0);
         assert hexToBytes((String)colA.get(1)).equals(ByteBufferUtil.bytes("valA"));
 
@@ -168,7 +144,7 @@
         assertEquals("unexpected number of keys", 2, rowB.keySet().size());
         assertEquals("unexpected row key",asHex("rowB"),rowB.get("key"));
 
-        JSONArray colsB = (JSONArray)rowB.get("columns");
+        JSONArray colsB = (JSONArray)rowB.get("cells");
         JSONArray colB = (JSONArray)colsB.get(0);
         assert colB.size() == 3;
 
@@ -177,17 +153,18 @@
     @Test
     public void testRoundTripStandardCf() throws IOException
     {
+        ColumnFamilyStore cfs = Keyspace.open("Keyspace1").getColumnFamilyStore("Standard1");
         File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add rowA
-        cfamily.addColumn(ByteBufferUtil.bytes("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
         writer.append(Util.dk("rowA"), cfamily);
         cfamily.clear();
 
         // Add rowExclude
-        cfamily.addColumn(ByteBufferUtil.bytes("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
         writer.append(Util.dk("rowExclude"), cfamily);
         cfamily.clear();
 
@@ -195,21 +172,21 @@
 
         // Export to JSON and verify
         File tempJson = File.createTempFile("Standard1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")});
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")},
+                CFMetaData.sparseCFMetaData("Keyspace1", "Standard1", BytesType.instance));
 
         // Import JSON to another SSTable file
         File tempSS2 = tempSSTableFile("Keyspace1", "Standard1");
         new SSTableImport().importJson(tempJson.getPath(), "Keyspace1", "Standard1", tempSS2.getPath());
 
         reader = SSTableReader.open(Descriptor.fromFilename(tempSS2.getPath()));
-        SortedSet<ByteBuffer> names = FBUtilities.singleton(ByteBufferUtil.bytes("name"), cfamily.getComparator());
-        QueryFilter qf = QueryFilter.getNamesFilter(Util.dk("rowA"), "Standard1", names, System.currentTimeMillis());
+        QueryFilter qf = Util.namesQueryFilter(cfs, Util.dk("rowA"), "name");
         ColumnFamily cf = qf.getSSTableColumnIterator(reader).getColumnFamily();
         qf.collateOnDiskAtom(cf, qf.getSSTableColumnIterator(reader), Integer.MIN_VALUE);
-        assertTrue(cf != null);
-        assertTrue(cf.getColumn(ByteBufferUtil.bytes("name")).value().equals(hexToBytes("76616c")));
+        assertNotNull(cf);
+        assertEquals(hexToBytes("76616c"), cf.getColumn(Util.cellname("name")).value());
 
-        qf = QueryFilter.getNamesFilter(Util.dk("rowExclude"), "Standard1", names, System.currentTimeMillis());
+        qf = Util.namesQueryFilter(cfs, Util.dk("rowExclude"), "name");
         cf = qf.getSSTableColumnIterator(reader).getColumnFamily();
         assert cf == null;
     }
@@ -218,11 +195,11 @@
     public void testExportCounterCf() throws IOException, ParseException
     {
         File tempSS = tempSSTableFile("Keyspace1", "Counter1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Counter1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "Counter1");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add rowA
-        cfamily.addColumn(new CounterColumn(ByteBufferUtil.bytes("colA"), 42, System.currentTimeMillis()));
+        cfamily.addColumn(BufferCounterCell.createLocal(Util.cellname("colA"), 42, System.currentTimeMillis(), Long.MIN_VALUE));
         writer.append(Util.dk("rowA"), cfamily);
         cfamily.clear();
 
@@ -230,7 +207,8 @@
 
         // Export to JSON and verify
         File tempJson = File.createTempFile("Counter1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0],
+                CFMetaData.sparseCFMetaData("Keyspace1", "Counter1", BytesType.instance));
         JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
         assertEquals("unexpected number of rows", 1, json.size());
 
@@ -238,7 +216,7 @@
         assertEquals("unexpected number of keys", 2, row.keySet().size());
         assertEquals("unexpected row key",asHex("rowA"),row.get("key"));
 
-        JSONArray cols = (JSONArray)row.get("columns");
+        JSONArray cols = (JSONArray)row.get("cells");
         JSONArray colA = (JSONArray)cols.get(0);
         assert hexToBytes((String)colA.get(0)).equals(ByteBufferUtil.bytes("colA"));
         assert ((String) colA.get(3)).equals("c");
@@ -249,11 +227,11 @@
     public void testEscapingDoubleQuotes() throws IOException, ParseException
     {
         File tempSS = tempSSTableFile("Keyspace1", "ValuesWithQuotes");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "ValuesWithQuotes");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "ValuesWithQuotes");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add rowA
-        cfamily.addColumn(new Column(ByteBufferUtil.bytes("data"), UTF8Type.instance.fromString("{\"foo\":\"bar\"}")));
+        cfamily.addColumn(new BufferCell(Util.cellname("data"), UTF8Type.instance.fromString("{\"foo\":\"bar\"}")));
         writer.append(Util.dk("rowA"), cfamily);
         cfamily.clear();
 
@@ -261,7 +239,8 @@
 
         // Export to JSON and verify
         File tempJson = File.createTempFile("ValuesWithQuotes", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0],
+                CFMetaData.sparseCFMetaData("Keyspace1", "ValuesWithQuotes", BytesType.instance));
 
         JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
         assertEquals("unexpected number of rows", 1, json.size());
@@ -270,7 +249,7 @@
         assertEquals("unexpected number of keys", 2, row.keySet().size());
         assertEquals("unexpected row key",asHex("rowA"),row.get("key"));
 
-        JSONArray cols = (JSONArray)row.get("columns");
+        JSONArray cols = (JSONArray)row.get("cells");
         JSONArray colA = (JSONArray)cols.get(0);
         assert hexToBytes((String)colA.get(0)).equals(ByteBufferUtil.bytes("data"));
         assert colA.get(1).equals("{\"foo\":\"bar\"}");
@@ -279,24 +258,23 @@
     @Test
     public void testExportColumnsWithMetadata() throws IOException, ParseException
     {
-
         File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add rowA
-        cfamily.addColumn(ByteBufferUtil.bytes("colName"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
-        cfamily.addColumn(ByteBufferUtil.bytes("colName1"), ByteBufferUtil.bytes("val1"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colName"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
+        cfamily.addColumn(Util.cellname("colName1"), ByteBufferUtil.bytes("val1"), System.currentTimeMillis());
         cfamily.delete(new DeletionInfo(0, 0));
         writer.append(Util.dk("rowA"), cfamily);
 
         SSTableReader reader = writer.closeAndOpenReader();
         // Export to JSON and verify
         File tempJson = File.createTempFile("CFWithDeletionInfo", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0],
+                CFMetaData.sparseCFMetaData("Keyspace1", "Counter1", BytesType.instance));
 
         JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        System.out.println(json.toJSONString());
         assertEquals("unexpected number of rows", 1, json.size());
 
         JSONObject row = (JSONObject)json.get(0);
@@ -323,7 +301,7 @@
                 serializedDeletionInfo.toJSONString());
 
         // check the columns are what we put in
-        JSONArray cols = (JSONArray) row.get("columns");
+        JSONArray cols = (JSONArray) row.get("cells");
         assertNotNull("expecing columns to be present", cols);
         assertEquals("expecting two columns", 2, cols.size());
 
@@ -334,7 +312,6 @@
         JSONArray col2 = (JSONArray) cols.get(1);
         assertEquals("column name did not match", ByteBufferUtil.bytes("colName1"), hexToBytes((String) col2.get(0)));
         assertEquals("column value did not match", ByteBufferUtil.bytes("val1"), hexToBytes((String) col2.get(1)));
-
     }
 
     /**
@@ -344,8 +321,8 @@
     public void testColumnNameEqualToDefaultKeyAlias() throws IOException, ParseException
     {
         File tempSS = tempSSTableFile("Keyspace1", "UUIDKeys");
-        ColumnFamily cfamily = TreeMapBackedSortedColumns.factory.create("Keyspace1", "UUIDKeys");
-        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2);
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "UUIDKeys");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
 
         // Add a row
         cfamily.addColumn(column(CFMetaData.DEFAULT_KEY_ALIAS, "not a uuid", 1L));
@@ -354,13 +331,14 @@
         SSTableReader reader = writer.closeAndOpenReader();
         // Export to JSON and verify
         File tempJson = File.createTempFile("CFWithColumnNameEqualToDefaultKeyAlias", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
+        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0],
+                CFMetaData.sparseCFMetaData("Keyspace1", "UUIDKeys", BytesType.instance));
 
         JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
         assertEquals(1, json.size());
 
         JSONObject row = (JSONObject)json.get(0);
-        JSONArray cols = (JSONArray) row.get("columns");
+        JSONArray cols = (JSONArray) row.get("cells");
         assertEquals(1, cols.size());
 
         // check column name and value
@@ -368,4 +346,31 @@
         assertEquals(CFMetaData.DEFAULT_KEY_ALIAS, ByteBufferUtil.string(hexToBytes((String) col.get(0))));
         assertEquals("not a uuid", ByteBufferUtil.string(hexToBytes((String) col.get(1))));
     }
+
+    @Test
+    public void testAsciiKeyValidator() throws IOException, ParseException
+    {
+        File tempSS = tempSSTableFile("Keyspace1", "AsciiKeys");
+        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create("Keyspace1", "AsciiKeys");
+        SSTableWriter writer = new SSTableWriter(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
+
+        // Add a row
+        cfamily.addColumn(column("column", "value", 1L));
+        writer.append(Util.dk("key", AsciiType.instance), cfamily);
+
+        SSTableReader reader = writer.closeAndOpenReader();
+        // Export to JSON and verify
+        File tempJson = File.createTempFile("CFWithAsciiKeys", ".json");
+        SSTableExport.export(reader,
+                             new PrintStream(tempJson.getPath()),
+                             new String[0],
+                             CFMetaData.sparseCFMetaData("Keyspace1", "AsciiKeys", BytesType.instance));
+
+        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
+        assertEquals(1, json.size());
+
+        JSONObject row = (JSONObject)json.get(0);
+        // check row key
+        assertEquals("key", row.get("key"));
+    }
 }
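
Every export call in the updated SSTableExportTest now supplies the table metadata explicitly. Below is a minimal sketch of that call pattern, assuming only the 2.1 classes exercised in the tests above and a loaded schema (as with SchemaLoader in the tests); the class name and command-line argument are illustrative, not part of the change.

    import java.io.PrintStream;

    import org.apache.cassandra.config.CFMetaData;
    import org.apache.cassandra.db.marshal.BytesType;
    import org.apache.cassandra.io.sstable.Descriptor;
    import org.apache.cassandra.io.sstable.SSTableReader;
    import org.apache.cassandra.tools.SSTableExport;

    public class ExportSketch
    {
        public static void main(String[] args) throws Exception
        {
            // args[0] is a path to an sstable data file, e.g. one produced by SSTableWriter.closeAndOpenReader()
            Descriptor descriptor = Descriptor.fromFilename(args[0]);
            SSTableReader reader = SSTableReader.open(descriptor);

            // In 2.1 the caller passes the CFMetaData used to render cell names and values
            CFMetaData metadata = CFMetaData.sparseCFMetaData(descriptor.ksname, descriptor.cfname, BytesType.instance);

            // Export all rows as JSON to stdout; the String[] lists row keys (in hex) to exclude
            SSTableExport.export(reader, new PrintStream(System.out), new String[0], metadata);
        }
    }
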
diff --git a/test/unit/org/apache/cassandra/tools/SSTableImportTest.java b/test/unit/org/apache/cassandra/tools/SSTableImportTest.java
index ae2ade2..38e5914 100644
--- a/test/unit/org/apache/cassandra/tools/SSTableImportTest.java
+++ b/test/unit/org/apache/cassandra/tools/SSTableImportTest.java
@@ -18,7 +18,11 @@
 */
 package org.apache.cassandra.tools;
 
+import static org.hamcrest.CoreMatchers.is;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+import static org.junit.matchers.JUnitMatchers.hasItem;
+
 import static org.apache.cassandra.io.sstable.SSTableUtils.tempSSTableFile;
 import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
 
@@ -27,17 +31,23 @@
 import java.net.URI;
 import java.net.URISyntaxException;
 
+import org.hamcrest.Description;
+import org.hamcrest.Matcher;
 import org.junit.Test;
+import org.junit.internal.matchers.TypeSafeMatcher;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.UntypedResultSet.Row;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableReader;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class SSTableImportTest extends SchemaLoader
 {
@@ -55,12 +65,12 @@
         OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
         ColumnFamily cf = cloneForAdditions(iter);
         while (iter.hasNext()) cf.addAtom(iter.next());
-        assert cf.getColumn(ByteBufferUtil.bytes("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(ByteBufferUtil.bytes("colAA")) instanceof DeletedColumn);
-        Column expCol = cf.getColumn(ByteBufferUtil.bytes("colAC"));
+        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
+        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
+        Cell expCol = cf.getColumn(Util.cellname("colAC"));
         assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringColumn;
-        assert ((ExpiringColumn)expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
+        assert expCol instanceof ExpiringCell;
+        assert ((ExpiringCell)expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
     }
 
     private ColumnFamily cloneForAdditions(OnDiskAtomIterator iter)
@@ -76,57 +86,6 @@
     }
 
     @Test
-    public void testImportSimpleCfOldFormat() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCF.oldformat.json");
-        File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        new SSTableImport(true).importJson(jsonUrl, "Keyspace1", "Standard1", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Standard1", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        ColumnFamily cf = cloneForAdditions(iter);
-        while (iter.hasNext()) cf.addAtom(iter.next());
-        assert cf.getColumn(ByteBufferUtil.bytes("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(ByteBufferUtil.bytes("colAA")) instanceof DeletedColumn);
-        Column expCol = cf.getColumn(ByteBufferUtil.bytes("colAC"));
-        assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringColumn;
-        assert ((ExpiringColumn)expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
-    }
-
-    @Test
-    public void testImportSuperCf() throws IOException, URISyntaxException
-    {
-        String jsonUrl = resourcePath("SuperCF.json");
-        File tempSS = tempSSTableFile("Keyspace1", "Super4");
-        new SSTableImport(true, true).importJson(jsonUrl, "Keyspace1", "Super4", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Super4", System.currentTimeMillis());
-        ColumnFamily cf = cloneForAdditions(qf.getSSTableColumnIterator(reader));
-        qf.collateOnDiskAtom(cf, qf.getSSTableColumnIterator(reader), Integer.MIN_VALUE);
-
-        DeletionTime delTime = cf.deletionInfo().rangeCovering(CompositeType.build(ByteBufferUtil.bytes("superA")));
-        assertEquals("supercolumn deletion time did not match the expected time", new DeletionInfo(0, 0), new DeletionInfo(delTime));
-        Column subColumn = cf.getColumn(CompositeType.build(ByteBufferUtil.bytes("superA"), ByteBufferUtil.bytes("636f6c4141")));
-        assert subColumn.value().equals(hexToBytes("76616c75654141"));
-    }
-
-    @Test
-    public void testImportUnsortedDataWithSortedOptionFails() throws IOException, URISyntaxException
-    {
-        String jsonUrl = resourcePath("UnsortedSuperCF.json");
-        File tempSS = tempSSTableFile("Keyspace1", "Super4");
-
-        int result = new SSTableImport(3,true, true).importJson(jsonUrl, "Keyspace1","Super4", tempSS.getPath());
-        assert result == -1;
-    }
-
-    @Test
     public void testImportUnsortedMode() throws IOException, URISyntaxException
     {
         String jsonUrl = resourcePath("UnsortedCF.json");
@@ -140,12 +99,12 @@
         ColumnFamily cf = cloneForAdditions(iter);
         while (iter.hasNext())
             cf.addAtom(iter.next());
-        assert cf.getColumn(ByteBufferUtil.bytes("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(ByteBufferUtil.bytes("colAA")) instanceof DeletedColumn);
-        Column expCol = cf.getColumn(ByteBufferUtil.bytes("colAC"));
+        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
+        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
+        Cell expCol = cf.getColumn(Util.cellname("colAC"));
         assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringColumn;
-        assert ((ExpiringColumn) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
+        assert expCol instanceof ExpiringCell;
+        assert ((ExpiringCell) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
     }
 
     @Test
@@ -164,12 +123,12 @@
         assertEquals(cf.deletionInfo(), new DeletionInfo(0, 0));
         while (iter.hasNext())
             cf.addAtom(iter.next());
-        assert cf.getColumn(ByteBufferUtil.bytes("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(ByteBufferUtil.bytes("colAA")) instanceof DeletedColumn);
-        Column expCol = cf.getColumn(ByteBufferUtil.bytes("colAC"));
+        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
+        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
+        Cell expCol = cf.getColumn(Util.cellname("colAC"));
         assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringColumn;
-        assert ((ExpiringColumn) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
+        assert expCol instanceof ExpiringCell;
+        assert ((ExpiringCell) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
     }
 
     @Test
@@ -186,8 +145,96 @@
         OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
         ColumnFamily cf = cloneForAdditions(iter);
         while (iter.hasNext()) cf.addAtom(iter.next());
-        Column c = cf.getColumn(ByteBufferUtil.bytes("colAA"));
-        assert c instanceof CounterColumn: c;
-        assert ((CounterColumn) c).total() == 42;
+        Cell c = cf.getColumn(Util.cellname("colAA"));
+        assert c instanceof CounterCell : c;
+        assert ((CounterCell) c).total() == 42;
+    }
+
+    @Test
+    public void testImportWithAsciiKeyValidator() throws IOException, URISyntaxException
+    {
+        // Import JSON to temp SSTable file
+        String jsonUrl = resourcePath("SimpleCF.json");
+        File tempSS = tempSSTableFile("Keyspace1", "AsciiKeys");
+        new SSTableImport(true).importJson(jsonUrl, "Keyspace1", "AsciiKeys", tempSS.getPath());
+
+        // Verify results
+        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
+        // check that keys are treated as ascii
+        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("726f7741", AsciiType.instance), "AsciiKeys", System.currentTimeMillis());
+        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
+        assert iter.hasNext(); // "ascii" key exists
+        QueryFilter qf2 = QueryFilter.getIdentityFilter(Util.dk("726f7741", BytesType.instance), "AsciiKeys", System.currentTimeMillis());
+        OnDiskAtomIterator iter2 = qf2.getSSTableColumnIterator(reader);
+        assert !iter2.hasNext(); // "bytes" key does not exist
+    }
+
+    @Test
+    public void testBackwardCompatibilityOfImportWithAsciiKeyValidator() throws IOException, URISyntaxException
+    {
+        // Import JSON to temp SSTable file
+        String jsonUrl = resourcePath("SimpleCF.json");
+        File tempSS = tempSSTableFile("Keyspace1", "AsciiKeys");
+        // To ignore current key validator
+        System.setProperty("skip.key.validator", "true");
+        new SSTableImport(true).importJson(jsonUrl, "Keyspace1", "AsciiKeys", tempSS.getPath());
+
+        // Verify results
+        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
+        // check that keys are treated as bytes
+        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "AsciiKeys", System.currentTimeMillis());
+        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
+        assert iter.hasNext(); // "bytes" key exists
+    }
+    
+    @Test
+    /*
+     *  The schema is
+     *      CREATE TABLE cql_keyspace.table1 (k int PRIMARY KEY, v1 text, v2 int)
+     */
+    public void shouldImportCqlTable() throws IOException, URISyntaxException
+    {
+        String cql_keyspace = "cql_keyspace";
+        String cql_table = "table1";
+        String jsonUrl = resourcePath("CQLTable.json");
+        File tempSS = tempSSTableFile(cql_keyspace, cql_table);
+        new SSTableImport(true).importJson(jsonUrl, cql_keyspace, cql_table, tempSS.getPath());
+        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
+        Keyspace.open(cql_keyspace).getColumnFamilyStore(cql_table).addSSTable(reader);
+        
+        UntypedResultSet result = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s", cql_keyspace, cql_table));
+        assertThat(result.size(), is(2));
+        assertThat(result, hasItem(withElements(1, "NY", 1980)));
+        assertThat(result, hasItem(withElements(2, "CA", 2014)));
+    }
+
+    @Test(expected=AssertionError.class)
+    public void shouldRejectEmptyCellNamesForNonCqlTables() throws IOException, URISyntaxException
+    {
+        String jsonUrl = resourcePath("CQLTable.json");
+        File tempSS = tempSSTableFile("Keyspace1", "Counter1");
+        new SSTableImport(true).importJson(jsonUrl, "Keyspace1", "Counter1", tempSS.getPath());
+    }
+    
+    private static Matcher<UntypedResultSet.Row> withElements(final int key, final String v1, final int v2) {
+        return new TypeSafeMatcher<UntypedResultSet.Row>()
+        {
+            @Override
+            public boolean matchesSafely(Row input)
+            {
+                if (!input.has("k") || !input.has("v1") || !input.has("v2"))
+                    return false;
+                return input.getInt("k") == key
+                        && input.getString("v1").equals(v1)
+                        && input.getInt("v2") == v2;
+            }
+
+            @Override
+            public void describeTo(Description description)
+            {
+                description.appendText(String.format("a row containing: %s, %s, %s", key, v1, v2));
+            }
+        };
+        
     }
 }
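
The new import tests above cover key validation and the skip.key.validator escape hatch. A minimal sketch of the same importJson path, assuming the classes used in those tests; the keyspace, table, and file paths are illustrative.

    import java.io.File;

    import org.apache.cassandra.io.sstable.Descriptor;
    import org.apache.cassandra.io.sstable.SSTableReader;
    import org.apache.cassandra.tools.SSTableImport;

    public class ImportSketch
    {
        public static void main(String[] args) throws Exception
        {
            String jsonUrl = args[0];         // JSON in the format produced by SSTableExport
            File target = new File(args[1]);  // path for the new sstable

            // Same constructor flag as in the tests above (sorted input). By default the table's
            // key validator is applied to the keys in the JSON; setting -Dskip.key.validator=true
            // restores the pre-2.1 behaviour of treating keys as raw bytes.
            new SSTableImport(true).importJson(jsonUrl, "Keyspace1", "AsciiKeys", target.getPath());

            SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(target.getPath()));
            System.out.println("Opened imported sstable: " + reader.getFilename());
        }
    }
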
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
new file mode 100644
index 0000000..9b66efb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.transport;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import io.netty.buffer.Unpooled;
+import io.netty.buffer.ByteBuf;
+
+import org.junit.Test;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.serializers.CollectionSerializer;
+import org.apache.cassandra.transport.Event.TopologyChange;
+import org.apache.cassandra.transport.Event.SchemaChange;
+import org.apache.cassandra.transport.Event.StatusChange;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+/**
+ * Serialization/deserialization tests for protocol objects and messages.
+ */
+public class SerDeserTest
+{
+    @Test
+    public void collectionSerDeserTest() throws Exception
+    {
+        collectionSerDeserTest(2);
+        collectionSerDeserTest(3);
+    }
+
+    public void collectionSerDeserTest(int version) throws Exception
+    {
+        // Lists
+        ListType<?> lt = ListType.getInstance(Int32Type.instance);
+        List<Integer> l = Arrays.asList(2, 6, 1, 9);
+
+        List<ByteBuffer> lb = new ArrayList<>(l.size());
+        for (Integer i : l)
+            lb.add(Int32Type.instance.decompose(i));
+
+        assertEquals(l, lt.getSerializer().deserializeForNativeProtocol(CollectionSerializer.pack(lb, lb.size(), version), version));
+
+        // Sets
+        SetType<?> st = SetType.getInstance(UTF8Type.instance);
+        Set<String> s = new LinkedHashSet<>();
+        s.addAll(Arrays.asList("bar", "foo", "zee"));
+
+        List<ByteBuffer> sb = new ArrayList<>(s.size());
+        for (String t : s)
+            sb.add(UTF8Type.instance.decompose(t));
+
+        assertEquals(s, st.getSerializer().deserializeForNativeProtocol(CollectionSerializer.pack(sb, sb.size(), version), version));
+
+        // Maps
+        MapType<?, ?> mt = MapType.getInstance(UTF8Type.instance, LongType.instance);
+        Map<String, Long> m = new LinkedHashMap<>();
+        m.put("bar", 12L);
+        m.put("foo", 42L);
+        m.put("zee", 14L);
+
+        List<ByteBuffer> mb = new ArrayList<>(m.size() * 2);
+        for (Map.Entry<String, Long> entry : m.entrySet())
+        {
+            mb.add(UTF8Type.instance.decompose(entry.getKey()));
+            mb.add(LongType.instance.decompose(entry.getValue()));
+        }
+
+        assertEquals(m, mt.getSerializer().deserializeForNativeProtocol(CollectionSerializer.pack(mb, m.size(), version), version));
+    }
+
+    @Test
+    public void eventSerDeserTest() throws Exception
+    {
+        eventSerDeserTest(2);
+        eventSerDeserTest(3);
+    }
+
+    public void eventSerDeserTest(int version) throws Exception
+    {
+        List<Event> events = new ArrayList<>();
+
+        events.add(TopologyChange.newNode(FBUtilities.getBroadcastAddress(), 42));
+        events.add(TopologyChange.removedNode(FBUtilities.getBroadcastAddress(), 42));
+        events.add(TopologyChange.movedNode(FBUtilities.getBroadcastAddress(), 42));
+
+        events.add(StatusChange.nodeUp(FBUtilities.getBroadcastAddress(), 42));
+        events.add(StatusChange.nodeDown(FBUtilities.getBroadcastAddress(), 42));
+
+        events.add(new SchemaChange(SchemaChange.Change.CREATED, "ks"));
+        events.add(new SchemaChange(SchemaChange.Change.UPDATED, "ks"));
+        events.add(new SchemaChange(SchemaChange.Change.DROPPED, "ks"));
+
+        events.add(new SchemaChange(SchemaChange.Change.CREATED, SchemaChange.Target.TABLE, "ks", "table"));
+        events.add(new SchemaChange(SchemaChange.Change.UPDATED, SchemaChange.Target.TABLE, "ks", "table"));
+        events.add(new SchemaChange(SchemaChange.Change.DROPPED, SchemaChange.Target.TABLE, "ks", "table"));
+
+        if (version >= 3)
+        {
+            events.add(new SchemaChange(SchemaChange.Change.CREATED, SchemaChange.Target.TYPE, "ks", "type"));
+            events.add(new SchemaChange(SchemaChange.Change.UPDATED, SchemaChange.Target.TYPE, "ks", "type"));
+            events.add(new SchemaChange(SchemaChange.Change.DROPPED, SchemaChange.Target.TYPE, "ks", "type"));
+        }
+
+        for (Event ev : events)
+        {
+            ByteBuf buf = Unpooled.buffer(ev.serializedSize(version));
+            ev.serialize(buf, version);
+            assertEquals(ev, Event.deserialize(buf, version));
+        }
+    }
+
+    private static ByteBuffer bb(String str)
+    {
+        return UTF8Type.instance.decompose(str);
+    }
+
+    private static ColumnIdentifier ci(String name)
+    {
+        return new ColumnIdentifier(name, false);
+    }
+
+    private static Constants.Literal lit(long v)
+    {
+        return Constants.Literal.integer(String.valueOf(v));
+    }
+
+    private static Constants.Literal lit(String v)
+    {
+        return Constants.Literal.string(v);
+    }
+
+    private static ColumnSpecification columnSpec(String name, AbstractType<?> type)
+    {
+        return new ColumnSpecification("ks", "cf", ci(name), type);
+    }
+
+    @Test
+    public void udtSerDeserTest() throws Exception
+    {
+        udtSerDeserTest(2);
+        udtSerDeserTest(3);
+    }
+
+    public void udtSerDeserTest(int version) throws Exception
+    {
+        ListType<?> lt = ListType.getInstance(Int32Type.instance);
+        SetType<?> st = SetType.getInstance(UTF8Type.instance);
+        MapType<?, ?> mt = MapType.getInstance(UTF8Type.instance, LongType.instance);
+
+        UserType udt = new UserType("ks",
+                                    bb("myType"),
+                                    Arrays.asList(bb("f1"), bb("f2"), bb("f3"), bb("f4")),
+                                    Arrays.asList(LongType.instance, lt, st, mt));
+
+        Map<ColumnIdentifier, Term.Raw> value = new HashMap<>();
+        value.put(ci("f1"), lit(42));
+        value.put(ci("f2"), new Lists.Literal(Arrays.<Term.Raw>asList(lit(3), lit(1))));
+        value.put(ci("f3"), new Sets.Literal(Arrays.<Term.Raw>asList(lit("foo"), lit("bar"))));
+        value.put(ci("f4"), new Maps.Literal(Arrays.<Pair<Term.Raw, Term.Raw>>asList(
+                                   Pair.<Term.Raw, Term.Raw>create(lit("foo"), lit(24)),
+                                   Pair.<Term.Raw, Term.Raw>create(lit("bar"), lit(12)))));
+
+        UserTypes.Literal u = new UserTypes.Literal(value);
+        Term t = u.prepare("ks", columnSpec("myValue", udt));
+
+        QueryOptions options = QueryOptions.DEFAULT;
+        if (version == 2)
+            options = QueryOptions.fromProtocolV2(ConsistencyLevel.ONE, Collections.<ByteBuffer>emptyList());
+        else if (version != 3)
+            throw new AssertionError("Invalid protocol version for test");
+
+        ByteBuffer serialized = t.bindAndGet(options);
+
+        ByteBuffer[] fields = udt.split(serialized);
+
+        assertEquals(4, fields.length);
+
+        assertEquals(bytes(42L), fields[0]);
+
+        // Note that no matter which protocol version was used in bindAndGet above, the collections inside
+        // a UDT should always be serialized with version 3 of the protocol, which is why we deliberately
+        // don't pass 'version' below.
+
+        assertEquals(Arrays.asList(3, 1), lt.getSerializer().deserializeForNativeProtocol(fields[1], 3));
+
+        LinkedHashSet<String> s = new LinkedHashSet<>();
+        s.addAll(Arrays.asList("bar", "foo"));
+        assertEquals(s, st.getSerializer().deserializeForNativeProtocol(fields[2], 3));
+
+        LinkedHashMap<String, Long> m = new LinkedHashMap<>();
+        m.put("bar", 12L);
+        m.put("foo", 24L);
+        assertEquals(m, mt.getSerializer().deserializeForNativeProtocol(fields[3], 3));
+    }
+}
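
The collectionSerDeserTest above packs element buffers and expects deserializeForNativeProtocol to return the original values. A standalone sketch of that round trip, assuming the same 2.1 serializer APIs used in the test; the values are arbitrary.

    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    import org.apache.cassandra.db.marshal.Int32Type;
    import org.apache.cassandra.db.marshal.ListType;
    import org.apache.cassandra.serializers.CollectionSerializer;

    public class CollectionRoundTripSketch
    {
        public static void main(String[] args)
        {
            ListType<?> type = ListType.getInstance(Int32Type.instance);
            List<Integer> values = Arrays.asList(2, 6, 1, 9);

            // Encode each element with its type, then pack them with the version-specific framing
            List<ByteBuffer> encoded = new ArrayList<>(values.size());
            for (Integer v : values)
                encoded.add(Int32Type.instance.decompose(v));

            int version = 3; // native protocol version under test
            ByteBuffer packed = CollectionSerializer.pack(encoded, encoded.size(), version);

            // Decoding with the same version yields the original list
            System.out.println(type.getSerializer().deserializeForNativeProtocol(packed, version));
        }
    }
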
diff --git a/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java b/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java
index ab7f7c4..3d505c8 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.composites.CellName;
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -63,7 +64,7 @@
     public void noTriggerMutations() throws ConfigurationException, InvalidRequestException
     {
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", NoOpTrigger.class.getName()));
-        RowMutation rm = new RowMutation(bytes("k1"), makeCf(metadata, "v1", null));
+        Mutation rm = new Mutation(bytes("k1"), makeCf(metadata, "v1", null));
         assertNull(TriggerExecutor.instance.execute(Collections.singletonList(rm)));
     }
 
@@ -73,8 +74,8 @@
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeySameCfTrigger.class.getName()));
         ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
         ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        RowMutation rm1 = new RowMutation(bytes("k1"), cf1);
-        RowMutation rm2 = new RowMutation(bytes("k2"), cf2);
+        Mutation rm1 = new Mutation(bytes("k1"), cf1);
+        Mutation rm2 = new Mutation(bytes("k2"), cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
@@ -97,8 +98,8 @@
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeySameCfPartialTrigger.class.getName()));
         ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
         ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        RowMutation rm1 = new RowMutation(bytes("k1"), cf1);
-        RowMutation rm2 = new RowMutation(bytes("k2"), cf2);
+        Mutation rm1 = new Mutation(bytes("k1"), cf1);
+        Mutation rm2 = new Mutation(bytes("k2"), cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
@@ -121,8 +122,8 @@
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeyDifferentCfTrigger.class.getName()));
         ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
         ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        RowMutation rm1 = new RowMutation(bytes("k1"), cf1);
-        RowMutation rm2 = new RowMutation(bytes("k2"), cf2);
+        Mutation rm1 = new Mutation(bytes("k1"), cf1);
+        Mutation rm2 = new Mutation(bytes("k2"), cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
@@ -153,8 +154,8 @@
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeyDifferentKsTrigger.class.getName()));
         ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
         ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        RowMutation rm1 = new RowMutation(bytes("k1"), cf1);
-        RowMutation rm2 = new RowMutation(bytes("k2"), cf2);
+        Mutation rm1 = new Mutation(bytes("k1"), cf1);
+        Mutation rm2 = new Mutation(bytes("k2"), cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(4, tmutations.size());
@@ -186,7 +187,7 @@
     {
         CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", DifferentKeyTrigger.class.getName()));
         ColumnFamily cf = makeCf(metadata, "v1", null);
-        RowMutation rm = new RowMutation(UTF8Type.instance.fromString("k1"), cf);
+        Mutation rm = new Mutation(UTF8Type.instance.fromString("k1"), cf);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm)));
         assertEquals(2, tmutations.size());
@@ -209,16 +210,19 @@
     private static CFMetaData makeCfMetaData(String ks, String cf, TriggerDefinition trigger)
     {
 
-        CFMetaData metadata = new CFMetaData(ks, cf, ColumnFamilyType.Standard, CompositeType.getInstance(UTF8Type.instance));
+        CFMetaData metadata = CFMetaData.sparseCFMetaData(ks, cf, CompositeType.getInstance(UTF8Type.instance));
 
         metadata.keyValidator(UTF8Type.instance);
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.partitionKeyDef(UTF8Type.instance.fromString("pkey"),
+        metadata.addOrReplaceColumnDefinition(ColumnDefinition.partitionKeyDef(metadata,
+                                                                               UTF8Type.instance.fromString("pkey"),
                                                                                UTF8Type.instance,
                                                                                null));
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(UTF8Type.instance.fromString("c1"),
+        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(metadata,
+                                                                          UTF8Type.instance.fromString("c1"),
                                                                           UTF8Type.instance,
                                                                           0));
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(UTF8Type.instance.fromString("c2"),
+        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(metadata,
+                                                                          UTF8Type.instance.fromString("c2"),
                                                                           UTF8Type.instance,
                                                                           0));
         try
@@ -226,7 +230,7 @@
             if (trigger != null)
                 metadata.addTriggerDefinition(trigger);
         }
-        catch (ConfigurationException e)
+        catch (InvalidRequestException e)
         {
             throw new AssertionError(e);
         }
@@ -239,22 +243,22 @@
         ColumnFamily cf = ArrayBackedSortedColumns.factory.create(metadata);
 
         if (columnValue1 != null)
-            cf.addColumn(new Column(getColumnName(metadata, "c1"), bytes(columnValue1)));
+            cf.addColumn(new BufferCell(getColumnName(metadata, "c1"), bytes(columnValue1)));
 
         if (columnValue2 != null)
-            cf.addColumn(new Column(getColumnName(metadata, "c2"), bytes(columnValue2)));
+            cf.addColumn(new BufferCell(getColumnName(metadata, "c2"), bytes(columnValue2)));
 
         return cf;
     }
 
-    private static ByteBuffer getColumnName(CFMetaData metadata, String stringName)
+    private static CellName getColumnName(CFMetaData metadata, String stringName)
     {
-        return ((CompositeType) metadata.comparator).builder().add(bytes(stringName)).build();
+        return metadata.comparator.makeCellName(stringName);
     }
 
     public static class NoOpTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             return null;
         }
@@ -262,54 +266,54 @@
 
     public static class SameKeySameCfTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new Column(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new RowMutation(update.metadata().ksName, key, cf));
+            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
+            return Collections.singletonList(new Mutation(update.metadata().ksName, key, cf));
         }
     }
 
     public static class SameKeySameCfPartialTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             if (!key.equals(bytes("k2")))
                 return null;
 
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new Column(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new RowMutation(update.metadata().ksName, key, cf));
+            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
+            return Collections.singletonList(new Mutation(update.metadata().ksName, key, cf));
         }
     }
 
     public static class SameKeyDifferentCfTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(makeCfMetaData(update.metadata().ksName, "otherCf", null));
-            cf.addColumn(new Column(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new RowMutation(cf.metadata().ksName, key, cf));
+            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
+            return Collections.singletonList(new Mutation(cf.metadata().ksName, key, cf));
         }
     }
 
     public static class SameKeyDifferentKsTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(makeCfMetaData("otherKs", "otherCf", null));
-            cf.addColumn(new Column(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new RowMutation(cf.metadata().ksName, key, cf));
+            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
+            return Collections.singletonList(new Mutation(cf.metadata().ksName, key, cf));
         }
     }
 
     public static class DifferentKeyTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new Column(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new RowMutation(cf.metadata().ksName, bytes("otherKey"), cf));
+            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
+            return Collections.singletonList(new Mutation(cf.metadata().ksName, bytes("otherKey"), cf));
         }
     }
 
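The rewritten trigger tests above show the 2.1 trigger surface: ITrigger.augment now returns Mutations, cells are built as BufferCells, and cell names come from the table's comparator. A minimal sketch of a custom trigger under those assumptions; the class name and the "c2" column are illustrative only.

    import java.nio.ByteBuffer;
    import java.util.Collection;
    import java.util.Collections;

    import org.apache.cassandra.db.ArrayBackedSortedColumns;
    import org.apache.cassandra.db.BufferCell;
    import org.apache.cassandra.db.ColumnFamily;
    import org.apache.cassandra.db.Mutation;
    import org.apache.cassandra.triggers.ITrigger;

    import static org.apache.cassandra.utils.ByteBufferUtil.bytes;

    public class ExampleTrigger implements ITrigger
    {
        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
        {
            // Build an extra update against the same table, adding one cell to the same partition
            ColumnFamily extra = ArrayBackedSortedColumns.factory.create(update.metadata());
            extra.addColumn(new BufferCell(update.metadata().comparator.makeCellName("c2"), bytes("trigger")));
            return Collections.singletonList(new Mutation(update.metadata().ksName, key, extra));
        }
    }
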
diff --git a/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java b/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java
index f9d71ee..c6a1ac5 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java
@@ -92,7 +92,7 @@
                                                 Collections.singletonList(cfm1));
         MigrationManager.announceNewKeyspace(ksm);
 
-        CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).clone();
+        CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).copy();
         TriggerDefinition td = TriggerDefinition.create(triggerName, triggerClass);
         cfm2.addTriggerDefinition(td);
         MigrationManager.announceColumnFamilyUpdate(cfm2, false);
@@ -116,11 +116,11 @@
                                                 Collections.singletonList(cfm1));
         MigrationManager.announceNewKeyspace(ksm);
 
-        CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).clone();
+        CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).copy();
         cfm2.removeTrigger(triggerName);
         MigrationManager.announceColumnFamilyUpdate(cfm2, false);
 
-        CFMetaData cfm3 = Schema.instance.getCFMetaData(ksName, cfName).clone();
+        CFMetaData cfm3 = Schema.instance.getCFMetaData(ksName, cfName).copy();
         assertTrue(cfm3.getTriggers().isEmpty());
     }
 }
diff --git a/test/unit/org/apache/cassandra/triggers/TriggersTest.java b/test/unit/org/apache/cassandra/triggers/TriggersTest.java
index bda13ff..74fde69 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggersTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggersTest.java
@@ -31,26 +31,20 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.Column;
+import org.apache.cassandra.db.BufferCell;
 import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.db.RowMutation;
+import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.Cassandra;
-import org.apache.cassandra.thrift.ColumnOrSuperColumn;
-import org.apache.cassandra.thrift.ColumnParent;
-import org.apache.cassandra.thrift.InvalidRequestException;
-import org.apache.cassandra.thrift.Mutation;
-import org.apache.cassandra.thrift.TFramedTransportFactory;
-import org.apache.cassandra.thrift.ThriftServer;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.thrift.*;
 import org.apache.thrift.protocol.TBinaryProtocol;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.apache.cassandra.utils.ByteBufferUtil.toInt;
 
 public class TriggersTest extends SchemaLoader
 {
@@ -67,7 +61,7 @@
         StorageService.instance.initServer(0);
         if (thriftServer == null || ! thriftServer.isRunning())
         {
-            thriftServer = new ThriftServer(InetAddress.getLocalHost(), 9170);
+            thriftServer = new ThriftServer(InetAddress.getLocalHost(), 9170, 50);
             thriftServer.start();
         }
 
@@ -144,7 +138,7 @@
                                         new TFramedTransportFactory().openTransport(
                                             InetAddress.getLocalHost().getHostName(), 9170)));
         client.set_keyspace(ksName);
-        Mutation mutation = new Mutation();
+        org.apache.cassandra.thrift.Mutation mutation = new org.apache.cassandra.thrift.Mutation();
         ColumnOrSuperColumn cosc = new ColumnOrSuperColumn();
         cosc.setColumn(getColumnForInsert("v1", 3));
         mutation.setColumn_or_supercolumn(cosc);
@@ -187,7 +181,7 @@
         client.set_keyspace(ksName);
         client.cas(bytes(6),
                    cfName,
-                   Collections.<org.apache.cassandra.thrift.Column>emptyList(),
+                   Collections.<Column>emptyList(),
                    Collections.singletonList(getColumnForInsert("v1", 6)),
                    org.apache.cassandra.thrift.ConsistencyLevel.LOCAL_SERIAL,
                    org.apache.cassandra.thrift.ConsistencyLevel.ONE);
@@ -245,7 +239,7 @@
             client.set_keyspace(ksName);
             client.cas(bytes(9),
                        cf,
-                       Collections.<org.apache.cassandra.thrift.Column>emptyList(),
+                       Collections.<Column>emptyList(),
                        Collections.singletonList(getColumnForInsert("v1", 9)),
                        org.apache.cassandra.thrift.ConsistencyLevel.LOCAL_SERIAL,
                        org.apache.cassandra.thrift.ConsistencyLevel.ONE);
@@ -270,7 +264,7 @@
             client.set_keyspace(ksName);
             client.cas(bytes(10),
                        cf,
-                       Collections.<org.apache.cassandra.thrift.Column>emptyList(),
+                       Collections.<Column>emptyList(),
                        Collections.singletonList(getColumnForInsert("v1", 10)),
                        org.apache.cassandra.thrift.ConsistencyLevel.LOCAL_SERIAL,
                        org.apache.cassandra.thrift.ConsistencyLevel.ONE);
@@ -295,7 +289,7 @@
 
     private void assertUpdateIsAugmented(int key)
     {
-        UntypedResultSet rs = QueryProcessor.processInternal(
+        UntypedResultSet rs = QueryProcessor.executeInternal(
                                 String.format("SELECT * FROM %s.%s WHERE k=%s", ksName, cfName, key));
         assertTrue(String.format("Expected value (%s) for augmented cell v2 was not found", key), rs.one().has("v2"));
         assertEquals(999, rs.one().getInt("v2"));
@@ -303,7 +297,7 @@
 
     private void assertUpdateNotExecuted(String cf, int key)
     {
-        UntypedResultSet rs = QueryProcessor.processInternal(
+        UntypedResultSet rs = QueryProcessor.executeInternal(
                 String.format("SELECT * FROM %s.%s WHERE k=%s", ksName, cf, key));
         assertTrue(rs.isEmpty());
     }
@@ -311,7 +305,7 @@
     private org.apache.cassandra.thrift.Column getColumnForInsert(String columnName, int value)
     {
         org.apache.cassandra.thrift.Column column = new org.apache.cassandra.thrift.Column();
-        column.setName(Schema.instance.getCFMetaData(ksName, cfName).comparator.fromString(columnName));
+        column.setName(Schema.instance.getCFMetaData(ksName, cfName).comparator.asAbstractType().fromString(columnName));
         column.setValue(bytes(value));
         column.setTimestamp(System.currentTimeMillis());
         return column;
@@ -319,36 +313,33 @@
 
     public static class TestTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily extraUpdate = update.cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-            extraUpdate.addColumn(new Column(update.metadata().comparator.fromString("v2"),
-                                             bytes(999)));
-            return Collections.singletonList(new RowMutation(ksName, key, extraUpdate));
+            extraUpdate.addColumn(new BufferCell(update.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
+            return Collections.singletonList(new Mutation(ksName, key, extraUpdate));
         }
     }
 
     public static class CrossPartitionTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily extraUpdate = update.cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-            extraUpdate.addColumn(new Column(update.metadata().comparator.fromString("v2"),
-                                             bytes(999)));
+            extraUpdate.addColumn(new BufferCell(update.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
 
-            int newKey = ByteBufferUtil.toInt(key) + 1000;
-            return Collections.singletonList(new RowMutation(ksName, bytes(newKey), extraUpdate));
+            int newKey = toInt(key) + 1000;
+            return Collections.singletonList(new Mutation(ksName, bytes(newKey), extraUpdate));
         }
     }
 
     public static class CrossTableTrigger implements ITrigger
     {
-        public Collection<RowMutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
         {
             ColumnFamily extraUpdate = ArrayBackedSortedColumns.factory.create(ksName, otherCf);
-            extraUpdate.addColumn(new Column(extraUpdate.metadata().comparator.fromString("v2"),
-                                             bytes(999)));
-            return Collections.singletonList(new RowMutation(ksName, key, extraUpdate));
+            extraUpdate.addColumn(new BufferCell(extraUpdate.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
+            return Collections.singletonList(new Mutation(ksName, key, extraUpdate));
         }
     }
 }
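
The trigger hunks above track the 2.1 API rename: triggers now return Mutation instead of RowMutation, and extra cells are built as BufferCell via the comparator's makeCellName rather than with the old Column/fromString pair. As a rough sketch of what a standalone trigger looks like against that API (the class name and the "marker" cell are hypothetical, not part of this patch):

import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;

import org.apache.cassandra.db.ArrayBackedSortedColumns;
import org.apache.cassandra.db.BufferCell;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.triggers.ITrigger;

import static org.apache.cassandra.utils.ByteBufferUtil.bytes;

public class MarkerTrigger implements ITrigger
{
    public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
    {
        // Same pattern as TestTrigger above: clone the updated partition's shape,
        // add one extra cell, and return it as a Mutation for the same key.
        ColumnFamily extra = update.cloneMeShallow(ArrayBackedSortedColumns.factory, false);
        extra.addColumn(new BufferCell(update.metadata().comparator.makeCellName(bytes("marker")), bytes(1)));
        return Collections.singletonList(new Mutation(update.metadata().ksName, key, extra));
    }
}
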
diff --git a/test/unit/org/apache/cassandra/utils/BTreeTest.java b/test/unit/org/apache/cassandra/utils/BTreeTest.java
new file mode 100644
index 0000000..a6d4528
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/BTreeTest.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSet;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+public class BTreeTest
+{
+
+    static Integer[] ints = new Integer[20];
+    static
+    {
+        System.setProperty("cassandra.btree.fanfactor", "4");
+        for (int i = 0 ; i < ints.length ; i++)
+            ints[i] = new Integer(i);
+    }
+
+    static final UpdateFunction<Integer> updateF = new UpdateFunction<Integer>()
+    {
+        public Integer apply(Integer replacing, Integer update)
+        {
+            return ints[update];
+        }
+
+        public boolean abortEarly()
+        {
+            return false;
+        }
+
+        public void allocated(long heapSize)
+        {
+
+        }
+
+        public Integer apply(Integer integer)
+        {
+            return ints[integer];
+        }
+    };
+
+    private static List<Integer> seq(int count)
+    {
+        List<Integer> r = new ArrayList<>();
+        for (int i = 0 ; i < count ; i++)
+            r.add(i);
+        return r;
+    }
+
+    private static List<Integer> rand(int count)
+    {
+        Random rand = ThreadLocalRandom.current();
+        List<Integer> r = seq(count);
+        for (int i = 0 ; i < count - 1 ; i++)
+        {
+            int swap = i + rand.nextInt(count - i);
+            Integer tmp = r.get(i);
+            r.set(i, r.get(swap));
+            r.set(swap, tmp);
+        }
+        return r;
+    }
+
+    private static final Comparator<Integer> CMP = new Comparator<Integer>()
+    {
+        public int compare(Integer o1, Integer o2)
+        {
+            return Integer.compare(o1, o2);
+        }
+    };
+
+    @Test
+    public void testBuilding_UpdateFunctionReplacement()
+    {
+        for (int i = 0; i < 20 ; i++)
+        {
+            checkResult(i, BTree.build(seq(i), CMP, true, updateF));
+            checkResult(i, BTree.build(rand(i), CMP, false, updateF));
+        }
+    }
+
+    @Test
+    public void testUpdate_UpdateFunctionReplacement()
+    {
+        for (int i = 0; i < 20 ; i++)
+        {
+            checkResult(i, BTree.update(BTree.build(seq(i), CMP, true, UpdateFunction.NoOp.<Integer>instance()), CMP, seq(i), true, updateF));
+            checkResult(i, BTree.update(BTree.build(rand(i), CMP, false, UpdateFunction.NoOp.<Integer>instance()), CMP, rand(i), false, updateF));
+        }
+    }
+
+    private static void checkResult(int count, Object[] btree)
+    {
+        BTreeSet<Integer> vs = new BTreeSet<>(btree, CMP);
+        assert vs.size() == count;
+        int i = 0;
+        for (Integer j : vs)
+            Assert.assertEquals(j, ints[i++]);
+    }
+
+    @Test
+    public void testClearOnAbort()
+    {
+        final Comparator<String> cmp = new Comparator<String>()
+        {
+            public int compare(String o1, String o2)
+            {
+                return o1.compareTo(o2);
+            }
+        };
+
+        Object[] btree = BTree.build(ranges(range(0, 8)), cmp, true, UpdateFunction.NoOp.<String>instance());
+        BTree.update(btree, cmp, ranges(range(0, 94)), false, new AbortAfterX(90));
+        btree = BTree.update(btree, cmp, ranges(range(0, 94)), false, UpdateFunction.NoOp.<String>instance());
+        Assert.assertTrue(BTree.isWellFormed(btree, cmp));
+    }
+
+    private static final class AbortAfterX implements UpdateFunction<String>
+    {
+        int counter;
+        final int abortAfter;
+        private AbortAfterX(int abortAfter)
+        {
+            this.abortAfter = abortAfter;
+        }
+        public String apply(String replacing, String update)
+        {
+            return update;
+        }
+        public boolean abortEarly()
+        {
+            return counter++ > abortAfter;
+        }
+        public void allocated(long heapSize)
+        {
+        }
+        public String apply(String v)
+        {
+            return v;
+        }
+    }
+
+    private static int[] range(int lb, int ub)
+    {
+        return new int[] { lb, ub };
+    }
+
+    private static List<String> ranges(int[] ... ranges)
+    {
+
+        List<String> r = new ArrayList<>();
+        for (int[] range : ranges)
+        {
+            for (int i = range[0] ; i < range[1] ; i+=1)
+                r.add(Integer.toString(i));
+        }
+        return r;
+    }
+}
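
The new BTreeTest drives BTree.build and BTree.update with a custom UpdateFunction and verifies the result through a BTreeSet view. A minimal usage sketch of the same calls, assuming only the API already exercised by the test (sorted input, no-op update function):

import java.util.Arrays;
import java.util.Comparator;

import org.apache.cassandra.utils.btree.BTree;
import org.apache.cassandra.utils.btree.BTreeSet;
import org.apache.cassandra.utils.btree.UpdateFunction;

public class BTreeSketch
{
    public static void main(String[] args)
    {
        Comparator<Integer> cmp = new Comparator<Integer>()
        {
            public int compare(Integer a, Integer b) { return Integer.compare(a, b); }
        };

        // Input is already sorted, so the third argument is true; NoOp leaves values untouched.
        Object[] tree = BTree.build(Arrays.asList(1, 2, 3, 4, 5), cmp, true, UpdateFunction.NoOp.<Integer>instance());

        // BTreeSet gives a sorted view over the tree, as checkResult() above relies on.
        BTreeSet<Integer> view = new BTreeSet<>(tree, cmp);
        assert view.size() == 5;
        int expected = 1;
        for (Integer v : view)
            assert v == expected++;
    }
}
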
diff --git a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
index f0dc602..4180a8c 100644
--- a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
+++ b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
@@ -36,6 +36,7 @@
 import org.junit.Test;
 
 import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.io.util.FileUtils;
 
 public class BloomFilterTest
@@ -162,7 +163,7 @@
         File file = FileUtils.createTempFile("bloomFilterTest-", ".dat");
         BloomFilter filter = (BloomFilter) FilterFactory.getFilter(((long)Integer.MAX_VALUE / 8) + 1, 0.01d, true);
         filter.add(test);
-        DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
+        DataOutputStreamAndChannel out = new DataOutputStreamAndChannel(new FileOutputStream(file));
         FilterFactory.serialize(filter, out);
         filter.bitset.serialize(out);
         out.close();
diff --git a/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java b/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java
index 172cdb1..44cb20d 100644
--- a/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java
+++ b/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java
@@ -32,6 +32,9 @@
 
 import org.junit.Test;
 
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.FastByteArrayOutputStream;
+
 public class ByteBufferUtilTest
 {
     private static final String s = "cassandra";
@@ -171,12 +174,11 @@
 
     private void checkReadWrite(ByteBuffer bb) throws IOException
     {
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        DataOutputStream out = new DataOutputStream(bos);
+        DataOutputBuffer out = new DataOutputBuffer();
         ByteBufferUtil.writeWithLength(bb, out);
         ByteBufferUtil.writeWithShortLength(bb, out);
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
+        DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()));
         assert bb.equals(ByteBufferUtil.readWithLength(in));
         assert bb.equals(ByteBufferUtil.readWithShortLength(in));
     }
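
Several test hunks in this patch (ByteBufferUtilTest above, IntervalTreeTest and StreamingHistogramTest below) swap ByteArrayOutputStream plus DataOutputStream for DataOutputBuffer, which backs the new DataOutputPlus interface. A small round-trip sketch of that pattern, using only the calls visible in these hunks:

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.cassandra.io.util.DataOutputBuffer;

public class DataOutputBufferSketch
{
    public static void main(String[] args) throws IOException
    {
        // Write into the growable in-memory buffer...
        DataOutputBuffer out = new DataOutputBuffer();
        out.writeInt(42);
        out.writeUTF("cassandra");

        // ...and read it back through a plain DataInputStream, as the tests do.
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()));
        assert in.readInt() == 42;
        assert "cassandra".equals(in.readUTF());
    }
}
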
diff --git a/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java b/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java
index 7032d73..2a67d42 100644
--- a/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java
+++ b/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java
@@ -26,8 +26,8 @@
 import java.io.IOException;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.ArrayBackedSortedColumns;
 import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.TreeMapBackedSortedColumns;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.vint.EncodedDataInputStream;
@@ -97,7 +97,7 @@
 
     private ColumnFamily createCF()
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(keyspaceName, standardCFName);
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspaceName, standardCFName);
         cf.addColumn(column("vijay", "try", 1));
         cf.addColumn(column("to", "be_nice", 1));
         return cf;
@@ -105,9 +105,9 @@
 
     private ColumnFamily createCounterCF()
     {
-        ColumnFamily cf = TreeMapBackedSortedColumns.factory.create(keyspaceName, counterCFName);
-        cf.addColumn(counterColumn("vijay", 1L, 1));
-        cf.addColumn(counterColumn("wants", 1000000, 1));
+        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspaceName, counterCFName);
+        cf.addCounter(cellname("vijay"), 1);
+        cf.addCounter(cellname("wants"), 1000000);
         return cf;
     }
 
@@ -128,14 +128,16 @@
     @Test
     public void testCounterCFSerialization() throws IOException
     {
+        ColumnFamily counterCF = createCounterCF();
+
         ByteArrayOutputStream byteArrayOStream1 = new ByteArrayOutputStream();
         EncodedDataOutputStream odos = new EncodedDataOutputStream(byteArrayOStream1);
-        ColumnFamily.serializer.serialize(createCounterCF(), odos, version);
+        ColumnFamily.serializer.serialize(counterCF, odos, version);
 
         ByteArrayInputStream byteArrayIStream1 = new ByteArrayInputStream(byteArrayOStream1.toByteArray());
         EncodedDataInputStream odis = new EncodedDataInputStream(new DataInputStream(byteArrayIStream1));
         ColumnFamily cf = ColumnFamily.serializer.deserialize(odis, version);
-        Assert.assertEquals(cf, createCounterCF());
+        Assert.assertEquals(cf, counterCF);
         Assert.assertEquals(byteArrayOStream1.size(), (int) ColumnFamily.serializer.serializedSize(cf, TypeSizes.VINT, version));
     }
 }
diff --git a/test/unit/org/apache/cassandra/utils/FastByteOperationsTest.java b/test/unit/org/apache/cassandra/utils/FastByteOperationsTest.java
new file mode 100644
index 0000000..4f6d652
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/FastByteOperationsTest.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+
+import org.junit.Test;
+
+public class FastByteOperationsTest
+{
+
+    private static final FastByteOperations.PureJavaOperations PJO = new FastByteOperations.PureJavaOperations();
+    private static final FastByteOperations.UnsafeOperations UO = new FastByteOperations.UnsafeOperations();
+    private static final Random rand = new Random(0);
+    private static final ByteBuffer dbuf1 = ByteBuffer.allocateDirect(150);
+    private static final ByteBuffer dbuf2 = ByteBuffer.allocateDirect(150);
+    private static final ByteBuffer hbuf1 = ByteBuffer.allocate(150);
+    private static final ByteBuffer hbuf2 = ByteBuffer.allocate(150);
+
+    @Test
+    public void testFastByteCopy()
+    {
+        byte[] bytes1 = new byte[128];
+        byte[] empty = new byte[128];
+        rand.nextBytes(bytes1);
+        testCopy(bytes1, wrap1(bytes1, true), wrap2(empty, true), PJO);
+        testCopy(bytes1, wrap1(bytes1, true), wrap2(empty, false), PJO);
+        testCopy(bytes1, wrap1(bytes1, false), wrap2(empty, true), PJO);
+        testCopy(bytes1, wrap1(bytes1, false), wrap2(empty, false), PJO);
+        testCopy(bytes1, wrap1(bytes1, true), wrap2(empty, true), UO);
+        testCopy(bytes1, wrap1(bytes1, true), wrap2(empty, false), UO);
+        testCopy(bytes1, wrap1(bytes1, false), wrap2(empty, true), UO);
+        testCopy(bytes1, wrap1(bytes1, false), wrap2(empty, false), UO);
+    }
+
+    private void testCopy(byte[] canon, ByteBuffer src, ByteBuffer trg, FastByteOperations.ByteOperations ops)
+    {
+        byte[] result = new byte[src.remaining()];
+        ops.copy(src, src.position(), trg, trg.position(), src.remaining());
+        ops.copy(trg, trg.position(), result, 0, trg.remaining());
+        assert firstdiff(canon, result) < 0;
+    }
+
+    private static int firstdiff(byte[] canon, byte[] test)
+    {
+        for (int i = 0 ; i < canon.length ; i++)
+            if (canon[i] != test[i])
+                return i;
+        return -1;
+    }
+
+    @Test
+    public void testFastByteComparisons()
+    {
+        byte[] bytes1 = new byte[128];
+        for (int i = 0 ; i < 1000 ; i++)
+        {
+            rand.nextBytes(bytes1);
+            for (int j = 0 ; j < 16 ; j++)
+            {
+                byte[] bytes2 = Arrays.copyOf(bytes1, bytes1.length - j);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, true, 1);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, true, -1);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, false, 1);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, false, -1);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, true, 128);
+                testTwiddleOneByteComparisons(bytes1, bytes2, 16, false, 128);
+            }
+        }
+    }
+
+    private void testTwiddleOneByteComparisons(byte[] bytes1, byte[] bytes2, int count, boolean start, int inc)
+    {
+        for (int j = 0 ; j < count ; j++)
+        {
+            int index = start ? j : bytes2.length - (j + 1);
+            bytes2[index] += inc;
+            testComparisons(bytes1, bytes2);
+            bytes2[index] -= inc;
+        }
+    }
+
+    private static ByteBuffer wrap1(byte[] bytes, boolean direct)
+    {
+        return slice(bytes, direct ? dbuf1 : hbuf1);
+    }
+
+    private static ByteBuffer wrap2(byte[] bytes, boolean direct)
+    {
+        return slice(bytes, direct ? dbuf2 : hbuf2);
+    }
+
+    private static ByteBuffer slice(byte[] bytes, ByteBuffer buf)
+    {
+        buf = buf.duplicate();
+        buf.position((buf.limit() - bytes.length) / 2);
+        buf.limit(buf.position() + bytes.length);
+        buf.duplicate().put(bytes);
+        return buf;
+    }
+
+    private void testComparisons(byte[] bytes1, byte[] bytes2)
+    {
+        testComparison(bytes1, bytes2);
+        testComparison(bytes2, bytes1);
+        testComparison(wrap1(bytes1, false), bytes2);
+        testComparison(wrap2(bytes2, false), bytes1);
+        testComparison(wrap1(bytes1, false), wrap2(bytes2, false));
+        testComparison(wrap2(bytes2, false), wrap1(bytes1, false));
+        testComparison(wrap1(bytes1, true), bytes2);
+        testComparison(wrap2(bytes2, true), bytes1);
+        testComparison(wrap1(bytes1, true), wrap2(bytes2, true));
+        testComparison(wrap2(bytes2, true), wrap1(bytes1, true));
+        testComparison(wrap1(bytes1, true), wrap2(bytes2, false));
+        testComparison(wrap1(bytes1, false), wrap2(bytes2, true));
+        testComparison(wrap2(bytes2, true), wrap1(bytes1, false));
+        testComparison(wrap2(bytes2, false), wrap1(bytes1, true));
+    }
+
+    private void testComparison(byte[] bytes1, byte[] bytes2)
+    {
+        assert sameComparisonResult(PJO.compare(bytes1, 0, bytes1.length, bytes2, 0, bytes2.length), UO.compare(bytes1, 0, bytes1.length, bytes2, 0, bytes2.length));
+        assert sameComparisonResult(PJO.compare(bytes1, 10, bytes1.length - 10, bytes2, 10, bytes2.length - 10), UO.compare(bytes1, 10, bytes1.length - 10, bytes2, 10, bytes2.length - 10));
+    }
+
+    private void testComparison(ByteBuffer bytes1, byte[] bytes2)
+    {
+        assert sameComparisonResult(PJO.compare(bytes1, bytes2, 0, bytes2.length), UO.compare(bytes1, bytes2, 0, bytes2.length));
+        assert sameComparisonResult(PJO.compare(bytes1, bytes2, 10, bytes2.length - 10), UO.compare(bytes1, bytes2, 10, bytes2.length - 10));
+    }
+
+    private void testComparison(ByteBuffer bytes1, ByteBuffer bytes2)
+    {
+        assert sameComparisonResult(PJO.compare(bytes1, bytes2), UO.compare(bytes1, bytes2));
+    }
+
+    static boolean sameComparisonResult(int exp, int act)
+    {
+        if (exp < 0)
+            return act < 0;
+        if (exp > 0)
+            return act > 0;
+        return act == 0;
+    }
+}
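
FastByteOperationsTest cross-checks the unsafe and pure-Java byte comparators against each other over heap and direct buffers. A condensed sketch of that check on a single pair of arrays, using the same compare overload the test calls:

import org.apache.cassandra.utils.FastByteOperations;

public class FastByteOpsSketch
{
    public static void main(String[] args)
    {
        byte[] a = { 1, 2, 3 };
        byte[] b = { 1, 2, 4 };

        FastByteOperations.PureJavaOperations pjo = new FastByteOperations.PureJavaOperations();
        FastByteOperations.UnsafeOperations uo = new FastByteOperations.UnsafeOperations();

        int slow = pjo.compare(a, 0, a.length, b, 0, b.length);
        int fast = uo.compare(a, 0, a.length, b, 0, b.length);

        // Both implementations must agree on the sign of the result (a sorts before b here).
        assert Integer.signum(slow) == Integer.signum(fast) && slow < 0;
    }
}
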
diff --git a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java
index 8113703..ea88092 100644
--- a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java
@@ -23,6 +23,7 @@
 
 import org.junit.Test;
 
+import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -33,6 +34,8 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class IntervalTreeTest
 {
@@ -144,25 +147,24 @@
         IVersionedSerializer<IntervalTree<Integer, String, Interval<Integer, String>>> serializer = IntervalTree.serializer(
             new ISerializer<Integer>()
             {
-                public void serialize(Integer i, DataOutput out) throws IOException { out.writeInt(i); }
+                public void serialize(Integer i, DataOutputPlus out) throws IOException { out.writeInt(i); }
                 public Integer deserialize(DataInput in) throws IOException { return in.readInt(); }
-                public long serializedSize(Integer i, TypeSizes ts) { return 4; }
+                public long serializedSize(Integer i, TypeSizes s) { return 4; }
             },
             new ISerializer<String>()
             {
-                public void serialize(String v, DataOutput out) throws IOException { out.writeUTF(v); }
+                public void serialize(String v, DataOutputPlus out) throws IOException { out.writeUTF(v); }
                 public String deserialize(DataInput in) throws IOException { return in.readUTF(); }
-                public long serializedSize(String v, TypeSizes ts) { return v.length(); }
+                public long serializedSize(String v, TypeSizes s) { return v.length(); }
             },
-            Interval.class.getConstructor(Object.class, Object.class, Object.class)
+            (Constructor<Interval<Integer, String>>) (Object) Interval.class.getConstructor(Object.class, Object.class, Object.class)
         );
 
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        DataOutputStream out = new DataOutputStream(baos);
+        DataOutputBuffer out = new DataOutputBuffer();
 
         serializer.serialize(it, out, 0);
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+        DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()));
 
         IntervalTree<Integer, String, Interval<Integer, String>> it2 = serializer.deserialize(in, 0);
         List<Interval<Integer, String>> intervals2 = new ArrayList<Interval<Integer, String>>();
diff --git a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
index 2b459c6..116e194 100644
--- a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
@@ -30,10 +30,10 @@
 import org.junit.Test;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.dht.*;
+import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.MerkleTree.Hashable;
 import org.apache.cassandra.utils.MerkleTree.RowHash;
-import org.apache.cassandra.utils.MerkleTree.TreeDifference;
 import org.apache.cassandra.utils.MerkleTree.TreeRange;
 import org.apache.cassandra.utils.MerkleTree.TreeRangeIterator;
 
@@ -393,7 +393,7 @@
 
         byte[] initialhash = mt.hash(full);
 
-        ByteArrayDataOutput out = ByteStreams.newDataOutput();
+        DataOutputBuffer out = new DataOutputBuffer();
         MerkleTree.serializer.serialize(mt, out, MessagingService.current_version);
         byte[] serialized = out.toByteArray();
 
diff --git a/test/unit/org/apache/cassandra/utils/SerializationsTest.java b/test/unit/org/apache/cassandra/utils/SerializationsTest.java
index f2112c2..976a3eb 100644
--- a/test/unit/org/apache/cassandra/utils/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/utils/SerializationsTest.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.utils;
 
 import org.apache.cassandra.AbstractSerializationsTester;
+import org.apache.cassandra.io.util.DataOutputStreamAndChannel;
 import org.apache.cassandra.service.StorageService;
 
 import org.junit.Test;
@@ -35,7 +36,7 @@
         IFilter bf = FilterFactory.getFilter(1000000, 0.0001, offheap);
         for (int i = 0; i < 100; i++)
             bf.add(StorageService.getPartitioner().getTokenFactory().toByteArray(StorageService.getPartitioner().getRandomToken()));
-        DataOutputStream out = getOutput("utils.BloomFilter.bin");
+        DataOutputStreamAndChannel out = getOutput("utils.BloomFilter.bin");
         FilterFactory.serialize(bf, out);
         out.close();
     }
@@ -65,7 +66,7 @@
         data[offsets.length] = 100000;
         EstimatedHistogram hist2 = new EstimatedHistogram(offsets, data);
 
-        DataOutputStream out = getOutput("utils.EstimatedHistogram.bin");
+        DataOutputStreamAndChannel out = getOutput("utils.EstimatedHistogram.bin");
         EstimatedHistogram.serializer.serialize(hist0, out);
         EstimatedHistogram.serializer.serialize(hist1, out);
         EstimatedHistogram.serializer.serialize(hist2, out);
diff --git a/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java b/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java
index 8f75935..13828eb 100644
--- a/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java
+++ b/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java
@@ -25,6 +25,8 @@
 import java.io.DataOutputStream;
 import java.util.*;
 
+import org.apache.cassandra.io.util.DataOutputBuffer;
+
 import static org.junit.Assert.assertEquals;
 
 public class StreamingHistogramTest
@@ -97,8 +99,8 @@
             hist.update(samples[i]);
         }
 
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-        StreamingHistogram.serializer.serialize(hist, new DataOutputStream(out));
+        DataOutputBuffer out = new DataOutputBuffer();
+        StreamingHistogram.serializer.serialize(hist, out);
         byte[] bytes = out.toByteArray();
 
         StreamingHistogram deserialized = StreamingHistogram.serializer.deserialize(new DataInputStream(new ByteArrayInputStream(bytes)));
diff --git a/test/unit/org/apache/cassandra/utils/UUIDTests.java b/test/unit/org/apache/cassandra/utils/UUIDTests.java
index 970c753..99cd5ae 100644
--- a/test/unit/org/apache/cassandra/utils/UUIDTests.java
+++ b/test/unit/org/apache/cassandra/utils/UUIDTests.java
@@ -24,9 +24,6 @@
 import org.apache.cassandra.db.marshal.TimeUUIDType;
 import org.junit.Test;
 
-import java.math.BigInteger;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.UUID;
 
@@ -34,7 +31,7 @@
 public class UUIDTests
 {
     @Test
-    public void verifyType1() throws UnknownHostException
+    public void verifyType1()
     {
 
         UUID uuid = UUIDGen.getTimeUUID();
@@ -42,7 +39,7 @@
     }
 
     @Test
-    public void verifyOrdering1() throws UnknownHostException
+    public void verifyOrdering1()
     {
         UUID one = UUIDGen.getTimeUUID();
         UUID two = UUIDGen.getTimeUUID();
@@ -51,7 +48,7 @@
 
 
     @Test
-    public void testDecomposeAndRaw() throws UnknownHostException
+    public void testDecomposeAndRaw()
     {
         UUID a = UUIDGen.getTimeUUID();
         byte[] decomposed = UUIDGen.decompose(a);
@@ -60,7 +57,7 @@
     }
 
     @Test
-    public void testTimeUUIDType() throws UnknownHostException
+    public void testTimeUUIDType()
     {
         TimeUUIDType comp = TimeUUIDType.instance;
         ByteBuffer first = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes());
@@ -72,9 +69,8 @@
     }
 
     @Test
-    public void testUUIDTimestamp() throws UnknownHostException
+    public void testUUIDTimestamp()
     {
-        InetAddress addr = InetAddress.getByName("127.0.0.1");
         long now = System.currentTimeMillis();
         UUID uuid = UUIDGen.getTimeUUID();
         long tstamp = UUIDGen.getAdjustedTimestamp(uuid);
@@ -82,9 +78,4 @@
         // I'll be damned if the uuid timestamp is more than 10ms after now
         assert now <= tstamp && now >= tstamp - 10 : "now = " + now + ", timestamp = " + tstamp;
     }
-
-    private void assertNonZero(BigInteger i)
-    {
-        assert i.toString(2).indexOf("1") > -1;
-    }
 }
diff --git a/tools/bin/cassandra-stress b/tools/bin/cassandra-stress
index 39257cd..53440ac 100755
--- a/tools/bin/cassandra-stress
+++ b/tools/bin/cassandra-stress
@@ -42,4 +42,6 @@
     exit 1
 fi
 
-$JAVA -server -cp $CLASSPATH org.apache.cassandra.stress.Stress $@
+$JAVA -server -ea -cp "$CLASSPATH" \
+      -Dcassandra.storagedir="$cassandra_storagedir" \
+      org.apache.cassandra.stress.Stress $@
diff --git a/tools/bin/cassandra-stress.bat b/tools/bin/cassandra-stress.bat
index 288b8fe..f1bbcc9 100644
--- a/tools/bin/cassandra-stress.bat
+++ b/tools/bin/cassandra-stress.bat
@@ -14,22 +14,9 @@
 @REM  limitations under the License.

 

 @echo off

+if "%OS%" == "Windows_NT" setlocal

 

-setlocal

-

-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%\..\..

+pushd "%~dp0"

+call cassandra.in.bat

 if NOT DEFINED STRESS_HOME set STRESS_HOME=%CD%\..

-

-@REM Include the build\classes\main directory so it works in development

-set CLASSPATH="%CASSANDRA_HOME%\build\classes\stress";"%CASSANDRA_HOME%\build\classes\main";"%CASSANDRA_HOME%\build\classes\thrift"

-

-for %%i in ("%STRESS_HOME%\lib\*.jar") do call :append "%%i"

-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

-goto start

-

-:append

-set CLASSPATH=%CLASSPATH%;%1

-goto :eof

-

-:start

-"%JAVA_HOME%\bin\java" -cp %CLASSPATH% org.apache.cassandra.stress.Stress %*

+"%JAVA_HOME%\bin\java" -cp %CASSANDRA_CLASSPATH% org.apache.cassandra.stress.Stress %*

diff --git a/tools/bin/cassandra-stressd b/tools/bin/cassandra-stressd
index 8d337e5..6532707 100755
--- a/tools/bin/cassandra-stressd
+++ b/tools/bin/cassandra-stressd
@@ -17,23 +17,25 @@
 # limitations under the License.
 
 DESC="Cassandra Stress Test Daemon"
+if [ "x$CASSANDRA_INCLUDE" = "x" ]; then 
+    for include in "`dirname $0`/cassandra.in.sh" \
+                   "$HOME/.cassandra.in.sh" \
+                   /usr/share/cassandra/cassandra.in.sh \
+                   /usr/local/share/cassandra/cassandra.in.sh \
+                   /opt/cassandra/cassandra.in.sh; do
+        if [ -r $include ]; then 
+            . $include
+            break   
+        fi      
+    done    
+elif [ -r $CASSANDRA_INCLUDE ]; then 
+    . $CASSANDRA_INCLUDE
+fi
 
-if [ "x$CLASSPATH" = "x" ]; then
-    
-    # execute from the build dir.
-    if [ -d `dirname $0`/../../build/classes ]; then
-        for directory in `dirname $0`/../../build/classes/*; do
-            CLASSPATH=$CLASSPATH:$directory
-        done
-    else
-        if [ -f `dirname $0`/../lib/stress.jar ]; then
-            CLASSPATH=`dirname $0`/../lib/stress.jar
-        fi
-    fi
-
-    for jar in `dirname $0`/../../lib/*.jar; do
-        CLASSPATH=$CLASSPATH:$jar
-    done
+if [ -x $JAVA_HOME/bin/java ]; then 
+    JAVA=$JAVA_HOME/bin/java
+else
+    JAVA=`which java`
 fi
 
 if [ -x $JAVA_HOME/bin/java ]; then
@@ -50,7 +52,9 @@
 case "$1" in
   start)
     echo "Starting $DESC: "
-    $JAVA -server -cp $CLASSPATH org.apache.cassandra.stress.StressServer $@ 1> ./stressd.out.log 2> ./stressd.err.log &
+    $JAVA -server -cp $CLASSPATH \
+          -Dcassandra.storagedir="$cassandra_storagedir" \
+          org.apache.cassandra.stress.StressServer $@ 1> ./stressd.out.log 2> ./stressd.err.log &
     echo $! > ./stressd.pid
     echo "done."
   ;;
diff --git a/bin/sstablesplit.bat b/tools/bin/cassandra.in.bat
similarity index 65%
copy from bin/sstablesplit.bat
copy to tools/bin/cassandra.in.bat
index ef88670..889fa9b 100644
--- a/bin/sstablesplit.bat
+++ b/tools/bin/cassandra.in.bat
@@ -1,61 +1,45 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneSplitter
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM

+@REM  Licensed to the Apache Software Foundation (ASF) under one or more

+@REM  contributor license agreements.  See the NOTICE file distributed with

+@REM  this work for additional information regarding copyright ownership.

+@REM  The ASF licenses this file to You under the Apache License, Version 2.0

+@REM  (the "License"); you may not use this file except in compliance with

+@REM  the License.  You may obtain a copy of the License at

+@REM

+@REM      http://www.apache.org/licenses/LICENSE-2.0

+@REM

+@REM  Unless required by applicable law or agreed to in writing, software

+@REM  distributed under the License is distributed on an "AS IS" BASIS,

+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+@REM  See the License for the specific language governing permissions and

+@REM  limitations under the License.

+

+@echo off

+if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%\..\..

+if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"

+

+REM JAVA_HOME can optionally be set here

+REM set JAVA_HOME="<directory>"

+

+REM ***** CLASSPATH library setting *****

+

+REM Ensure that any user defined CLASSPATH variables are not used on startup

+set CLASSPATH="%CASSANDRA_HOME%\conf"

+

+REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.

+for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

+for %%i in ("%CASSANDRA_HOME%\tools\lib\*.jar") do call :append "%%i"

+for %%i in ("%CASSANDRA_HOME%\build\*.jar") do call :append "%%i"

+goto :okClasspath

+

+:append

+set CLASSPATH=%CLASSPATH%;%1

+goto :eof

+

+:okClasspath

+

+REM Include the build\classes\main directory so it works in development

+set CASSANDRA_CLASSPATH=%CLASSPATH%;%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\main";"%CASSANDRA_HOME%\build\classes\thrift";"%CASSANDRA_HOME%\build\classes\stress"

+

+REM Add the default storage location.  Can be overridden in conf\cassandra.yaml

+set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% "-Dcassandra.storagedir=%CASSANDRA_HOME%\data"

diff --git a/tools/bin/cassandra.in.sh b/tools/bin/cassandra.in.sh
index 74f1cc8..004f394 100644
--- a/tools/bin/cassandra.in.sh
+++ b/tools/bin/cassandra.in.sh
@@ -15,31 +15,35 @@
 # limitations under the License.
 
 if [ "x$CASSANDRA_HOME" = "x" ]; then
-    CASSANDRA_HOME=`dirname $0`/../../
+    CASSANDRA_HOME="`dirname $0`/../.."
 fi
 
 # The directory where Cassandra's configs live (required)
 if [ "x$CASSANDRA_CONF" = "x" ]; then
-    CASSANDRA_CONF=$CASSANDRA_HOME/conf
+    CASSANDRA_CONF="$CASSANDRA_HOME/conf"
 fi
 
 # This can be the path to a jar file, or a directory containing the
 # compiled classes. NOTE: This isn't needed by the startup script,
 # it's just used here in constructing the classpath.
-cassandra_bin=$CASSANDRA_HOME/build/classes/main
-cassandra_bin=$cassandra_bin:$CASSANDRA_HOME/build/classes/stress
-cassandra_bin=$cassandra_bin:$CASSANDRA_HOME/build/classes/thrift
-#cassandra_bin=$cassandra_home/build/cassandra.jar
+cassandra_bin="$CASSANDRA_HOME/build/classes/main"
+cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/stress"
+cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/thrift"
+#cassandra_bin="$cassandra_home/build/cassandra.jar"
+
+# the default location for commitlogs, sstables, and saved caches
+# if not set in cassandra.yaml
+cassandra_storagedir="$CASSANDRA_HOME/data"
 
 # JAVA_HOME can optionally be set here
 #JAVA_HOME=/usr/local/jdk6
 
 # The java classpath (required)
-CLASSPATH=$CASSANDRA_CONF:$cassandra_bin
+CLASSPATH="$CASSANDRA_CONF:$cassandra_bin"
 
-for jar in $CASSANDRA_HOME/tools/lib/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
+for jar in "$CASSANDRA_HOME"/tools/lib/*.jar; do
+    CLASSPATH="$CLASSPATH:$jar"
 done
-for jar in $CASSANDRA_HOME/lib/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
+for jar in "$CASSANDRA_HOME"/lib/*.jar; do
+    CLASSPATH="$CLASSPATH:$jar"
 done
diff --git a/bin/json2sstable b/tools/bin/json2sstable
similarity index 93%
rename from bin/json2sstable
rename to tools/bin/json2sstable
index b1bb069..bb10b51 100755
--- a/bin/json2sstable
+++ b/tools/bin/json2sstable
@@ -44,7 +44,8 @@
 fi
 
 "$JAVA" $JAVA_AGENT -cp "$CLASSPATH" -Dstorage-config="$CASSANDRA_CONF" \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.SSTableImport "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/json2sstable.bat b/tools/bin/json2sstable.bat
similarity index 63%
rename from bin/json2sstable.bat
rename to tools/bin/json2sstable.bat
index dc1e8f6..db0fa91 100644
--- a/bin/json2sstable.bat
+++ b/tools/bin/json2sstable.bat
@@ -17,37 +17,18 @@
 @echo off

 if "%OS%" == "Windows_NT" setlocal

 

-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..

-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"

+pushd "%~dp0"

+call cassandra.in.bat

+

 if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableImport

 if NOT DEFINED JAVA_HOME goto :err

 

 REM ***** JAVA options *****

 set JAVA_OPTS=^

- -Dlog4j.configuration=log4j-tools.properties

+ -Dlogback.configurationFile=logback-tools.xml

 

-REM ***** CLASSPATH library setting *****

-

-REM Ensure that any user defined CLASSPATH variables are not used on startup

-set CLASSPATH="%CASSANDRA_HOME%\conf"

-

-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.

-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

-goto okClasspath

-

-:append

-set CLASSPATH=%CLASSPATH%;%1

-goto :eof

-

-:okClasspath

-REM Include the build\classes\main directory so it works in development

-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"

-

-set CASSANDRA_PARAMS=

 set TOOLS_PARAMS=

-

 FOR %%A IN (%*) DO call :appendToolsParams %%A

-

 goto runTool

 

 :appendToolsParams

diff --git a/bin/sstable2json b/tools/bin/sstable2json
similarity index 93%
rename from bin/sstable2json
rename to tools/bin/sstable2json
index 142bd3e..a7882bf 100755
--- a/bin/sstable2json
+++ b/tools/bin/sstable2json
@@ -45,7 +45,8 @@
 fi
 
 "$JAVA" $JAVA_AGENT -cp "$CLASSPATH" -Dstorage-config="$CASSANDRA_CONF" \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.SSTableExport "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstable2json.bat b/tools/bin/sstable2json.bat
similarity index 63%
rename from bin/sstable2json.bat
rename to tools/bin/sstable2json.bat
index 9544e0c..17669c0 100644
--- a/bin/sstable2json.bat
+++ b/tools/bin/sstable2json.bat
@@ -17,37 +17,18 @@
 @echo off

 if "%OS%" == "Windows_NT" setlocal

 

-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..

-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"

+pushd "%~dp0"

+call cassandra.in.bat

+

 if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableExport

 if NOT DEFINED JAVA_HOME goto :err

 

 REM ***** JAVA options *****

 set JAVA_OPTS=^

- -Dlog4j.configuration=log4j-tools.properties

+ -Dlogback.configurationFile=logback-tools.xml

 

-REM ***** CLASSPATH library setting *****

-

-REM Ensure that any user defined CLASSPATH variables are not used on startup

-set CLASSPATH="%CASSANDRA_HOME%\conf"

-

-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.

-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

-goto okClasspath

-

-:append

-set CLASSPATH=%CLASSPATH%;%1

-goto :eof

-

-:okClasspath

-REM Include the build\classes\main directory so it works in development

-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"

-

-set CASSANDRA_PARAMS=

 set TOOLS_PARAMS=

-

 FOR %%A IN (%*) DO call :appendToolsParams %%A

-

 goto runTool

 

 :appendToolsParams

diff --git a/tools/bin/sstablelevelreset b/tools/bin/sstablelevelreset
index a6fb33b..497d604 100755
--- a/tools/bin/sstablelevelreset
+++ b/tools/bin/sstablelevelreset
@@ -45,5 +45,6 @@
 fi
 
 $JAVA -cp $CLASSPATH  -Dstorage-config=$CASSANDRA_CONF \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.SSTableLevelResetter "$@"
diff --git a/tools/bin/sstablemetadata b/tools/bin/sstablemetadata
index 5fe8cc4..f7ce141 100755
--- a/tools/bin/sstablemetadata
+++ b/tools/bin/sstablemetadata
@@ -42,5 +42,6 @@
 fi
 
 $JAVA -cp $CLASSPATH \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.SSTableMetadataViewer "$@"
diff --git a/tools/bin/sstablemetadata.bat b/tools/bin/sstablemetadata.bat
index bf06bca..3cb6308 100644
--- a/tools/bin/sstablemetadata.bat
+++ b/tools/bin/sstablemetadata.bat
@@ -15,18 +15,9 @@
 

 @echo off

 

-setlocal

+if "%OS%" == "Windows_NT" setlocal

 

-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%CD%\..\..

+pushd "%~dp0"

+call cassandra.in.bat

 

-set CLASSPATH=""

-for %%i in ("%CASSANDRA_HOME%\build\*.jar") do call :append "%%i"

-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"

-goto start

-

-:append

-set CLASSPATH=%CLASSPATH%;%1

-goto :eof

-

-:start

 "%JAVA_HOME%\bin\java" -cp %CLASSPATH% org.apache.cassandra.tools.SSTableMetadataViewer %*

diff --git a/tools/bin/sstablerepairedset b/tools/bin/sstablerepairedset
new file mode 100755
index 0000000..1aa30a3
--- /dev/null
+++ b/tools/bin/sstablerepairedset
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ "x$CLASSPATH" = "x" ]; then
+    
+    # execute from the build dir.
+    if [ -d `dirname $0`/../../build/classes ]; then
+        for directory in `dirname $0`/../../build/classes/*; do
+            CLASSPATH=$CLASSPATH:$directory
+        done
+    else
+        if [ -f `dirname $0`/../lib/stress.jar ]; then
+            CLASSPATH=`dirname $0`/../lib/stress.jar
+        fi
+    fi
+
+    for jar in `dirname $0`/../../lib/*.jar; do
+        CLASSPATH=$CLASSPATH:$jar
+    done
+fi
+
+# Use JAVA_HOME if set, otherwise look for java in PATH
+if [ -x $JAVA_HOME/bin/java ]; then
+    JAVA=$JAVA_HOME/bin/java
+else
+    JAVA=`which java`
+fi
+
+$JAVA -cp $CLASSPATH \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
+        org.apache.cassandra.tools.SSTableRepairedAtSetter "$@"
diff --git a/bin/sstablesplit b/tools/bin/sstablesplit
similarity index 93%
rename from bin/sstablesplit
rename to tools/bin/sstablesplit
index 81a4232..73e736e 100755
--- a/bin/sstablesplit
+++ b/tools/bin/sstablesplit
@@ -44,7 +44,8 @@
 fi
 
 $JAVA $JAVA_AGENT -ea -cp $CLASSPATH -Xmx256M \
-        -Dlog4j.configuration=log4j-tools.properties \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
         org.apache.cassandra.tools.StandaloneSplitter "$@"
 
 # vi:ai sw=4 ts=4 tw=0 et
diff --git a/bin/sstablesplit.bat b/tools/bin/sstablesplit.bat
similarity index 60%
copy from bin/sstablesplit.bat
copy to tools/bin/sstablesplit.bat
index ef88670..2abed84 100644
--- a/bin/sstablesplit.bat
+++ b/tools/bin/sstablesplit.bat
@@ -1,61 +1,41 @@
-@REM
-@REM  Licensed to the Apache Software Foundation (ASF) under one or more
-@REM  contributor license agreements.  See the NOTICE file distributed with
-@REM  this work for additional information regarding copyright ownership.
-@REM  The ASF licenses this file to You under the Apache License, Version 2.0
-@REM  (the "License"); you may not use this file except in compliance with
-@REM  the License.  You may obtain a copy of the License at
-@REM
-@REM      http://www.apache.org/licenses/LICENSE-2.0
-@REM
-@REM  Unless required by applicable law or agreed to in writing, software
-@REM  distributed under the License is distributed on an "AS IS" BASIS,
-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM  See the License for the specific language governing permissions and
-@REM  limitations under the License.
-
-@echo off
-if "%OS%" == "Windows_NT" setlocal
-
-if NOT DEFINED CASSANDRA_HOME set CASSANDRA_HOME=%~dp0..
-if NOT DEFINED CASSANDRA_CONF set CASSANDRA_CONF="%CASSANDRA_HOME%\conf"
-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneSplitter
-if NOT DEFINED JAVA_HOME goto :err
-
-REM ***** JAVA options *****
-set JAVA_OPTS=^
- -Dlog4j.configuration=log4j-tools.properties
-
-REM ***** CLASSPATH library setting *****
-
-REM Ensure that any user defined CLASSPATH variables are not used on startup
-set CLASSPATH="%CASSANDRA_HOME%\conf"
-
-REM For each jar in the CASSANDRA_HOME lib directory call append to build the CLASSPATH variable.
-for %%i in ("%CASSANDRA_HOME%\lib\*.jar") do call :append "%%i"
-goto okClasspath
-
-:append
-set CLASSPATH=%CLASSPATH%;%1
-goto :eof
-
-:okClasspath
-REM Include the build\classes\main directory so it works in development
-set CASSANDRA_CLASSPATH=%CLASSPATH%;"%CASSANDRA_HOME%\build\classes\main";%CASSANDRA_CONF%;"%CASSANDRA_HOME%\build\classes\thrift"
-
-set CASSANDRA_PARAMS=
-set TOOLS_PARAMS=
-
-goto runTool
-
-:runTool
-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*
-goto finally
-
-:err
-echo JAVA_HOME environment variable must be set!
-pause
-
-:finally
-
-ENDLOCAL
+@REM

+@REM  Licensed to the Apache Software Foundation (ASF) under one or more

+@REM  contributor license agreements.  See the NOTICE file distributed with

+@REM  this work for additional information regarding copyright ownership.

+@REM  The ASF licenses this file to You under the Apache License, Version 2.0

+@REM  (the "License"); you may not use this file except in compliance with

+@REM  the License.  You may obtain a copy of the License at

+@REM

+@REM      http://www.apache.org/licenses/LICENSE-2.0

+@REM

+@REM  Unless required by applicable law or agreed to in writing, software

+@REM  distributed under the License is distributed on an "AS IS" BASIS,

+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+@REM  See the License for the specific language governing permissions and

+@REM  limitations under the License.

+

+@echo off

+if "%OS%" == "Windows_NT" setlocal

+

+pushd "%~dp0"

+call cassandra.in.bat

+

+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneSplitter

+if NOT DEFINED JAVA_HOME goto :err

+

+REM ***** JAVA options *****

+set JAVA_OPTS=^

+ -Dlogback.configurationFile=logback-tools.xml

+

+set TOOLS_PARAMS=

+

+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*

+goto finally

+

+:err

+echo JAVA_HOME environment variable must be set!

+pause

+

+:finally

+

+ENDLOCAL

diff --git a/tools/cqlstress-counter-example.yaml b/tools/cqlstress-counter-example.yaml
new file mode 100644
index 0000000..f8f70ea
--- /dev/null
+++ b/tools/cqlstress-counter-example.yaml
@@ -0,0 +1,78 @@
+#
+# This is an example YAML profile for cassandra-stress
+#
+# insert data
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1)
+#
+# read, using query simple1:
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(simple1=1)
+#
+# mixed workload (90/10)
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1,simple1=9)
+
+
+#
+# Keyspace info
+#
+keyspace: stresscql
+
+#
+# The CQL for creating a keyspace (optional if it already exists)
+#
+keyspace_definition: |
+  CREATE KEYSPACE stresscql WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
+
+#
+# Table info
+#
+table: counttest
+
+#
+# The CQL for creating a table you wish to stress (optional if it already exists)
+#
+table_definition: |
+  CREATE TABLE counttest (
+        name text PRIMARY KEY,
+        count counter
+  ) WITH comment='A simple counter table'
+
+#
+# Optional meta information on the generated columns in the above table
+# The min and max only apply to text and blob types
+# The distribution field represents the total unique population
+# distribution of that column across rows.  Supported types are
+# 
+#      EXP(min..max)                        An exponential distribution over the range [min..max]
+#      EXTREME(min..max,shape)              An extreme value (Weibull) distribution over the range [min..max]
+#      GAUSSIAN(min..max,stdvrng)           A gaussian/normal distribution, where mean=(min+max)/2, and stdev is (mean-min)/stdvrng
+#      GAUSSIAN(min..max,mean,stdev)        A gaussian/normal distribution, with explicitly defined mean and stdev
+#      UNIFORM(min..max)                    A uniform distribution over the range [min, max]
+#      FIXED(val)                           A fixed distribution, always returning the same value
+#      Aliases: extr, gauss, normal, norm, weibull
+#
+#      If preceded by ~, the distribution is inverted
+#
+# Defaults for all columns are size: uniform(4..8), population: uniform(1..100B), cluster: fixed(1)
+#
+
+columnspec:
+  - name: name
+    size: uniform(1..4)
+  - name: count
+    population: fixed(1)
+
+insert:
+  partitions: fixed(1)             # number of unique partitions to update in a single operation
+                                  # if batchcount > 1, multiple batches will be used but all partitions will
+                                  # occur in all batches (unless they finish early); only the row counts will vary
+  batchtype: LOGGED               # type of batch to use
+  select: fixed(1)/1              # uniform chance any single generated CQL row will be visited in a partition;
+                                  # generated for each partition independently, each time we visit it
+
+#
+# A list of queries you wish to run against the schema
+#
+queries:
+   simple1:
+      cql: select * from counttest where name = ?
+      fields: samerow             # samerow or multirow (select arguments from the same row, or randomly from all rows in the partition)
\ No newline at end of file
diff --git a/tools/cqlstress-example.yaml b/tools/cqlstress-example.yaml
new file mode 100644
index 0000000..4dd5e4a
--- /dev/null
+++ b/tools/cqlstress-example.yaml
@@ -0,0 +1,94 @@
+#
+# This is an example YAML profile for cassandra-stress
+#
+# insert data
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1)
+#
+# read, using query simple1:
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(simple1=1)
+#
+# mixed workload (90/10)
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1,simple1=9)
+
+
+#
+# Keyspace info
+#
+keyspace: stresscql
+
+#
+# The CQL for creating a keyspace (optional if it already exists)
+#
+keyspace_definition: |
+  CREATE KEYSPACE stresscql WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
+
+#
+# Table info
+#
+table: typestest
+
+#
+# The CQL for creating a table you wish to stress (optional if it already exists)
+#
+table_definition: |
+  CREATE TABLE typestest (
+        name text,
+        choice boolean,
+        date timestamp,
+        address inet,
+        dbl double,
+        lval bigint,
+        ival int,
+        uid timeuuid,
+        value blob,
+        PRIMARY KEY((name,choice), date, address, dbl, lval, ival, uid)
+  ) WITH COMPACT STORAGE 
+    AND compaction = { 'class':'LeveledCompactionStrategy' }
+    AND comment='A table of many types to test wide rows'
+
+#
+# Optional meta information on the generated columns in the above table
+# The min and max only apply to text and blob types
+# The distribution field represents the total unique population
+# distribution of that column across rows.  Supported types are
+# 
+#      EXP(min..max)                        An exponential distribution over the range [min..max]
+#      EXTREME(min..max,shape)              An extreme value (Weibull) distribution over the range [min..max]
+#      GAUSSIAN(min..max,stdvrng)           A gaussian/normal distribution, where mean=(min+max)/2, and stdev is (mean-min)/stdvrng
+#      GAUSSIAN(min..max,mean,stdev)        A gaussian/normal distribution, with explicitly defined mean and stdev
+#      UNIFORM(min..max)                    A uniform distribution over the range [min, max]
+#      FIXED(val)                           A fixed distribution, always returning the same value
+#      Aliases: extr, gauss, normal, norm, weibull
+#
+#      If preceded by ~, the distribution is inverted
+#
+# Defaults for all columns are size: uniform(4..8), population: uniform(1..100B), cluster: fixed(1)
+#
+columnspec:
+  - name: name
+    size: uniform(1..10)
+    population: uniform(1..1M)     # the range of unique values to select for the field (default is 100Billion)
+  - name: date
+    cluster: uniform(20..40)
+  - name: lval
+    population: gaussian(1..1000)
+    cluster: uniform(1..4)
+
+insert:
+  partitions: uniform(1..50)       # number of unique partitions to update in a single operation
+                                  # if batchcount > 1, multiple batches will be used but all partitions will
+                                  # occur in all batches (unless they finish early); only the row counts will vary
+  batchtype: LOGGED               # type of batch to use
+  select: uniform(1..10)/10       # uniform chance any single generated CQL row will be visited in a partition;
+                                  # generated for each partition independently, each time we visit it
+
+#
+# A list of queries you wish to run against the schema
+#
+queries:
+   simple1:
+      cql: select * from typestest where name = ? and choice = ? LIMIT 100
+      fields: samerow             # samerow or multirow (select arguments from the same row, or randomly from all rows in the partition)
+   range1:
+      cql: select * from typestest where name = ? and choice = ? and date >= ? LIMIT 100
+      fields: multirow            # samerow or multirow (select arguments from the same row, or randomly from all rows in the partition)
diff --git a/tools/cqlstress-insanity-example.yaml b/tools/cqlstress-insanity-example.yaml
new file mode 100644
index 0000000..ea4f97f
--- /dev/null
+++ b/tools/cqlstress-insanity-example.yaml
@@ -0,0 +1,90 @@
+#
+# This is an example YAML profile for cassandra-stress
+#
+# insert data
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1)
+#
+# read, using query simple1:
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(simple1=1)
+#
+# mixed workload (90/10)
+# cassandra-stress user profile=/home/jake/stress1.yaml ops(insert=1,simple1=9)
+
+
+#
+# Keyspace info
+#
+keyspace: stresscql
+
+#
+# The CQL for creating a keyspace (optional if it already exists)
+#
+keyspace_definition: |
+  CREATE KEYSPACE stresscql WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
+
+#
+# Table info
+#
+table: insanitytest
+
+#
+# The CQL for creating a table you wish to stress (optional if it already exists)
+#
+table_definition: |
+  CREATE TABLE insanitytest (
+        name text,
+        choice boolean,
+        date timestamp,
+        address inet,
+        dbl double,
+        lval bigint,
+        fval float,
+        ival int,
+        uid timeuuid,
+        dates list<timestamp>,
+        inets set<inet>,
+        value blob,
+        PRIMARY KEY((name, choice), date)
+  ) WITH compaction = { 'class':'LeveledCompactionStrategy' }
+    AND comment='A table of many types to test wide rows and collections'
+
+#
+# Optional meta information on the generated columns in the above table
+# The min and max only apply to text and blob types
+# The distribution field represents the total unique population
+# distribution of that column across rows.  Supported types are
+# 
+#      EXP(min..max)                        An exponential distribution over the range [min..max]
+#      EXTREME(min..max,shape)              An extreme value (Weibull) distribution over the range [min..max]
+#      GAUSSIAN(min..max,stdvrng)           A gaussian/normal distribution, where mean=(min+max)/2, and stdev is (mean-min)/stdvrng
+#      GAUSSIAN(min..max,mean,stdev)        A gaussian/normal distribution, with explicitly defined mean and stdev
+#      UNIFORM(min..max)                    A uniform distribution over the range [min, max]
+#      FIXED(val)                           A fixed distribution, always returning the same value
+#      Aliases: extr, gauss, normal, norm, weibull
+#
+#      If preceded by ~, the distribution is inverted
+#
+# Defaults for all columns are size: uniform(4..8), population: uniform(1..100B), cluster: fixed(1)
+#
+columnspec:
+  - name: date
+    cluster: gaussian(1..20)
+  - name: lval
+    population: fixed(1)
+
+
+insert:
+  partitions: fixed(1)             # number of unique partitions to update in a single operation
+                                  # if batchcount > 1, multiple batches will be used but all partitions will
+                                  # occur in all batches (unless they finish early); only the row counts will vary
+  batchtype: LOGGED               # type of batch to use
+  select: fixed(1)/1              # uniform chance any single generated CQL row will be visited in a partition;
+                                  # generated for each partition independently, each time we visit it
+
+#
+# A list of queries you wish to run against the schema
+#
+queries:
+   simple1:
+      cql: select * from insanitytest where name = ? and choice = ? LIMIT 100
+      fields: samerow             # samerow or multirow (select arguments from the same row, or randomly from all rows in the partition)
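
The ops() weights shown in the profile headers are relative proportions rather than absolute counts, which is why ops(insert=1,simple1=9) is described as a 90/10 mixed workload. A small standalone sketch of that normalisation (a hypothetical class, not part of cassandra-stress):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public final class OpWeights
    {
        public static void main(String[] args)
        {
            // ops(insert=1,simple1=9): relative weights from the profile header above
            Map<String, Double> weights = new LinkedHashMap<>();
            weights.put("insert", 1.0);
            weights.put("simple1", 9.0);

            double total = weights.values().stream().mapToDouble(Double::doubleValue).sum();
            weights.forEach((op, w) ->
                    System.out.printf("%-8s %.0f%% of operations%n", op, 100 * w / total));
            // insert   10% of operations
            // simple1  90% of operations
        }
    }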
diff --git a/tools/lib/cassandra-driver-core-2.0.5.jar b/tools/lib/cassandra-driver-core-2.0.5.jar
new file mode 100644
index 0000000..260183e
--- /dev/null
+++ b/tools/lib/cassandra-driver-core-2.0.5.jar
Binary files differ
diff --git a/tools/lib/netty-3.9.0.Final.jar b/tools/lib/netty-3.9.0.Final.jar
new file mode 100644
index 0000000..872340e
--- /dev/null
+++ b/tools/lib/netty-3.9.0.Final.jar
Binary files differ
diff --git a/tools/stress/README.txt b/tools/stress/README.txt
index f39a8d7..9f745c1 100644
--- a/tools/stress/README.txt
+++ b/tools/stress/README.txt
@@ -30,8 +30,8 @@
     -y or --family-type:
         Sets the ColumnFamily type.  One of 'Standard' or 'Super'.  If using super,
         you probably want to set the -u option also.
-    -c or --columns:
-        the number of columns per row, defaults to 5
+    -c or --cells:
+        the number of cells per row, defaults to 5
     -u or --supercolumns:
         use the number of supercolumns specified NOTE: you must set the -y
         option appropriately, or this option has no effect.
diff --git a/tools/stress/src/org/apache/cassandra/stress/Operation.java b/tools/stress/src/org/apache/cassandra/stress/Operation.java
new file mode 100644
index 0000000..5560240
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/Operation.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.Partition;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.*;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.InvalidRequestException;
+import org.apache.cassandra.transport.SimpleClient;
+
+public abstract class Operation
+{
+    public final StressSettings settings;
+    public final Timer timer;
+    public final PartitionGenerator generator;
+    public final Distribution partitionCount;
+
+    protected List<Partition> partitions;
+
+    public Operation(Timer timer, PartitionGenerator generator, StressSettings settings, Distribution partitionCount)
+    {
+        this.generator = generator;
+        this.timer = timer;
+        this.settings = settings;
+        this.partitionCount = partitionCount;
+    }
+
+    public static interface RunOp
+    {
+        public boolean run() throws Exception;
+        public int partitionCount();
+        public int rowCount();
+    }
+
+    protected void setPartitions(List<Partition> partitions)
+    {
+        this.partitions = partitions;
+    }
+
+    public boolean isWrite()
+    {
+        return false;
+    }
+
+    /**
+     * Run operation
+     * @param client Cassandra Thrift client connection
+     * @throws IOException on any I/O error.
+     */
+    public abstract void run(ThriftClient client) throws IOException;
+
+    public void run(SimpleClient client) throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    public void run(JavaDriverClient client) throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    public void timeWithRetry(RunOp run) throws IOException
+    {
+        timer.start();
+
+        boolean success = false;
+        String exceptionMessage = null;
+
+        int tries = 0;
+        for (; tries < settings.errors.tries; tries++)
+        {
+            try
+            {
+                success = run.run();
+                break;
+            }
+            catch (Exception e)
+            {
+                switch (settings.log.level)
+                {
+                    case MINIMAL:
+                        break;
+
+                    case NORMAL:
+                        System.err.println(e);
+                        break;
+
+                    case VERBOSE:
+                        e.printStackTrace(System.err);
+                        break;
+
+                    default:
+                        throw new AssertionError();
+                }
+                exceptionMessage = getExceptionMessage(e);
+            }
+        }
+
+        timer.stop(run.partitionCount(), run.rowCount());
+
+        if (!success)
+        {
+            error(String.format("Operation x%d on key(s) %s: %s%n",
+                    tries,
+                    key(),
+                    (exceptionMessage == null)
+                        ? "Data returned was not validated"
+                        : "Error executing: " + exceptionMessage));
+        }
+
+    }
+
+    private String key()
+    {
+        List<String> keys = new ArrayList<>();
+        for (Partition partition : partitions)
+            keys.add(partition.getKeyAsString());
+        return keys.toString();
+    }
+
+    protected String getExceptionMessage(Exception e)
+    {
+        String className = e.getClass().getSimpleName();
+        String message = (e instanceof InvalidRequestException) ? ((InvalidRequestException) e).getWhy() : e.getMessage();
+        return (message == null) ? "(" + className + ")" : String.format("(%s): %s", className, message);
+    }
+
+    protected void error(String message) throws IOException
+    {
+        if (!settings.errors.ignore)
+            throw new IOException(message);
+        else if (settings.log.level.compareTo(SettingsLog.Level.MINIMAL) > 0)
+            System.err.println(message);
+    }
+
+}
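
For orientation, here is a minimal sketch of how a concrete operation plugs into the run(ThriftClient)/timeWithRetry/RunOp contract introduced above. DummyReadOp is hypothetical (nothing like it ships with the tool), the counts it reports are placeholders, and it assumes the stress classes from this patch are on the classpath:

    package org.apache.cassandra.stress;

    import java.io.IOException;

    import org.apache.cassandra.stress.generate.Distribution;
    import org.apache.cassandra.stress.generate.PartitionGenerator;
    import org.apache.cassandra.stress.settings.StressSettings;
    import org.apache.cassandra.stress.util.ThriftClient;
    import org.apache.cassandra.stress.util.Timer;

    public class DummyReadOp extends Operation
    {
        public DummyReadOp(Timer timer, PartitionGenerator generator, StressSettings settings, Distribution partitionCount)
        {
            super(timer, generator, settings, partitionCount);
        }

        @Override
        public void run(ThriftClient client) throws IOException
        {
            // timeWithRetry() times the attempt, retries up to settings.errors.tries times,
            // and reports via error() if no attempt succeeds.
            timeWithRetry(new RunOp()
            {
                public boolean run() throws Exception
                {
                    // a real operation would issue its request here and validate the response
                    return true;
                }

                public int partitionCount() { return 1; } // placeholder count
                public int rowCount()       { return 1; } // placeholder count
            });
        }
    }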
diff --git a/tools/stress/src/org/apache/cassandra/stress/Session.java b/tools/stress/src/org/apache/cassandra/stress/Session.java
deleted file mode 100644
index 9ac865d..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/Session.java
+++ /dev/null
@@ -1,867 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress;
-
-import java.io.*;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.base.Joiner;
-import org.apache.commons.cli.*;
-import org.apache.commons.lang3.StringUtils;
-import com.yammer.metrics.Metrics;
-
-import org.apache.cassandra.auth.IAuthenticator;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions;
-import org.apache.cassandra.config.EncryptionOptions;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.transport.SimpleClient;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.thrift.protocol.TBinaryProtocol;
-import org.apache.thrift.transport.TTransport;
-
-public class Session implements Serializable
-{
-    // command line options
-    public static final Options availableOptions = new Options();
-
-    public static final String KEYSPACE_NAME = "Keyspace1";
-    public static final String DEFAULT_COMPARATOR = "AsciiType";
-    public static final String DEFAULT_VALIDATOR  = "BytesType";
-
-    private static InetAddress localInetAddress;
-
-    public final AtomicInteger operations = new AtomicInteger();
-    public final AtomicInteger keys = new AtomicInteger();
-    public final com.yammer.metrics.core.Timer latency = Metrics.newTimer(Session.class, "latency");
-
-    private static final String SSL_TRUSTSTORE = "truststore";
-    private static final String SSL_TRUSTSTORE_PW = "truststore-password";
-    private static final String SSL_PROTOCOL = "ssl-protocol";
-    private static final String SSL_ALGORITHM = "ssl-alg";
-    private static final String SSL_STORE_TYPE = "store-type";
-    private static final String SSL_CIPHER_SUITES = "ssl-ciphers";
-
-    static
-    {
-        availableOptions.addOption("h",  "help",                 false,  "Show this help message and exit");
-        availableOptions.addOption("n",  "num-keys",             true,   "Number of keys, default:1000000");
-        availableOptions.addOption("F",  "num-different-keys",   true,   "Number of different keys (if < NUM-KEYS, the same key will re-used multiple times), default:NUM-KEYS");
-        availableOptions.addOption("N",  "skip-keys",            true,   "Fraction of keys to skip initially, default:0");
-        availableOptions.addOption("t",  "threads",              true,   "Number of threads to use, default:50");
-        availableOptions.addOption("c",  "columns",              true,   "Number of columns per key, default:5");
-        availableOptions.addOption("S",  "column-size",          true,   "Size of column values in bytes, default:34");
-        availableOptions.addOption("C",  "cardinality",          true,   "Number of unique values stored in columns, default:50");
-        availableOptions.addOption("d",  "nodes",                true,   "Host nodes (comma separated), default:locahost");
-        availableOptions.addOption("D",  "nodesfile",            true,   "File containing host nodes (one per line)");
-        availableOptions.addOption("s",  "stdev",                true,   "Standard Deviation Factor, default:0.1");
-        availableOptions.addOption("r",  "random",               false,  "Use random key generator (STDEV will have no effect), default:false");
-        availableOptions.addOption("f",  "file",                 true,   "Write output to given file");
-        availableOptions.addOption("p",  "port",                 true,   "Thrift port, default:9160");
-        availableOptions.addOption("o",  "operation",            true,   "Operation to perform (INSERT, READ, RANGE_SLICE, INDEXED_RANGE_SLICE, MULTI_GET, COUNTER_ADD, COUNTER_GET), default:INSERT");
-        availableOptions.addOption("u",  "supercolumns",         true,   "Number of super columns per key, default:1");
-        availableOptions.addOption("y",  "family-type",          true,   "Column Family Type (Super, Standard), default:Standard");
-        availableOptions.addOption("K",  "keep-trying",          true,   "Retry on-going operation N times (in case of failure). positive integer, default:10");
-        availableOptions.addOption("k",  "keep-going",           false,  "Ignore errors inserting or reading (when set, --keep-trying has no effect), default:false");
-        availableOptions.addOption("i",  "progress-interval",    true,   "Progress Report Interval (seconds), default:10");
-        availableOptions.addOption("g",  "keys-per-call",        true,   "Number of keys to get_range_slices or multiget per call, default:1000");
-        availableOptions.addOption("l",  "replication-factor",   true,   "Replication Factor to use when creating needed column families, default:1");
-        availableOptions.addOption("L",  "enable-cql",           false,  "Perform queries using CQL2 (Cassandra Query Language v 2.0.0)");
-        availableOptions.addOption("L3", "enable-cql3",          false,  "Perform queries using CQL3 (Cassandra Query Language v 3.0.0)");
-        availableOptions.addOption("b",  "enable-native-protocol",  false,  "Use the binary native protocol (only work along with -L3)");
-        availableOptions.addOption("P",  "use-prepared-statements", false, "Perform queries using prepared statements (only applicable to CQL).");
-        availableOptions.addOption("e",  "consistency-level",    true,   "Consistency Level to use (ONE, QUORUM, LOCAL_QUORUM, EACH_QUORUM, ALL, ANY), default:ONE");
-        availableOptions.addOption("x",  "create-index",         true,   "Type of index to create on needed column families (KEYS)");
-        availableOptions.addOption("R",  "replication-strategy", true,   "Replication strategy to use (only on insert if keyspace does not exist), default:org.apache.cassandra.locator.SimpleStrategy");
-        availableOptions.addOption("O",  "strategy-properties",  true,   "Replication strategy properties in the following format <dc_name>:<num>,<dc_name>:<num>,...");
-        availableOptions.addOption("W",  "no-replicate-on-write",false,  "Set replicate_on_write to false for counters. Only counter add with CL=ONE will work");
-        availableOptions.addOption("V",  "average-size-values",  false,  "Generate column values of average rather than specific size");
-        availableOptions.addOption("T",  "send-to",              true,   "Send this as a request to the stress daemon at specified address.");
-        availableOptions.addOption("I",  "compression",          true,   "Specify the compression to use for sstable, default:no compression");
-        availableOptions.addOption("Q",  "query-names",          true,   "Comma-separated list of column names to retrieve from each row.");
-        availableOptions.addOption("Z",  "compaction-strategy",  true,   "CompactionStrategy to use.");
-        availableOptions.addOption("U",  "comparator",           true,   "Column Comparator to use. Currently supported types are: TimeUUIDType, AsciiType, UTF8Type.");
-        availableOptions.addOption("tf", "transport-factory",    true,   "Fully-qualified TTransportFactory class name for creating a connection. Note: For Thrift over SSL, use org.apache.cassandra.stress.SSLTransportFactory.");
-        availableOptions.addOption("ns", "no-statistics",        false,  "Turn off the aggegate statistics that is normally output after completion.");
-        availableOptions.addOption("ts", SSL_TRUSTSTORE,         true, "SSL: full path to truststore");
-        availableOptions.addOption("tspw", SSL_TRUSTSTORE_PW,    true, "SSL: full path to truststore");
-        availableOptions.addOption("prtcl", SSL_PROTOCOL,        true, "SSL: connections protocol to use (default: TLS)");
-        availableOptions.addOption("alg", SSL_ALGORITHM,         true, "SSL: algorithm (default: SunX509)");
-        availableOptions.addOption("st", SSL_STORE_TYPE,         true, "SSL: type of store");
-        availableOptions.addOption("ciphers", SSL_CIPHER_SUITES, true, "SSL: comma-separated list of encryption suites to use");
-        availableOptions.addOption("th", "throttle",             true, "Throttle the total number of operations per second to a maximum amount.");
-        availableOptions.addOption("un", "username",             true, "Username for authentication.");
-        availableOptions.addOption("pw", "password",             true, "Password for authentication.");
-    }
-
-    private int numKeys          = 1000 * 1000;
-    private int numDifferentKeys = numKeys;
-    private float skipKeys       = 0;
-    private int threads          = 50;
-    private int columns          = 5;
-    private int columnSize       = 34;
-    private int cardinality      = 50;
-    public String[] nodes        = new String[] { "127.0.0.1" };
-    private boolean random       = false;
-    private int retryTimes       = 10;
-    public int port              = 9160;
-    private int superColumns     = 1;
-    private String compression   = null;
-    private String compactionStrategy = null;
-    private String username      = null;
-    private String password      = null;
-
-    private int progressInterval  = 10;
-    private int keysPerCall       = 1000;
-    private boolean replicateOnWrite = true;
-    private boolean ignoreErrors  = false;
-    private boolean enable_cql    = false;
-    private boolean use_prepared  = false;
-    private boolean trace         = false;
-    private boolean captureStatistics = true;
-    public boolean use_native_protocol = false;
-    private double maxOpsPerSecond = Double.MAX_VALUE;
-
-    private final String outFileName;
-
-    private IndexType indexType = null;
-    private Stress.Operations operation = Stress.Operations.INSERT;
-    private ColumnFamilyType columnFamilyType = ColumnFamilyType.Standard;
-    private ConsistencyLevel consistencyLevel = ConsistencyLevel.ONE;
-    private String replicationStrategy = "org.apache.cassandra.locator.SimpleStrategy";
-    private Map<String, String> replicationStrategyOptions = new HashMap<String, String>();
-
-    // if we know exactly column names that we want to read (set by -Q option)
-    public final List<ByteBuffer> columnNames;
-
-    public String cqlVersion;
-
-    public final boolean averageSizeValues;
-
-    // required by Gaussian distribution.
-    protected int   mean;
-    protected float sigma;
-
-    public final InetAddress sendToDaemon;
-    public final String comparator;
-    public final boolean timeUUIDComparator;
-    public double traceProbability = 0.0;
-    public EncryptionOptions encOptions = new ClientEncryptionOptions();
-    public ITransportFactory transportFactory = new TFramedTransportFactory();
-
-    public Session(String[] arguments) throws IllegalArgumentException, SyntaxException
-    {
-        float STDev = 0.1f;
-        CommandLineParser parser = new PosixParser();
-
-        try
-        {
-            CommandLine cmd = parser.parse(availableOptions, arguments);
-
-            if (cmd.getArgs().length > 0)
-            {
-                System.err.println("Application does not allow arbitrary arguments: " + StringUtils.join(cmd.getArgList(), ", "));
-                System.exit(1);
-            }
-
-            if (cmd.hasOption("h"))
-                throw new IllegalArgumentException("help");
-
-            if (cmd.hasOption("n"))
-                numKeys = Integer.parseInt(cmd.getOptionValue("n"));
-
-            if (cmd.hasOption("F"))
-                numDifferentKeys = Integer.parseInt(cmd.getOptionValue("F"));
-            else
-                numDifferentKeys = numKeys;
-
-            if (cmd.hasOption("N"))
-                skipKeys = Float.parseFloat(cmd.getOptionValue("N"));
-
-            if (cmd.hasOption("t"))
-                threads = Integer.parseInt(cmd.getOptionValue("t"));
-
-            if (cmd.hasOption("c"))
-                columns = Integer.parseInt(cmd.getOptionValue("c"));
-
-            if (cmd.hasOption("S"))
-                columnSize = Integer.parseInt(cmd.getOptionValue("S"));
-
-            if (cmd.hasOption("C"))
-                cardinality = Integer.parseInt(cmd.getOptionValue("C"));
-
-            if (cmd.hasOption("d"))
-                nodes = cmd.getOptionValue("d").split(",");
-
-            if (cmd.hasOption("D"))
-            {
-                try
-                {
-                    String node;
-                    List<String> tmpNodes = new ArrayList<String>();
-                    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(cmd.getOptionValue("D"))));
-                    try
-                    {
-                        while ((node = in.readLine()) != null)
-                        {
-                            if (node.length() > 0)
-                                tmpNodes.add(node);
-                        }
-                        nodes = tmpNodes.toArray(new String[tmpNodes.size()]);
-                    }
-                    finally
-                    {
-                        in.close();
-                    }
-                }
-                catch(IOException ioe)
-                {
-                    throw new RuntimeException(ioe);
-                }
-            }
-
-            if (cmd.hasOption("s"))
-                STDev = Float.parseFloat(cmd.getOptionValue("s"));
-
-            if (cmd.hasOption("r"))
-                random = true;
-
-            outFileName = (cmd.hasOption("f")) ? cmd.getOptionValue("f") : null;
-
-            if (cmd.hasOption("p"))
-                port = Integer.parseInt(cmd.getOptionValue("p"));
-
-            if (cmd.hasOption("o"))
-                operation = Stress.Operations.valueOf(cmd.getOptionValue("o").toUpperCase());
-
-            if (cmd.hasOption("u"))
-                superColumns = Integer.parseInt(cmd.getOptionValue("u"));
-
-            if (cmd.hasOption("y"))
-                columnFamilyType = ColumnFamilyType.valueOf(cmd.getOptionValue("y"));
-
-            if (cmd.hasOption("K"))
-            {
-                retryTimes = Integer.valueOf(cmd.getOptionValue("K"));
-
-                if (retryTimes <= 0)
-                {
-                    throw new RuntimeException("--keep-trying option value should be > 0");
-                }
-            }
-
-            if (cmd.hasOption("k"))
-            {
-                retryTimes = 1;
-                ignoreErrors = true;
-            }
-
-
-            if (cmd.hasOption("i"))
-                progressInterval = Integer.parseInt(cmd.getOptionValue("i"));
-
-            if (cmd.hasOption("g"))
-                keysPerCall = Integer.parseInt(cmd.getOptionValue("g"));
-
-            if (cmd.hasOption("th"))
-                maxOpsPerSecond = Double.parseDouble(cmd.getOptionValue("th"));
-
-            if (cmd.hasOption("e"))
-                consistencyLevel = ConsistencyLevel.valueOf(cmd.getOptionValue("e").toUpperCase());
-
-            if (cmd.hasOption("x"))
-                indexType = IndexType.valueOf(cmd.getOptionValue("x").toUpperCase());
-
-            if (cmd.hasOption("R"))
-                replicationStrategy = cmd.getOptionValue("R");
-
-            if (cmd.hasOption("l"))
-                replicationStrategyOptions.put("replication_factor", String.valueOf(Integer.parseInt(cmd.getOptionValue("l"))));
-            else if (replicationStrategy.endsWith("SimpleStrategy"))
-                replicationStrategyOptions.put("replication_factor", "1");
-
-            if (cmd.hasOption("L"))
-            {
-                enable_cql = true;
-                cqlVersion = "2.0.0";
-            }
-
-            if (cmd.hasOption("L3"))
-            {
-                enable_cql = true;
-                cqlVersion = "3.0.0";
-            }
-
-            if (cmd.hasOption("b"))
-            {
-                if (!(enable_cql && cqlVersion.startsWith("3")))
-                    throw new IllegalArgumentException("Cannot use binary protocol without -L3");
-                use_native_protocol = true;
-            }
-
-            if (cmd.hasOption("P"))
-            {
-                if (!enable_cql)
-                {
-                    System.err.println("-P/--use-prepared-statements is only applicable with CQL (-L/--enable-cql)");
-                    System.exit(-1);
-                }
-                use_prepared = true;
-            }
-
-            if (cmd.hasOption("O"))
-            {
-                String[] pairs = StringUtils.split(cmd.getOptionValue("O"), ',');
-
-                for (String pair : pairs)
-                {
-                    String[] keyAndValue = StringUtils.split(pair, ':');
-
-                    if (keyAndValue.length != 2)
-                        throw new RuntimeException("Invalid --strategy-properties value.");
-
-                    replicationStrategyOptions.put(keyAndValue[0], keyAndValue[1]);
-                }
-            }
-
-            if (cmd.hasOption("W"))
-                replicateOnWrite = false;
-
-            if (cmd.hasOption("I"))
-                compression = cmd.getOptionValue("I");
-
-            averageSizeValues = cmd.hasOption("V");
-
-            try
-            {
-                sendToDaemon = cmd.hasOption("send-to")
-                                ? InetAddress.getByName(cmd.getOptionValue("send-to"))
-                                : null;
-            }
-            catch (UnknownHostException e)
-            {
-                throw new RuntimeException(e);
-            }
-
-            if (cmd.hasOption("Q"))
-            {
-                AbstractType comparator = TypeParser.parse(DEFAULT_COMPARATOR);
-
-                String[] names = StringUtils.split(cmd.getOptionValue("Q"), ",");
-                columnNames = new ArrayList<ByteBuffer>(names.length);
-
-                for (String columnName : names)
-                    columnNames.add(comparator.fromString(columnName));
-            }
-            else
-            {
-                columnNames = null;
-            }
-
-            if (cmd.hasOption("Z"))
-            {
-                compactionStrategy = cmd.getOptionValue("Z");
-
-                try
-                {
-                    // validate compaction strategy class
-                    CFMetaData.createCompactionStrategy(compactionStrategy);
-                }
-                catch (ConfigurationException e)
-                {
-                    System.err.println(e.getMessage());
-                    System.exit(1);
-                }
-            }
-
-            if (cmd.hasOption("U"))
-            {
-                AbstractType parsed = null;
-
-                try
-                {
-                    parsed = TypeParser.parse(cmd.getOptionValue("U"));
-                }
-                catch (ConfigurationException e)
-                {
-                    System.err.println(e.getMessage());
-                    System.exit(1);
-                }
-
-                comparator = cmd.getOptionValue("U");
-                timeUUIDComparator = parsed instanceof TimeUUIDType;
-
-                if (!(parsed instanceof TimeUUIDType || parsed instanceof AsciiType || parsed instanceof UTF8Type))
-                {
-                    System.err.println("Currently supported types are: TimeUUIDType, AsciiType, UTF8Type.");
-                    System.exit(1);
-                }
-            }
-            else
-            {
-                comparator = null;
-                timeUUIDComparator = false;
-            }
-
-            if (cmd.hasOption("ns"))
-            {
-                captureStatistics = false;
-            }
-
-            if(cmd.hasOption(SSL_TRUSTSTORE))
-                encOptions.truststore = cmd.getOptionValue(SSL_TRUSTSTORE);
-
-            if(cmd.hasOption(SSL_TRUSTSTORE_PW))
-                encOptions.truststore_password = cmd.getOptionValue(SSL_TRUSTSTORE_PW);
-
-            if(cmd.hasOption(SSL_PROTOCOL))
-                encOptions.protocol = cmd.getOptionValue(SSL_PROTOCOL);
-
-            if(cmd.hasOption(SSL_ALGORITHM))
-                encOptions.algorithm = cmd.getOptionValue(SSL_ALGORITHM);
-
-            if(cmd.hasOption(SSL_STORE_TYPE))
-                encOptions.store_type = cmd.getOptionValue(SSL_STORE_TYPE);
-
-            if(cmd.hasOption(SSL_CIPHER_SUITES))
-                encOptions.cipher_suites = cmd.getOptionValue(SSL_CIPHER_SUITES).split(",");
-
-            if (cmd.hasOption("tf"))
-            {
-                transportFactory = validateAndSetTransportFactory(cmd.getOptionValue("tf"));
-                configureTransportFactory(transportFactory, encOptions);
-            }
-
-            if (cmd.hasOption("un"))
-                username = cmd.getOptionValue("un");
-
-            if (cmd.hasOption("pw"))
-                password = cmd.getOptionValue("pw");
-        }
-        catch (ParseException e)
-        {
-            throw new IllegalArgumentException(e.getMessage(), e);
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IllegalStateException(e.getMessage(), e);
-        }
-
-        mean  = numDifferentKeys / 2;
-        sigma = numDifferentKeys * STDev;
-    }
-
-    private ITransportFactory validateAndSetTransportFactory(String transportFactory)
-    {
-        try
-        {
-            Class factory = Class.forName(transportFactory);
-
-            if(!ITransportFactory.class.isAssignableFrom(factory))
-                throw new IllegalArgumentException(String.format("transport factory '%s' " +
-                        "not derived from ITransportFactory", transportFactory));
-
-            return (ITransportFactory) factory.newInstance();
-        }
-        catch (Exception e)
-        {
-            throw new IllegalArgumentException(String.format("Cannot create a transport factory '%s'.", transportFactory), e);
-        }
-    }
-
-    private void configureTransportFactory(ITransportFactory transportFactory, EncryptionOptions encOptions)
-    {
-        Map<String, String> options = new HashMap<>();
-        // If the supplied factory supports the same set of options as our SSL impl, set those
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.TRUSTSTORE))
-            options.put(SSLTransportFactory.TRUSTSTORE, encOptions.truststore);
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.TRUSTSTORE_PASSWORD))
-            options.put(SSLTransportFactory.TRUSTSTORE_PASSWORD, encOptions.truststore_password);
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.PROTOCOL))
-            options.put(SSLTransportFactory.PROTOCOL, encOptions.protocol);
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.CIPHER_SUITES))
-            options.put(SSLTransportFactory.CIPHER_SUITES, Joiner.on(',').join(encOptions.cipher_suites));
-
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.KEYSTORE)
-                && encOptions.require_client_auth)
-            options.put(SSLTransportFactory.KEYSTORE, encOptions.keystore);
-        if (transportFactory.supportedOptions().contains(SSLTransportFactory.KEYSTORE_PASSWORD)
-                && encOptions.require_client_auth)
-            options.put(SSLTransportFactory.KEYSTORE_PASSWORD, encOptions.keystore_password);
-
-        // Now check if any of the factory's supported options are set as system properties
-        for (String optionKey : transportFactory.supportedOptions())
-            if (System.getProperty(optionKey) != null)
-                options.put(optionKey, System.getProperty(optionKey));
-
-        transportFactory.setOptions(options);
-    }
-
-    public int getCardinality()
-    {
-        return cardinality;
-    }
-
-    public int getColumnSize()
-    {
-        return columnSize;
-    }
-
-    public int getColumnsPerKey()
-    {
-        return columns;
-    }
-
-    public ColumnFamilyType getColumnFamilyType()
-    {
-        return columnFamilyType;
-    }
-
-    public int getNumKeys()
-    {
-        return numKeys;
-    }
-
-    public int getNumDifferentKeys()
-    {
-        return numDifferentKeys;
-    }
-
-    public int getThreads()
-    {
-        return threads;
-    }
-
-    public double getMaxOpsPerSecond()
-    {
-        return maxOpsPerSecond;
-    }
-
-    public float getSkipKeys()
-    {
-        return skipKeys;
-    }
-
-    public int getSuperColumns()
-    {
-        return superColumns;
-    }
-
-    public int getKeysPerThread()
-    {
-        return numKeys / threads;
-    }
-
-    public int getTotalKeysLength()
-    {
-        return Integer.toString(numDifferentKeys).length();
-    }
-
-    public ConsistencyLevel getConsistencyLevel()
-    {
-        return consistencyLevel;
-    }
-
-    public int getRetryTimes()
-    {
-        return retryTimes;
-    }
-
-    public boolean ignoreErrors()
-    {
-        return ignoreErrors;
-    }
-
-    public Stress.Operations getOperation()
-    {
-        return operation;
-    }
-
-    public PrintStream getOutputStream()
-    {
-        try
-        {
-            return (outFileName == null) ? System.out : new PrintStream(new FileOutputStream(outFileName));
-        }
-        catch (FileNotFoundException e)
-        {
-            throw new RuntimeException(e.getMessage(), e);
-        }
-    }
-
-    public int getProgressInterval()
-    {
-        return progressInterval;
-    }
-
-    public boolean useRandomGenerator()
-    {
-        return random;
-    }
-
-    public int getKeysPerCall()
-    {
-        return keysPerCall;
-    }
-
-    // required by Gaussian distribution
-    public int getMean()
-    {
-        return mean;
-    }
-
-    // required by Gaussian distribution
-    public float getSigma()
-    {
-        return sigma;
-    }
-
-    public boolean isCQL()
-    {
-        return enable_cql;
-    }
-
-    public boolean usePreparedStatements()
-    {
-        return use_prepared;
-    }
-
-    public boolean outputStatistics()
-    {
-        return captureStatistics;
-    }
-
-    /**
-     * Create Keyspace with Standard and Super/Counter column families
-     */
-    public void createKeySpaces()
-    {
-        KsDef keyspace = new KsDef();
-        String defaultComparator = comparator == null ? DEFAULT_COMPARATOR : comparator;
-
-        // column family for standard columns
-        CfDef standardCfDef = new CfDef(KEYSPACE_NAME, "Standard1");
-        Map<String, String> compressionOptions = new HashMap<String, String>();
-        if (compression != null)
-            compressionOptions.put("sstable_compression", compression);
-
-        standardCfDef.setComparator_type(defaultComparator)
-                     .setDefault_validation_class(DEFAULT_VALIDATOR)
-                     .setCompression_options(compressionOptions);
-
-        if (!timeUUIDComparator)
-        {
-            for (int i = 0; i < getColumnsPerKey(); i++)
-            {
-                standardCfDef.addToColumn_metadata(new ColumnDef(ByteBufferUtil.bytes("C" + i), "BytesType"));
-            }
-        }
-
-        if (indexType != null)
-        {
-            ColumnDef standardColumn = new ColumnDef(ByteBufferUtil.bytes("C1"), "BytesType");
-            standardColumn.setIndex_type(indexType).setIndex_name("Idx1");
-            standardCfDef.setColumn_metadata(Arrays.asList(standardColumn));
-        }
-
-        // column family with super columns
-        CfDef superCfDef = new CfDef(KEYSPACE_NAME, "Super1").setColumn_type("Super");
-        superCfDef.setComparator_type(DEFAULT_COMPARATOR)
-                  .setSubcomparator_type(defaultComparator)
-                  .setDefault_validation_class(DEFAULT_VALIDATOR)
-                  .setCompression_options(compressionOptions);
-
-        // column family for standard counters
-        CfDef counterCfDef = new CfDef(KEYSPACE_NAME, "Counter1").setComparator_type(defaultComparator)
-                                                                 .setComparator_type(defaultComparator)
-                                                                 .setDefault_validation_class("CounterColumnType")
-                                                                 .setReplicate_on_write(replicateOnWrite)
-                                                                 .setCompression_options(compressionOptions);
-
-        // column family with counter super columns
-        CfDef counterSuperCfDef = new CfDef(KEYSPACE_NAME, "SuperCounter1").setComparator_type(defaultComparator)
-                                                                           .setDefault_validation_class("CounterColumnType")
-                                                                           .setReplicate_on_write(replicateOnWrite)
-                                                                           .setColumn_type("Super")
-                                                                           .setCompression_options(compressionOptions);
-
-        keyspace.setName(KEYSPACE_NAME);
-        keyspace.setStrategy_class(replicationStrategy);
-
-        if (!replicationStrategyOptions.isEmpty())
-        {
-            keyspace.setStrategy_options(replicationStrategyOptions);
-        }
-
-        if (compactionStrategy != null)
-        {
-            standardCfDef.setCompaction_strategy(compactionStrategy);
-            superCfDef.setCompaction_strategy(compactionStrategy);
-            counterCfDef.setCompaction_strategy(compactionStrategy);
-            counterSuperCfDef.setCompaction_strategy(compactionStrategy);
-        }
-
-        keyspace.setCf_defs(new ArrayList<CfDef>(Arrays.asList(standardCfDef, superCfDef, counterCfDef, counterSuperCfDef)));
-
-        CassandraClient client = getClient(false);
-
-        try
-        {
-            client.system_add_keyspace(keyspace);
-
-            /* CQL3 counter cf */
-            client.set_cql_version("3.0.0"); // just to create counter cf for cql3
-
-            client.set_keyspace(KEYSPACE_NAME);
-            client.execute_cql3_query(createCounterCFStatementForCQL3(), Compression.NONE, ConsistencyLevel.ONE);
-
-            if (enable_cql)
-                client.set_cql_version(cqlVersion);
-            /* end */
-
-            System.out.println(String.format("Created keyspaces. Sleeping %ss for propagation.", nodes.length));
-            Thread.sleep(nodes.length * 1000); // seconds
-        }
-        catch (InvalidRequestException e)
-        {
-            System.err.println("Unable to create stress keyspace: " + e.getWhy());
-        }
-        catch (Exception e)
-        {
-            System.err.println(e.getMessage());
-        }
-    }
-
-    /**
-     * Thrift client connection with Keyspace1 set.
-     * @return cassandra client connection
-     */
-    public CassandraClient getClient()
-    {
-        return getClient(true);
-    }
-
-    /**
-     * Thrift client connection
-     * @param setKeyspace - should we set keyspace for client or not
-     * @return cassandra client connection
-     */
-    public CassandraClient getClient(boolean setKeyspace)
-    {
-        // random node selection for fake load balancing
-        String currentNode = nodes[Stress.randomizer.nextInt(nodes.length)];
-
-        try
-        {
-            TTransport transport = transportFactory.openTransport(currentNode, port);
-            CassandraClient client = new CassandraClient(new TBinaryProtocol(transport));
-
-            if (!transport.isOpen())
-                transport.open();
-
-            if (enable_cql)
-                client.set_cql_version(cqlVersion);
-
-            if (setKeyspace)
-                client.set_keyspace("Keyspace1");
-
-            if (username != null && password != null)
-            {
-                Map<String, String> credentials = new HashMap<String, String>();
-                credentials.put(IAuthenticator.USERNAME_KEY, username);
-                credentials.put(IAuthenticator.PASSWORD_KEY, password);
-                AuthenticationRequest authenticationRequest = new AuthenticationRequest(credentials);
-                client.login(authenticationRequest);
-            }
-            return client;
-        }
-        catch (AuthenticationException e)
-        {
-            throw new RuntimeException(e.getWhy());
-        }
-        catch (AuthorizationException e)
-        {
-            throw new RuntimeException(e.getWhy());
-        }
-        catch (InvalidRequestException e)
-        {
-            throw new RuntimeException(e.getWhy());
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e.getMessage());
-        }
-    }
-
-    public SimpleClient getNativeClient()
-    {
-        try
-        {
-            String currentNode = nodes[Stress.randomizer.nextInt(nodes.length)];
-            SimpleClient client = new SimpleClient(currentNode, 9042);
-            client.connect(false);
-            client.execute("USE \"Keyspace1\";", org.apache.cassandra.db.ConsistencyLevel.ONE);
-            return client;
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e.getMessage());
-        }
-    }
-
-    public static InetAddress getLocalAddress()
-    {
-        if (localInetAddress == null)
-        {
-            try
-            {
-                localInetAddress = InetAddress.getLocalHost();
-            }
-            catch (UnknownHostException e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-
-        return localInetAddress;
-    }
-
-    private ByteBuffer createCounterCFStatementForCQL3()
-    {
-        StringBuilder counter3 = new StringBuilder("CREATE TABLE \"Counter3\" (KEY blob PRIMARY KEY, ");
-
-        for (int i = 0; i < getColumnsPerKey(); i++)
-        {
-            counter3.append("c").append(i).append(" counter");
-            if (i != getColumnsPerKey() - 1)
-                counter3.append(", ");
-        }
-        counter3.append(");");
-
-        return ByteBufferUtil.bytes(counter3.toString());
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/Stress.java b/tools/stress/src/org/apache/cassandra/stress/Stress.java
index 738a1c0..256cefb 100644
--- a/tools/stress/src/org/apache/cassandra/stress/Stress.java
+++ b/tools/stress/src/org/apache/cassandra/stress/Stress.java
@@ -17,48 +17,65 @@
  */
 package org.apache.cassandra.stress;
 
-import org.apache.commons.cli.Option;
-
 import java.io.*;
 import java.net.Socket;
 import java.net.SocketException;
-import java.util.Random;
+
+import org.apache.cassandra.stress.settings.StressSettings;
 
 public final class Stress
 {
-    public static enum Operations
-    {
-        INSERT, READ, RANGE_SLICE, INDEXED_RANGE_SLICE, MULTI_GET, COUNTER_ADD, COUNTER_GET
-    }
 
-    public static Session session;
-    public static Random randomizer = new Random();
+    /**
+     * Known issues:
+     * - uncertainty/stderr assumes op-rates are normally distributed. Due to GC (and possibly latency stepping from
+     * different media, though the variance of request ratio across media should be normally distributed), they are not.
+     * Should attempt to account for pauses in stderr calculation, possibly by assuming these pauses are a separate
+     * normally distributed occurrence
+     * - Under very mixed workloads, the uncertainty calculations and op/s reporting really don't mean much. Should
+     * consider breaking op/s down per workload, or should have a lower-bound on inspection interval based on clustering
+     * of operations and thread count.
+     *
+     *
+     * Future improvements:
+     * - Configurable connection compression
+     * - Java driver support
+     * - Per column data generators
+     * - Automatic column/schema detection if provided with a CF
+     * - target rate produces a very steady work rate, and if we want to simulate a real op rate for an
+     *   application we should have some variation in the actual op rate within any time-slice.
+     * - auto rate should vary the thread count based on performance improvement, potentially starting on a very low
+     *   thread count with a high error rate / low count to get some basic numbers
+     */
+
     private static volatile boolean stopped = false;
 
     public static void main(String[] arguments) throws Exception
     {
+        final StressSettings settings;
         try
         {
-            session = new Session(arguments);
+            settings = StressSettings.parse(arguments);
         }
         catch (IllegalArgumentException e)
         {
             printHelpMessage();
+            e.printStackTrace();
             return;
         }
 
-        PrintStream outStream = session.getOutputStream();
+        PrintStream logout = settings.log.getOutput();
 
-        if (session.sendToDaemon != null)
+        if (settings.sendToDaemon != null)
         {
-            Socket socket = new Socket(session.sendToDaemon, 2159);
+            Socket socket = new Socket(settings.sendToDaemon, 2159);
 
             ObjectOutputStream out = new ObjectOutputStream(socket.getOutputStream());
             BufferedReader inp = new BufferedReader(new InputStreamReader(socket.getInputStream()));
 
             Runtime.getRuntime().addShutdownHook(new ShutDown(socket, out));
 
-            out.writeObject(session);
+            out.writeObject(settings);
 
             String line;
 
@@ -72,7 +89,7 @@
                         break;
                     }
 
-                    outStream.println(line);
+                    logout.println(line);
                 }
             }
             catch (SocketException e)
@@ -88,11 +105,11 @@
         }
         else
         {
-            StressAction stressAction = new StressAction(session, outStream);
-            stressAction.start();
-            stressAction.join();
-            System.exit(stressAction.getReturnCode());
+            StressAction stressAction = new StressAction(settings, logout);
+            stressAction.run();
         }
+
+        System.exit(0);
     }
 
     /**
@@ -100,15 +117,7 @@
      */
     public static void printHelpMessage()
     {
-        System.out.println("Usage: ./bin/cassandra-stress [options]\n\nOptions:");
-
-        for(Object o : Session.availableOptions.getOptions())
-        {
-            Option option = (Option) o;
-            String upperCaseName = option.getLongOpt().toUpperCase();
-            System.out.println(String.format("-%s%s, --%s%s%n\t\t%s%n", option.getOpt(), (option.hasArg()) ? " "+upperCaseName : "",
-                                                            option.getLongOpt(), (option.hasArg()) ? "="+upperCaseName : "", option.getDescription()));
-        }
+        StressSettings.printHelp();
     }
 
     private static class ShutDown extends Thread
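
The new entry point hands everything off to StressSettings and StressAction. A stripped-down driver illustrating just that hand-off (a sketch against the classes touched by this patch; it omits the send-to-daemon socket path and the shutdown hook):

    package org.apache.cassandra.stress;

    import java.io.PrintStream;

    import org.apache.cassandra.stress.settings.StressSettings;

    public final class MiniStress
    {
        public static void main(String[] args)
        {
            StressSettings settings;
            try
            {
                settings = StressSettings.parse(args);   // replaces "new Session(args)"
            }
            catch (IllegalArgumentException e)
            {
                StressSettings.printHelp();
                return;
            }

            PrintStream out = settings.log.getOutput();  // stdout or the configured output file
            new StressAction(settings, out).run();       // StressAction is now a plain Runnable
        }
    }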
diff --git a/tools/stress/src/org/apache/cassandra/stress/StressAction.java b/tools/stress/src/org/apache/cassandra/stress/StressAction.java
index 7098d0b..b50637f 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressAction.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressAction.java
@@ -17,322 +17,403 @@
  */
 package org.apache.cassandra.stress;
 
+import java.io.IOException;
+import java.io.OutputStream;
 import java.io.PrintStream;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.SynchronousQueue;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
 
-import com.google.common.util.concurrent.Uninterruptibles;
 import com.google.common.util.concurrent.RateLimiter;
-import com.yammer.metrics.stats.Snapshot;
-import org.apache.cassandra.stress.operations.*;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
+import com.google.common.util.concurrent.Uninterruptibles;
+
+import org.apache.cassandra.stress.generate.Partition;
+import org.apache.cassandra.stress.operations.OpDistribution;
+import org.apache.cassandra.stress.operations.OpDistributionFactory;
+import org.apache.cassandra.stress.settings.*;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
 import org.apache.cassandra.transport.SimpleClient;
 
-public class StressAction extends Thread
+public class StressAction implements Runnable
 {
-    /**
-     * Producer-Consumer model: 1 producer, N consumers
-     */
-    private final BlockingQueue<Operation> operations = new SynchronousQueue<Operation>(true);
 
-    private final Session client;
+    private final StressSettings settings;
     private final PrintStream output;
 
-    private volatile boolean stop = false;
-
-    public static final int SUCCESS = 0;
-    public static final int FAILURE = 1;
-
-    private volatile int returnCode = -1;
-
-    public StressAction(Session session, PrintStream out)
+    public StressAction(StressSettings settings, PrintStream out)
     {
-        client = session;
+        this.settings = settings;
         output = out;
     }
 
     public void run()
     {
-        Snapshot latency;
-        long oldLatency;
-        int epoch, total, oldTotal, keyCount, oldKeyCount;
-
         // creating keyspace and column families
-        if (client.getOperation() == Stress.Operations.INSERT || client.getOperation() == Stress.Operations.COUNTER_ADD)
-            client.createKeySpaces();
+        settings.maybeCreateKeyspaces();
 
-        int threadCount = client.getThreads();
-        Consumer[] consumers = new Consumer[threadCount];
+        // TODO: warmup should operate configurably over op/pk/row, and be of configurable length
+        if (!settings.command.noWarmup)
+            warmup(settings.command.getFactory(settings));
 
-        output.println("total,interval_op_rate,interval_key_rate,latency,95th,99.9th,elapsed_time");
+        output.println("Sleeping 2s...");
+        Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
 
-        int itemsPerThread = client.getKeysPerThread();
-        int modulo = client.getNumKeys() % threadCount;
-        RateLimiter rateLimiter = RateLimiter.create(client.getMaxOpsPerSecond());
+        // TODO : move this to a new queue wrapper that gates progress based on a poisson (or configurable) distribution
+        RateLimiter rateLimiter = null;
+        if (settings.rate.opRateTargetPerSecond > 0)
+            rateLimiter = RateLimiter.create(settings.rate.opRateTargetPerSecond);
 
-        // creating required type of the threads for the test
-        for (int i = 0; i < threadCount; i++) {
-            if (i == threadCount - 1)
-                itemsPerThread += modulo; // last one is going to handle N + modulo items
+        boolean success;
+        if (settings.rate.minThreads > 0)
+            success = runMulti(settings.rate.auto, rateLimiter);
+        else
+            success = null != run(settings.command.getFactory(settings), settings.rate.threadCount, settings.command.count,
+                                  settings.command.duration, rateLimiter, settings.command.durationUnits, output);
 
-            consumers[i] = new Consumer(itemsPerThread, rateLimiter);
+        if (success)
+            output.println("END");
+        else
+            output.println("FAILURE");
+
+        settings.disconnect();
+    }
+
+    // type provided separately to support recursive call for mixed command with each command type it is performing
+    private void warmup(OpDistributionFactory operations)
+    {
+        // warmup - do 50k iterations; by default hotspot compiles methods after 10k invocations
+        PrintStream warmupOutput = new PrintStream(new OutputStream() { @Override public void write(int b) throws IOException { } } );
+        int iterations = 50000 * settings.node.nodes.size();
+        for (OpDistributionFactory single : operations.each())
+        {
+            // we need to warm up all the nodes in the cluster ideally, but we may not be the only stress instance;
+            // so warm up all the nodes we're speaking to only.
+            output.println(String.format("Warming up %s with %d iterations...", single.desc(), iterations));
+            run(single, 20, iterations, 0, null, null, warmupOutput);
         }
+    }
 
-        Producer producer = new Producer();
-        producer.start();
+    // TODO : permit varying more than just thread count
+    // TODO : vary thread count based on percentage improvement of previous increment, not by fixed amounts
+    private boolean runMulti(boolean auto, RateLimiter rateLimiter)
+    {
+        if (settings.command.targetUncertainty >= 0)
+            output.println("WARNING: uncertainty mode (err<) results in uneven workload between thread runs, so should be used for high level analysis only");
+        int prevThreadCount = -1;
+        int threadCount = settings.rate.minThreads;
+        List<StressMetrics> results = new ArrayList<>();
+        List<String> runIds = new ArrayList<>();
+        do
+        {
+            output.println(String.format("Running with %d threadCount", threadCount));
 
-        // starting worker threads
+            StressMetrics result = run(settings.command.getFactory(settings), threadCount, settings.command.count,
+                                       settings.command.duration, rateLimiter, settings.command.durationUnits, output);
+            if (result == null)
+                return false;
+            results.add(result);
+
+            if (prevThreadCount > 0)
+                System.out.println(String.format("Improvement over %d threadCount: %.0f%%",
+                        prevThreadCount, 100 * averageImprovement(results, 1)));
+
+            runIds.add(threadCount + " threadCount");
+            prevThreadCount = threadCount;
+            if (threadCount < 16)
+                threadCount *= 2;
+            else
+                threadCount *= 1.5;
+
+            if (!results.isEmpty() && threadCount > settings.rate.maxThreads)
+                break;
+
+            if (settings.command.type.updates)
+            {
+                // pause for an arbitrary period to let the commit log flush, etc.; this shouldn't make much difference,
+                // as we only ever increase the load, never decrease it
+                output.println("Sleeping for 15s");
+                try
+                {
+                    Thread.sleep(15 * 1000);
+                } catch (InterruptedException e)
+                {
+                    return false;
+                }
+            }
+            // run until we have stopped improving throughput significantly over the previous few runs
+        } while (!auto || (hasAverageImprovement(results, 3, 0) && hasAverageImprovement(results, 5, settings.command.targetUncertainty)));
+
+        // summarise all results
+        StressMetrics.summarise(runIds, results, output);
+        return true;
+    }
+
+    private boolean hasAverageImprovement(List<StressMetrics> results, int count, double minImprovement)
+    {
+        return results.size() < count + 1 || averageImprovement(results, count) >= minImprovement;
+    }
+
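+    // averages the relative op-rate improvement of each of the last 'count' runs over its predecessor;
+    // e.g. op rates of 1000, 1200, 1260 with count = 2 give (0.20 + 0.05) / 2 = 0.125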
+    private double averageImprovement(List<StressMetrics> results, int count)
+    {
+        double improvement = 0;
+        for (int i = results.size() - count ; i < results.size() ; i++)
+        {
+            double prev = results.get(i - 1).getTiming().getHistory().opRate();
+            double cur = results.get(i).getTiming().getHistory().opRate();
+            improvement += (cur - prev) / prev;
+        }
+        return improvement / count;
+    }
+
+    private StressMetrics run(OpDistributionFactory operations, int threadCount, long opCount, long duration, RateLimiter rateLimiter, TimeUnit durationUnits, PrintStream output)
+    {
+        output.println(String.format("Running %s with %d threads %s",
+                                     operations.desc(),
+                                     threadCount,
+                                     durationUnits != null ? duration + " " + durationUnits.toString().toLowerCase()
+                                        : opCount > 0      ? "for " + opCount + " iteration"
+                                                           : "until stderr of mean < " + settings.command.targetUncertainty));
+        final WorkManager workManager;
+        if (opCount < 0)
+            workManager = new ContinuousWorkManager();
+        else
+            workManager = new FixedWorkManager(opCount);
+
+        final StressMetrics metrics = new StressMetrics(output, settings.log.intervalMillis, settings);
+
+        final CountDownLatch done = new CountDownLatch(threadCount);
+        final Consumer[] consumers = new Consumer[threadCount];
+        for (int i = 0; i < threadCount; i++)
+            consumers[i] = new Consumer(operations, done, workManager, metrics, rateLimiter);
+
+        // starting the worker threads
         for (int i = 0; i < threadCount; i++)
             consumers[i].start();
 
-        // initialization of the values
-        boolean terminate = false;
-        epoch = total = keyCount = 0;
+        metrics.start();
 
-        int interval = client.getProgressInterval();
-        int epochIntervals = client.getProgressInterval() * 10;
-        long testStartTime = System.nanoTime();
-        
-        StressStatistics stats = new StressStatistics(client, output);
-
-        while (!terminate)
+        if (durationUnits != null)
         {
-            if (stop)
+            Uninterruptibles.sleepUninterruptibly(duration, durationUnits);
+            workManager.stop();
+        }
+        else if (opCount <= 0)
+        {
+            try
             {
-                producer.stopProducer();
-
-                for (Consumer consumer : consumers)
-                    consumer.stopConsume();
-
-                break;
-            }
-
-            Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
-
-            int alive = 0;
-            for (Thread thread : consumers)
-                if (thread.isAlive()) alive++;
-
-            if (alive == 0)
-                terminate = true;
-
-            epoch++;
-
-            if (terminate || epoch > epochIntervals)
-            {
-                epoch = 0;
-
-                oldTotal = total;
-                oldKeyCount = keyCount;
-
-                total = client.operations.get();
-                keyCount = client.keys.get();
-                latency = client.latency.getSnapshot();
-
-                int opDelta = total - oldTotal;
-                int keyDelta = keyCount - oldKeyCount;
-
-                long currentTimeInSeconds = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - testStartTime);
-
-                output.println(String.format("%d,%d,%d,%.1f,%.1f,%.1f,%d",
-                                             total,
-                                             opDelta / interval,
-                                             keyDelta / interval,
-                                             latency.getMedian(), latency.get95thPercentile(), latency.get999thPercentile(),
-                                             currentTimeInSeconds));
-
-                if (client.outputStatistics()) {
-                    stats.addIntervalStats(total, 
-                                           opDelta / interval, 
-                                           keyDelta / interval, 
-                                           latency, 
-                                           currentTimeInSeconds);
-                        }
-            }
+                metrics.waitUntilConverges(settings.command.targetUncertainty,
+                        settings.command.minimumUncertaintyMeasurements,
+                        settings.command.maximumUncertaintyMeasurements);
+            } catch (InterruptedException e) { }
+            workManager.stop();
         }
 
-        // if any consumer failed, set the return code to failure.
-        returnCode = SUCCESS;
-        if (producer.isAlive())
+        try
         {
-            producer.interrupt(); // if producer is still alive it means that we had errors in the consumers
-            returnCode = FAILURE;
+            done.await();
+            metrics.stop();
         }
+        catch (InterruptedException e) {}
+
+        if (metrics.wasCancelled())
+            return null;
+
+        metrics.summarise();
+
+        boolean success = true;
         for (Consumer consumer : consumers)
-            if (consumer.getReturnCode() == FAILURE)
-                returnCode = FAILURE;
+            success &= consumer.success;
 
-        if (returnCode == SUCCESS) {            
-            if (client.outputStatistics())
-                stats.printStats();
-            // marking an end of the output to the client
-            output.println("END");            
-        } else {
-            output.println("FAILURE");
-        }
+        if (!success)
+            return null;
 
+        return metrics;
     }
 
-    public int getReturnCode()
-    {
-        return returnCode;
-    }
-
-    /**
-     * Produces exactly N items (awaits each to be consumed)
-     */
-    private class Producer extends Thread
-    {
-        private volatile boolean stop = false;
-
-        public void run()
-        {
-            for (int i = 0; i < client.getNumKeys(); i++)
-            {
-                if (stop)
-                    break;
-
-                try
-                {
-                    operations.put(createOperation(i % client.getNumDifferentKeys()));
-                }
-                catch (InterruptedException e)
-                {
-                    if (e.getMessage() != null)
-                        System.err.println("Producer error - " + e.getMessage());
-                    return;
-                }
-            }
-        }
-
-        public void stopProducer()
-        {
-            stop = true;
-        }
-    }
-
-    /**
-     * Each consumes exactly N items from queue
-     */
     private class Consumer extends Thread
     {
-        private final int items;
-        private final RateLimiter rateLimiter;
-        private volatile boolean stop = false;
-        private volatile int returnCode = StressAction.SUCCESS;
 
-        public Consumer(int toConsume, RateLimiter rateLimiter)
+        private final OpDistribution operations;
+        private final StressMetrics metrics;
+        private final Timer timer;
+        private final RateLimiter rateLimiter;
+        private volatile boolean success = true;
+        private final WorkManager workManager;
+        private final CountDownLatch done;
+
+        public Consumer(OpDistributionFactory operations, CountDownLatch done, WorkManager workManager, StressMetrics metrics, RateLimiter rateLimiter)
         {
-            items = toConsume;
+            this.done = done;
             this.rateLimiter = rateLimiter;
+            this.workManager = workManager;
+            this.metrics = metrics;
+            this.timer = metrics.getTiming().newTimer();
+            this.operations = operations.get(timer);
         }
 
         public void run()
         {
-            if (client.use_native_protocol)
-            {
-                SimpleClient connection = client.getNativeClient();
 
-                for (int i = 0; i < items; i++)
+            try
+            {
+
+                SimpleClient sclient = null;
+                ThriftClient tclient = null;
+                JavaDriverClient jclient = null;
+
+                switch (settings.mode.api)
                 {
-                    if (stop)
+                    case JAVA_DRIVER_NATIVE:
+                        jclient = settings.getJavaDriverClient();
                         break;
+                    case SIMPLE_NATIVE:
+                        sclient = settings.getSimpleNativeClient();
+                        break;
+                    case THRIFT:
+                    case THRIFT_SMART:
+                        tclient = settings.getThriftClient();
+                        break;
+                    default:
+                        throw new IllegalStateException();
+                }
+
+                int maxBatchSize = operations.maxBatchSize();
+                Partition[] partitions = new Partition[maxBatchSize];
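+                // each iteration: draw an operation, take up to its partition count in permits from
+                // the WorkManager (a negative result means no work is left), honour the rate limit,
+                // then build and run a batch from the generated partitions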
+                while (true)
+                {
+
+                    // TODO: Operation should be able to encapsulate much of this behaviour
+                    Operation op = operations.next();
+                    op.generator.reset();
+
+                    int batchSize = workManager.takePermits(Math.max(1, (int) op.partitionCount.next()));
+                    if (batchSize < 0)
+                        break;
+
+                    if (rateLimiter != null)
+                        rateLimiter.acquire(batchSize);
+
+                    int partitionCount = 0;
+                    while (partitionCount < batchSize)
+                    {
+                        Partition p = op.generator.generate(op);
+                        if (p == null)
+                            break;
+                        partitions[partitionCount++] = p;
+                    }
+
+                    if (partitionCount == 0)
+                        break;
+
+                    op.setPartitions(Arrays.asList(partitions).subList(0, partitionCount));
 
                     try
                     {
-                        rateLimiter.acquire();
-                        operations.take().run(connection); // running job
+                        switch (settings.mode.api)
+                        {
+                            case JAVA_DRIVER_NATIVE:
+                                op.run(jclient);
+                                break;
+                            case SIMPLE_NATIVE:
+                                op.run(sclient);
+                                break;
+                            case THRIFT:
+                            case THRIFT_SMART:
+                            default:
+                                op.run(tclient);
+                        }
                     }
                     catch (Exception e)
                     {
                         if (output == null)
                         {
                             System.err.println(e.getMessage());
-                            returnCode = StressAction.FAILURE;
+                            success = false;
                             System.exit(-1);
                         }
 
-                        output.println(e.getMessage());
-                        returnCode = StressAction.FAILURE;
-                        break;
+                        e.printStackTrace(output);
+                        success = false;
+                        workManager.stop();
+                        metrics.cancel();
+                        return;
                     }
                 }
             }
-            else
+            finally
             {
-                CassandraClient connection = client.getClient();
+                done.countDown();
+                timer.close();
+            }
 
-                for (int i = 0; i < items; i++)
-                {
-                    if (stop)
-                        break;
+        }
 
-                    try
-                    {
-                        rateLimiter.acquire();
-                        operations.take().run(connection); // running job
-                    }
-                    catch (Exception e)
-                    {
-                        if (output == null)
-                        {
-                            System.err.println(e.getMessage());
-                            returnCode = StressAction.FAILURE;
-                            System.exit(-1);
-                        }
+    }
 
-                        output.println(e.getMessage());
-                        returnCode = StressAction.FAILURE;
-                        break;
-                    }
-                }
+    private interface WorkManager
+    {
+        // -1 indicates consumer should terminate
+        int takePermits(int count);
+
+        // signal all consumers to terminate
+        void stop();
+    }
+
+    private static final class FixedWorkManager implements WorkManager
+    {
+
+        final AtomicLong permits;
+
+        public FixedWorkManager(long permits)
+        {
+            this.permits = new AtomicLong(permits);
+        }
+
+        @Override
+        public int takePermits(int count)
+        {
+            while (true)
+            {
+                long cur = permits.get();
+                if (cur == 0)
+                    return -1;
+                count = (int) Math.min(count, cur);
+                long next = cur - count;
+                if (permits.compareAndSet(cur, next))
+                    return count;
             }
         }
 
-        public void stopConsume()
+        @Override
+        public void stop()
+        {
+            permits.getAndSet(0);
+        }
+    }
+
+    private static final class ContinuousWorkManager implements WorkManager
+    {
+
+        volatile boolean stop = false;
+
+        @Override
+        public int takePermits(int count)
+        {
+            if (stop)
+                return -1;
+            return count;
+        }
+
+        @Override
+        public void stop()
         {
             stop = true;
         }
 
-        public int getReturnCode()
-        {
-            return returnCode;
-        }
-    }
-
-    private Operation createOperation(int index)
-    {
-        switch (client.getOperation())
-        {
-            case READ:
-                return client.isCQL() ? new CqlReader(client, index) : new Reader(client, index);
-
-            case COUNTER_GET:
-                return client.isCQL() ? new CqlCounterGetter(client, index) : new CounterGetter(client, index);
-
-            case INSERT:
-                return client.isCQL() ? new CqlInserter(client, index) : new Inserter(client, index);
-
-            case COUNTER_ADD:
-                return client.isCQL() ? new CqlCounterAdder(client, index) : new CounterAdder(client, index);
-
-            case RANGE_SLICE:
-                return client.isCQL() ? new CqlRangeSlicer(client, index) : new RangeSlicer(client, index);
-
-            case INDEXED_RANGE_SLICE:
-                return client.isCQL() ? new CqlIndexedRangeSlicer(client, index) : new IndexedRangeSlicer(client, index);
-
-            case MULTI_GET:
-                return client.isCQL() ? new CqlMultiGetter(client, index) : new MultiGetter(client, index);
-        }
-
-        throw new UnsupportedOperationException();
-    }
-
-    public void stopAction()
-    {
-        stop = true;
     }
 }
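The new WorkManager interface above replaces the old Producer/Consumer queue: FixedWorkManager hands out permits through a compare-and-set loop, shrinking the final grant to whatever remains and returning -1 once the fixed operation count is exhausted, while ContinuousWorkManager grants every request until stop() is called. A small, self-contained sketch of that fixed-permit contract (class and method names here are illustrative only, not part of the patch):

    import java.util.concurrent.atomic.AtomicLong;

    public class PermitDemo
    {
        static final AtomicLong permits = new AtomicLong(10);

        // same contract as FixedWorkManager.takePermits: grant at most `count`,
        // shrink the grant to what is left, and return -1 when the pool is empty
        static int takePermits(int count)
        {
            while (true)
            {
                long cur = permits.get();
                if (cur == 0)
                    return -1;
                int take = (int) Math.min(count, cur);
                if (permits.compareAndSet(cur, cur - take))
                    return take;
            }
        }

        public static void main(String[] args)
        {
            int granted;
            do
            {
                granted = takePermits(4);
                System.out.println(granted); // with 10 permits and batches of 4: 4, 4, 2, -1
            }
            while (granted != -1);
        }
    }
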
diff --git a/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java b/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java
new file mode 100644
index 0000000..00f479e
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java
@@ -0,0 +1,250 @@
+package org.apache.cassandra.stress;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.time.DurationFormatUtils;
+
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.stress.settings.SettingsLog;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.JmxCollector;
+import org.apache.cassandra.stress.util.Timing;
+import org.apache.cassandra.stress.util.TimingInterval;
+import org.apache.cassandra.stress.util.Uncertainty;
+
+public class StressMetrics
+{
+
+    private static final ThreadFactory tf = new NamedThreadFactory("StressMetrics");
+
+    private final PrintStream output;
+    private final Thread thread;
+    private volatile boolean stop = false;
+    private volatile boolean cancelled = false;
+    private final Uncertainty rowRateUncertainty = new Uncertainty();
+    private final CountDownLatch stopped = new CountDownLatch(1);
+    private final Timing timing = new Timing();
+    private final Callable<JmxCollector.GcStats> gcStatsCollector;
+    private volatile JmxCollector.GcStats totalGcStats;
+
+    public StressMetrics(PrintStream output, final long logIntervalMillis, StressSettings settings)
+    {
+        this.output = output;
+        Callable<JmxCollector.GcStats> gcStatsCollector;
+        totalGcStats = new JmxCollector.GcStats(0);
+        try
+        {
+            gcStatsCollector = new JmxCollector(settings.node.resolveAllPermitted(settings), settings.port.jmxPort);
+        }
+        catch (Throwable t)
+        {
+            switch (settings.log.level)
+            {
+                case VERBOSE:
+                    t.printStackTrace();
+            }
+            System.err.println("Failed to connect over JMX; not collecting these stats");
+            gcStatsCollector = new Callable<JmxCollector.GcStats>()
+            {
+                public JmxCollector.GcStats call() throws Exception
+                {
+                    return totalGcStats;
+                }
+            };
+        }
+        this.gcStatsCollector = gcStatsCollector;
+
+        printHeader("", output);
+        thread = tf.newThread(new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                timing.start();
+                try {
+
+                    while (!stop)
+                    {
+                        try
+                        {
+                            long sleep = timing.getHistory().endMillis() + logIntervalMillis - System.currentTimeMillis();
+                            if (sleep < logIntervalMillis >>> 3)
+                                // if we had a major hiccup, sleep the full interval
+                                Thread.sleep(logIntervalMillis);
+                            else
+                                Thread.sleep(sleep);
+
+                            update();
+                        } catch (InterruptedException e)
+                        {
+                            break;
+                        }
+                    }
+
+                    update();
+                }
+                catch (InterruptedException e)
+                {}
+                catch (Exception e)
+                {
+                    cancel();
+                    e.printStackTrace(StressMetrics.this.output);
+                }
+                finally
+                {
+                    rowRateUncertainty.wakeAll();
+                    stopped.countDown();
+                }
+            }
+        });
+    }
+
+    public void start()
+    {
+        thread.start();
+    }
+
+    public void waitUntilConverges(double targetUncertainty, int minMeasurements, int maxMeasurements) throws InterruptedException
+    {
+        rowRateUncertainty.await(targetUncertainty, minMeasurements, maxMeasurements);
+    }
+
+    public void cancel()
+    {
+        cancelled = true;
+        stop = true;
+        thread.interrupt();
+        rowRateUncertainty.wakeAll();
+    }
+
+    public void stop() throws InterruptedException
+    {
+        stop = true;
+        thread.interrupt();
+        stopped.await();
+    }
+
+    private void update() throws InterruptedException
+    {
+        Timing.TimingResult<JmxCollector.GcStats> result = timing.snap(gcStatsCollector);
+        totalGcStats = JmxCollector.GcStats.aggregate(Arrays.asList(totalGcStats, result.extra));
+        if (result.timing.partitionCount != 0)
+            printRow("", result.timing, timing.getHistory(), result.extra, rowRateUncertainty, output);
+        rowRateUncertainty.update(result.timing.adjustedRowRate());
+        if (timing.done())
+            stop = true;
+    }
+
+
+    // PRINT FORMATTING
+
+    public static final String HEADFORMAT = "%-10s,%10s,%8s,%8s,%8s,%8s,%8s,%8s,%8s,%8s,%8s,%7s,%9s,%7s,%8s,%8s,%8s,%8s";
+    public static final String ROWFORMAT =  "%-10d,%10.0f,%8.0f,%8.0f,%8.0f,%8.1f,%8.1f,%8.1f,%8.1f,%8.1f,%8.1f,%7.1f,%9.5f,%7.0f,%8.0f,%8.0f,%8.0f,%8.0f";
+
+    private static void printHeader(String prefix, PrintStream output)
+    {
+        output.println(prefix + String.format(HEADFORMAT, "total ops","adj row/s","op/s","pk/s","row/s","mean","med",".95",".99",".999","max","time","stderr", "gc: #", "max ms", "sum ms", "sdv ms", "mb"));
+    }
+
+    private static void printRow(String prefix, TimingInterval interval, TimingInterval total, JmxCollector.GcStats gcStats, Uncertainty opRateUncertainty, PrintStream output)
+    {
+        output.println(prefix + String.format(ROWFORMAT,
+                total.operationCount,
+                interval.adjustedRowRate(),
+                interval.opRate(),
+                interval.partitionRate(),
+                interval.rowRate(),
+                interval.meanLatency(),
+                interval.medianLatency(),
+                interval.rankLatency(0.95f),
+                interval.rankLatency(0.99f),
+                interval.rankLatency(0.999f),
+                interval.maxLatency(),
+                total.runTime() / 1000f,
+                opRateUncertainty.getUncertainty(),
+                gcStats.count,
+                gcStats.maxms,
+                gcStats.summs,
+                gcStats.sdvms,
+                gcStats.bytes / (1 << 20)
+        ));
+    }
+
+    public void summarise()
+    {
+        output.println("\n");
+        output.println("Results:");
+        TimingInterval history = timing.getHistory();
+        output.println(String.format("op rate                   : %.0f", history.opRate()));
+        output.println(String.format("partition rate            : %.0f", history.partitionRate()));
+        output.println(String.format("row rate                  : %.0f", history.rowRate()));
+        output.println(String.format("latency mean              : %.1f", history.meanLatency()));
+        output.println(String.format("latency median            : %.1f", history.medianLatency()));
+        output.println(String.format("latency 95th percentile   : %.1f", history.rankLatency(.95f)));
+        output.println(String.format("latency 99th percentile   : %.1f", history.rankLatency(0.99f)));
+        output.println(String.format("latency 99.9th percentile : %.1f", history.rankLatency(0.999f)));
+        output.println(String.format("latency max               : %.1f", history.maxLatency()));
+        output.println(String.format("total gc count            : %.0f", totalGcStats.count));
+        output.println(String.format("total gc mb               : %.0f", totalGcStats.bytes / (1 << 20)));
+        output.println(String.format("total gc time (s)         : %.0f", totalGcStats.summs / 1000));
+        output.println(String.format("avg gc time(ms)           : %.0f", totalGcStats.summs / totalGcStats.count));
+        output.println(String.format("stdev gc time(ms)         : %.0f", totalGcStats.sdvms));
+        output.println("Total operation time      : " + DurationFormatUtils.formatDuration(
+                history.runTime(), "HH:mm:ss", true));
+    }
+
+    public static void summarise(List<String> ids, List<StressMetrics> summarise, PrintStream out)
+    {
+        int idLen = 0;
+        for (String id : ids)
+            idLen = Math.max(id.length(), idLen);
+        String formatstr = "%" + idLen + "s, ";
+        printHeader(String.format(formatstr, "id"), out);
+        for (int i = 0 ; i < ids.size() ; i++)
+            printRow(String.format(formatstr, ids.get(i)),
+                    summarise.get(i).timing.getHistory(),
+                    summarise.get(i).timing.getHistory(),
+                    summarise.get(i).totalGcStats,
+                    summarise.get(i).rowRateUncertainty,
+                    out
+            );
+    }
+
+    public Timing getTiming()
+    {
+        return timing;
+    }
+
+    public boolean wasCancelled()
+    {
+        return cancelled;
+    }
+
+}
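When neither an operation count nor a duration is given, StressAction blocks in waitUntilConverges until the Uncertainty tracker (fed one adjusted row-rate sample per logging interval by update() above; its implementation is not shown in this hunk) reports that the run has settled. One plausible reading of the "stderr of mean < target" condition printed by StressAction, sketched on plain samples and purely for illustration:

    // illustrative only: relative standard error of the mean over the collected rate samples
    static boolean converged(double[] samples, double target)
    {
        int n = samples.length;
        if (n < 2)
            return false;

        double mean = 0;
        for (double s : samples)
            mean += s;
        mean /= n;

        double variance = 0;
        for (double s : samples)
            variance += (s - mean) * (s - mean);
        variance /= n - 1;                        // unbiased sample variance

        double stderr = Math.sqrt(variance / n);  // standard error of the mean
        return stderr / mean < target;            // e.g. target 0.02 means settled to within ~2%
    }
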
diff --git a/tools/stress/src/org/apache/cassandra/stress/StressProfile.java b/tools/stress/src/org/apache/cassandra/stress/StressProfile.java
new file mode 100644
index 0000000..76642be
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/StressProfile.java
@@ -0,0 +1,567 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress;
+
+
+import com.datastax.driver.core.*;
+import com.datastax.driver.core.exceptions.AlreadyExistsException;
+
+import com.google.common.base.Function;
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.statements.CreateKeyspaceStatement;
+import org.apache.cassandra.exceptions.RequestValidationException;
+
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.RatioDistributionFactory;
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.generate.values.Booleans;
+import org.apache.cassandra.stress.generate.values.Bytes;
+import org.apache.cassandra.stress.generate.values.Generator;
+import org.apache.cassandra.stress.generate.values.Dates;
+import org.apache.cassandra.stress.generate.values.Doubles;
+import org.apache.cassandra.stress.generate.values.Floats;
+import org.apache.cassandra.stress.generate.values.GeneratorConfig;
+import org.apache.cassandra.stress.generate.values.Inets;
+import org.apache.cassandra.stress.generate.values.Integers;
+import org.apache.cassandra.stress.generate.values.Lists;
+import org.apache.cassandra.stress.generate.values.Longs;
+import org.apache.cassandra.stress.generate.values.Sets;
+import org.apache.cassandra.stress.generate.values.Strings;
+import org.apache.cassandra.stress.generate.values.TimeUUIDs;
+import org.apache.cassandra.stress.generate.values.UUIDs;
+import org.apache.cassandra.stress.operations.userdefined.SchemaInsert;
+import org.apache.cassandra.stress.operations.userdefined.SchemaQuery;
+import org.apache.cassandra.stress.settings.OptionDistribution;
+import org.apache.cassandra.stress.settings.OptionRatioDistribution;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.settings.ValidationType;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.Compression;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.thrift.TException;
+import org.yaml.snakeyaml.Yaml;
+import org.yaml.snakeyaml.constructor.Constructor;
+import org.yaml.snakeyaml.error.YAMLException;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+public class StressProfile implements Serializable
+{
+    private String keyspaceCql;
+    private String tableCql;
+    private String seedStr;
+
+    public String keyspaceName;
+    public String tableName;
+    private Map<String, GeneratorConfig> columnConfigs;
+    private Map<String, StressYaml.QueryDef> queries;
+    private Map<String, String> insert;
+
+    transient volatile TableMetadata tableMetaData;
+
+    transient volatile GeneratorFactory generatorFactory;
+
+    transient volatile BatchStatement.Type batchType;
+    transient volatile DistributionFactory partitions;
+    transient volatile RatioDistributionFactory selectchance;
+    transient volatile PreparedStatement insertStatement;
+    transient volatile Integer thriftInsertId;
+
+    transient volatile Map<String, SchemaQuery.ArgSelect> argSelects;
+    transient volatile Map<String, PreparedStatement> queryStatements;
+    transient volatile Map<String, Integer> thriftQueryIds;
+
+    private void init(StressYaml yaml) throws RequestValidationException
+    {
+        keyspaceName = yaml.keyspace;
+        keyspaceCql = yaml.keyspace_definition;
+        tableName = yaml.table;
+        tableCql = yaml.table_definition;
+        seedStr = "seed for stress";
+        queries = yaml.queries;
+        insert = yaml.insert;
+
+        assert keyspaceName != null : "keyspace name is required in yaml file";
+        assert tableName != null : "table name is required in yaml file";
+        assert queries != null : "queries map is required in yaml file";
+
+        if (keyspaceCql != null && keyspaceCql.length() > 0)
+        {
+            try
+            {
+                String name = ((CreateKeyspaceStatement) QueryProcessor.parseStatement(keyspaceCql)).keyspace();
+                assert name.equalsIgnoreCase(keyspaceName) : "Name in keyspace_definition doesn't match keyspace property: '" + name + "' != '" + keyspaceName + "'";
+            }
+            catch (SyntaxException e)
+            {
+                throw new IllegalArgumentException("There was a problem parsing the keyspace cql: " + e.getMessage());
+            }
+        }
+        else
+        {
+            keyspaceCql = null;
+        }
+
+        if (tableCql != null && tableCql.length() > 0)
+        {
+            try
+            {
+                String name = CFMetaData.compile(tableCql, keyspaceName).cfName;
+                assert name.equalsIgnoreCase(tableName) : "Name in table_definition doesn't match table property: '" + name + "' != '" + tableName + "'";
+            }
+            catch (RuntimeException e)
+            {
+                throw new IllegalArgumentException("There was a problem parsing the table cql: " + e.getCause().getMessage());
+            }
+        }
+        else
+        {
+            tableCql = null;
+        }
+
+        columnConfigs = new HashMap<>();
+
+        if (yaml.columnspec != null)
+        {
+            for (Map<String, Object> spec : yaml.columnspec)
+            {
+                lowerCase(spec);
+                String name = (String) spec.remove("name");
+                DistributionFactory population = !spec.containsKey("population") ? null : OptionDistribution.get((String) spec.remove("population"));
+                DistributionFactory size = !spec.containsKey("size") ? null : OptionDistribution.get((String) spec.remove("size"));
+                DistributionFactory clustering = !spec.containsKey("cluster") ? null : OptionDistribution.get((String) spec.remove("cluster"));
+
+                if (!spec.isEmpty())
+                    throw new IllegalArgumentException("Unrecognised option(s) in column spec: " + spec);
+                if (name == null)
+                    throw new IllegalArgumentException("Missing name argument in column spec");
+
+                GeneratorConfig config = new GeneratorConfig(seedStr + name, clustering, size, population);
+                columnConfigs.put(name, config);
+            }
+        }
+    }
+
+    public void maybeCreateSchema(StressSettings settings)
+    {
+        JavaDriverClient client = settings.getJavaDriverClient(false);
+
+        if (keyspaceCql != null)
+        {
+            try
+            {
+                client.execute(keyspaceCql, org.apache.cassandra.db.ConsistencyLevel.ONE);
+            }
+            catch (AlreadyExistsException e)
+            {
+            }
+        }
+
+        client.execute("use "+keyspaceName, org.apache.cassandra.db.ConsistencyLevel.ONE);
+
+        if (tableCql != null)
+        {
+            try
+            {
+                client.execute(tableCql, org.apache.cassandra.db.ConsistencyLevel.ONE);
+            }
+            catch (AlreadyExistsException e)
+            {
+            }
+
+            System.out.println(String.format("Created schema. Sleeping %ss for propagation.", settings.node.nodes.size()));
+            Uninterruptibles.sleepUninterruptibly(settings.node.nodes.size(), TimeUnit.SECONDS);
+        }
+
+        maybeLoadSchemaInfo(settings);
+    }
+
+
+    private void maybeLoadSchemaInfo(StressSettings settings)
+    {
+        if (tableMetaData == null)
+        {
+            JavaDriverClient client = settings.getJavaDriverClient();
+
+            synchronized (client)
+            {
+
+                if (tableMetaData != null)
+                    return;
+
+                TableMetadata metadata = client.getCluster()
+                                               .getMetadata()
+                                               .getKeyspace(keyspaceName)
+                                               .getTable(tableName);
+
+                if (metadata == null)
+                    throw new RuntimeException("Unable to find table " + keyspaceName + "." + tableName);
+
+                //Fill in missing column configs
+                for (ColumnMetadata col : metadata.getColumns())
+                {
+                    if (columnConfigs.containsKey(col.getName()))
+                        continue;
+
+                    columnConfigs.put(col.getName(), new GeneratorConfig(seedStr + col.getName(), null, null, null));
+                }
+
+                tableMetaData = metadata;
+            }
+        }
+    }
+
+    public SchemaQuery getQuery(String name, Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        if (queryStatements == null)
+        {
+            synchronized (this)
+            {
+                if (queryStatements == null)
+                {
+                    try
+                    {
+                        JavaDriverClient jclient = settings.getJavaDriverClient();
+                        ThriftClient tclient = settings.getThriftClient();
+                        Map<String, PreparedStatement> stmts = new HashMap<>();
+                        Map<String, Integer> tids = new HashMap<>();
+                        Map<String, SchemaQuery.ArgSelect> args = new HashMap<>();
+                        for (Map.Entry<String, StressYaml.QueryDef> e : queries.entrySet())
+                        {
+                            stmts.put(e.getKey().toLowerCase(), jclient.prepare(e.getValue().cql));
+                            tids.put(e.getKey().toLowerCase(), tclient.prepare_cql3_query(e.getValue().cql, Compression.NONE));
+                            args.put(e.getKey().toLowerCase(), e.getValue().fields == null
+                                                                     ? SchemaQuery.ArgSelect.MULTIROW
+                                                                     : SchemaQuery.ArgSelect.valueOf(e.getValue().fields.toUpperCase()));
+                        }
+                        thriftQueryIds = tids;
+                        queryStatements = stmts;
+                        argSelects = args;
+                    }
+                    catch (TException e)
+                    {
+                        throw new RuntimeException(e);
+                    }
+                }
+            }
+        }
+
+        // TODO validation
+        name = name.toLowerCase();
+        if (!queryStatements.containsKey(name))
+            throw new IllegalArgumentException("No query defined with name " + name);
+        return new SchemaQuery(timer, generator, settings, thriftQueryIds.get(name), queryStatements.get(name), ThriftConversion.fromThrift(settings.command.consistencyLevel), ValidationType.NOT_FAIL, argSelects.get(name));
+    }
+
+    public SchemaInsert getInsert(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        if (insertStatement == null)
+        {
+            synchronized (this)
+            {
+                if (insertStatement == null)
+                {
+                    maybeLoadSchemaInfo(settings);
+
+                    Set<ColumnMetadata> keyColumns = com.google.common.collect.Sets.newHashSet(tableMetaData.getPrimaryKey());
+
+                    //Non PK Columns
+                    StringBuilder sb = new StringBuilder();
+
+                    sb.append("UPDATE \"").append(tableName).append("\" SET ");
+
+                    //PK Columns
+                    StringBuilder pred = new StringBuilder();
+                    pred.append(" WHERE ");
+
+                    boolean firstCol = true;
+                    boolean firstPred = true;
+                    for (ColumnMetadata c : tableMetaData.getColumns())
+                    {
+
+                        if (keyColumns.contains(c))
+                        {
+                            if (firstPred)
+                                firstPred = false;
+                            else
+                                pred.append(" AND ");
+
+                            pred.append(c.getName()).append(" = ?");
+                        }
+                        else
+                        {
+                            if (firstCol)
+                                firstCol = false;
+                            else
+                                sb.append(",");
+
+                            sb.append(c.getName()).append(" = ");
+
+                            switch (c.getType().getName())
+                            {
+                                case SET:
+                                case LIST:
+                                case COUNTER:
+                                    sb.append(c.getName()).append(" + ?");
+                                    break;
+                                default:
+                                    sb.append("?");
+                                    break;
+                            }
+                        }
+                    }
+
+                    //Put PK predicates at the end
+                    sb.append(pred);
+
+                    if (insert == null)
+                        insert = new HashMap<>();
+                    lowerCase(insert);
+
+                    partitions = select(settings.insert.batchsize, "partitions", "fixed(1)", insert, OptionDistribution.BUILDER);
+                    selectchance = select(settings.insert.selectRatio, "select", "fixed(1)/1", insert, OptionRatioDistribution.BUILDER);
+                    batchType = settings.insert.batchType != null
+                                ? settings.insert.batchType
+                                : !insert.containsKey("batchtype")
+                                  ? BatchStatement.Type.LOGGED
+                                  : BatchStatement.Type.valueOf(insert.remove("batchtype"));
+                    if (!insert.isEmpty())
+                        throw new IllegalArgumentException("Unrecognised insert option(s): " + insert);
+
+                    Distribution visits = settings.insert.visits.get();
+                    // these min/max are not absolutely accurate if selectchance < 1, but they're close enough to
+                    // guarantee the vast majority of actions occur in these bounds
+                    double minBatchSize = selectchance.get().min() * partitions.get().minValue() * generator.minRowCount * (1d / visits.maxValue());
+                    double maxBatchSize = selectchance.get().max() * partitions.get().maxValue() * generator.maxRowCount * (1d / visits.minValue());
+                    System.out.printf("Generating batches with [%d..%d] partitions and [%.0f..%.0f] rows (of [%.0f..%.0f] total rows in the partitions)\n",
+                                      partitions.get().minValue(), partitions.get().maxValue(),
+                                      minBatchSize, maxBatchSize,
+                                      partitions.get().minValue() * generator.minRowCount,
+                                      partitions.get().maxValue() * generator.maxRowCount);
+                    if (generator.maxRowCount > 100 * 1000 * 1000)
+                        System.err.printf("WARNING: You have defined a schema that permits very large partitions (%.0f max rows (>100M))\n", generator.maxRowCount);
+                    if (batchType == BatchStatement.Type.LOGGED && maxBatchSize > 65535)
+                    {
+                        System.err.printf("ERROR: You have defined a workload that generates batches with more than 65k rows (%.0f), but have required the use of LOGGED batches. There is a 65k row limit on a single batch.\n",
+                                          selectchance.get().max() * partitions.get().maxValue() * generator.maxRowCount);
+                        System.exit(1);
+                    }
+                    if (maxBatchSize > 100000)
+                        System.err.printf("WARNING: You have defined a schema that permits very large batches (%.0f max rows (>100K)). This may OOM this stress client, or the server.\n",
+                                          selectchance.get().max() * partitions.get().maxValue() * generator.maxRowCount);
+
+                    JavaDriverClient client = settings.getJavaDriverClient();
+                    String query = sb.toString();
+                    try
+                    {
+                        thriftInsertId = settings.getThriftClient().prepare_cql3_query(query, Compression.NONE);
+                    }
+                    catch (TException e)
+                    {
+                        throw new RuntimeException(e);
+                    }
+                    insertStatement = client.prepare(query);
+                }
+            }
+        }
+
+        return new SchemaInsert(timer, generator, settings, partitions.get(), selectchance.get(), thriftInsertId, insertStatement, ThriftConversion.fromThrift(settings.command.consistencyLevel), batchType);
+    }
+
+    private static <E> E select(E first, String key, String defValue, Map<String, String> map, Function<String, E> builder)
+    {
+        String val = map.remove(key);
+
+        if (first != null)
+            return first;
+        if (val != null && val.trim().length() > 0)
+            return builder.apply(val);
+        return builder.apply(defValue);
+    }
+
+    public PartitionGenerator newGenerator(StressSettings settings, SeedManager seeds)
+    {
+        if (generatorFactory == null)
+        {
+            synchronized (this)
+            {
+                maybeLoadSchemaInfo(settings);
+                if (generatorFactory == null)
+                    generatorFactory = new GeneratorFactory();
+            }
+        }
+
+        return generatorFactory.newGenerator(settings, seeds);
+    }
+
+    private class GeneratorFactory
+    {
+        final List<ColumnInfo> partitionKeys = new ArrayList<>();
+        final List<ColumnInfo> clusteringColumns = new ArrayList<>();
+        final List<ColumnInfo> valueColumns = new ArrayList<>();
+
+        private GeneratorFactory()
+        {
+            Set<ColumnMetadata> keyColumns = com.google.common.collect.Sets.newHashSet(tableMetaData.getPrimaryKey());
+
+            for (ColumnMetadata metadata : tableMetaData.getPartitionKey())
+                partitionKeys.add(new ColumnInfo(metadata.getName(), metadata.getType(), columnConfigs.get(metadata.getName())));
+            for (ColumnMetadata metadata : tableMetaData.getClusteringColumns())
+                clusteringColumns.add(new ColumnInfo(metadata.getName(), metadata.getType(), columnConfigs.get(metadata.getName())));
+            for (ColumnMetadata metadata : tableMetaData.getColumns())
+                if (!keyColumns.contains(metadata))
+                    valueColumns.add(new ColumnInfo(metadata.getName(), metadata.getType(), columnConfigs.get(metadata.getName())));
+        }
+
+        PartitionGenerator newGenerator(StressSettings settings, SeedManager seeds)
+        {
+            return new PartitionGenerator(get(partitionKeys), get(clusteringColumns), get(valueColumns), settings.generate.order, seeds);
+        }
+
+        List<Generator> get(List<ColumnInfo> columnInfos)
+        {
+            List<Generator> result = new ArrayList<>();
+            for (ColumnInfo columnInfo : columnInfos)
+                result.add(columnInfo.getGenerator());
+            return result;
+        }
+    }
+
+    static class ColumnInfo
+    {
+        final String name;
+        final DataType type;
+        final GeneratorConfig config;
+
+        ColumnInfo(String name, DataType type, GeneratorConfig config)
+        {
+            this.name = name;
+            this.type = type;
+            this.config = config;
+        }
+
+        Generator getGenerator()
+        {
+            return getGenerator(name, type, config);
+        }
+
+        static Generator getGenerator(final String name, final DataType type, GeneratorConfig config)
+        {
+            switch (type.getName())
+            {
+                case ASCII:
+                case TEXT:
+                case VARCHAR:
+                    return new Strings(name, config);
+                case BIGINT:
+                case COUNTER:
+                    return new Longs(name, config);
+                case BLOB:
+                    return new Bytes(name, config);
+                case BOOLEAN:
+                    return new Booleans(name, config);
+                case DECIMAL:
+                case DOUBLE:
+                    return new Doubles(name, config);
+                case FLOAT:
+                    return new Floats(name, config);
+                case INET:
+                    return new Inets(name, config);
+                case INT:
+                case VARINT:
+                    return new Integers(name, config);
+                case TIMESTAMP:
+                    return new Dates(name, config);
+                case UUID:
+                    return new UUIDs(name, config);
+                case TIMEUUID:
+                    return new TimeUUIDs(name, config);
+                case SET:
+                    return new Sets(name, getGenerator(name, type.getTypeArguments().get(0), config), config);
+                case LIST:
+                    return new Lists(name, getGenerator(name, type.getTypeArguments().get(0), config), config);
+                default:
+                    throw new UnsupportedOperationException();
+            }
+        }
+    }
+
+    public static StressProfile load(URI file) throws IOError
+    {
+        try
+        {
+            Constructor constructor = new Constructor(StressYaml.class);
+
+            Yaml yaml = new Yaml(constructor);
+
+            InputStream yamlStream = file.toURL().openStream();
+
+            if (yamlStream.available() == 0)
+                throw new IOException("Unable to load yaml file from: "+file);
+
+            StressYaml profileYaml = yaml.loadAs(yamlStream, StressYaml.class);
+
+            StressProfile profile = new StressProfile();
+            profile.init(profileYaml);
+
+            return profile;
+        }
+        catch (YAMLException | IOException | RequestValidationException e)
+        {
+            throw new IOError(e);
+        }
+    }
+
+    static <V> void lowerCase(Map<String, V> map)
+    {
+        List<Map.Entry<String, V>> reinsert = new ArrayList<>();
+        Iterator<Map.Entry<String, V>> iter = map.entrySet().iterator();
+        while (iter.hasNext())
+        {
+            Map.Entry<String, V> e = iter.next();
+            if (!e.getKey().equalsIgnoreCase(e.getKey()))
+            {
+                reinsert.add(e);
+                iter.remove();
+            }
+        }
+        for (Map.Entry<String, V> e : reinsert)
+            map.put(e.getKey().toLowerCase(), e.getValue());
+    }
+}
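Two small helpers above carry most of the option plumbing: lowerCase() normalises the keys of the yaml maps in place, and select() resolves each insert option with a fixed precedence (an explicit setting wins, then the yaml value, then the default) while always consuming the key, so anything left in the map can be rejected as unrecognised. A simplified sketch of that precedence on plain strings (the builder Function that turns the string into a distribution factory is omitted, and the keys used are illustrative):

    import java.util.HashMap;
    import java.util.Map;

    public class SelectDemo
    {
        static String select(String explicit, String key, String defValue, Map<String, String> map)
        {
            String val = map.remove(key); // always consumed, even when unused
            if (explicit != null)
                return explicit;
            if (val != null && val.trim().length() > 0)
                return val;
            return defValue;
        }

        public static void main(String[] args)
        {
            Map<String, String> insert = new HashMap<>();
            insert.put("partitions", "fixed(10)");

            System.out.println(select(null, "partitions", "fixed(1)", insert));       // fixed(10): yaml value
            System.out.println(select("fixed(5)", "partitions", "fixed(1)", insert)); // fixed(5): explicit wins
            System.out.println(select(null, "select", "fixed(1)/1", insert));         // fixed(1)/1: default
            System.out.println(insert);                                               // {}: nothing unrecognised
        }
    }
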
diff --git a/tools/stress/src/org/apache/cassandra/stress/StressServer.java b/tools/stress/src/org/apache/cassandra/stress/StressServer.java
index 6600dfd..3c9e2a6 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressServer.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressServer.java
@@ -1,27 +1,30 @@
 /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
 package org.apache.cassandra.stress;
 
 import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.PrintStream;
 import java.net.InetAddress;
 import java.net.ServerSocket;
+import java.net.Socket;
 
-import org.apache.cassandra.stress.server.StressThread;
+import org.apache.cassandra.stress.settings.StressSettings;
 import org.apache.commons.cli.*;
 
 public class StressServer
@@ -68,4 +71,57 @@
         for (;;)
             new StressThread(serverSocket.accept()).start();
     }
+
+    public static class StressThread extends Thread
+    {
+        private final Socket socket;
+
+        public StressThread(Socket client)
+        {
+            this.socket = client;
+        }
+
+        public void run()
+        {
+            try
+            {
+                ObjectInputStream in = new ObjectInputStream(socket.getInputStream());
+                PrintStream out = new PrintStream(socket.getOutputStream());
+
+                StressAction action = new StressAction((StressSettings) in.readObject(), out);
+                Thread actionThread = new Thread(action);
+                actionThread.start();
+
+                while (actionThread.isAlive())
+                {
+                    try
+                    {
+                        if (in.readInt() == 1)
+                        {
+                            actionThread.interrupt();
+                            break;
+                        }
+                    }
+                    catch (Exception e)
+                    {
+                        // continue without problem
+                    }
+                }
+
+                out.close();
+                in.close();
+                socket.close();
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e.getMessage(), e);
+            }
+            catch (Exception e)
+            {
+                e.printStackTrace();
+            }
+        }
+
+    }
+
 }
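The StressThread folded into StressServer above defines a simple wire protocol: the client sends one serialised StressSettings object, the server runs a StressAction for it on a separate thread, and any subsequent int of 1 on the stream interrupts that run. A sketch of the client side implied by that loop (inferred from the server code only; the real stress client wiring is not shown in this diff, and the host/port are placeholders):

    import java.io.ObjectOutputStream;
    import java.net.Socket;

    import org.apache.cassandra.stress.settings.StressSettings;

    public class StressdClientSketch
    {
        // send the settings, then optionally write 1 to ask the server to interrupt the run
        public static void run(String host, int port, StressSettings settings, boolean abort) throws Exception
        {
            try (Socket socket = new Socket(host, port))
            {
                ObjectOutputStream out = new ObjectOutputStream(socket.getOutputStream());
                out.writeObject(settings);
                out.flush();

                if (abort)
                {
                    out.writeInt(1); // the server-side loop treats 1 as an interrupt request
                    out.flush();
                }
            }
        }
    }
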
diff --git a/tools/stress/src/org/apache/cassandra/stress/StressStatistics.java b/tools/stress/src/org/apache/cassandra/stress/StressStatistics.java
deleted file mode 100644
index e735f97..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/StressStatistics.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.stress;
-
-import com.yammer.metrics.stats.Snapshot;
-import java.io.PrintStream;
-import org.apache.commons.lang3.time.DurationFormatUtils;
-
-/**
- * Gathers and aggregates statistics for an operation
- */
-public class StressStatistics
-{
-    
-    private Session client;
-    private PrintStream output;
-
-    private long durationInSeconds;
-    /** The sum of the interval_op_rate values collected by tallyAverages */
-    private int tallyOpRateSum;
-    /** The number of interval_op_rate values collected by tallyAverages */
-    private int tallyOpRateCount;
-    /** The sum of the interval_key_rate values collected by tallyAverages */
-    private int tallyKeyRateSum;
-    /** The number of interval_key_rate values collected by tallyAverages */
-    private int tallyKeyRateCount;
-
-    /** The sum of the latency values collected by tallyAverages */
-    private double tallyLatencySum;
-    /** The number of latency values collected by tallyAverages */
-    private int tallyLatencyCount;
-    /** The sum of the 95%tile latency values collected by tallyAverages */
-    private double tally95thLatencySum;
-    /** The number of 95%tile latency values collected by tallyAverages */
-    private int tally95thLatencyCount;
-    /** The sum of the 99.9%tile latency values collected by tallyAverages */
-    private double tally999thLatencySum;
-    /** The number of 99.9%tile latency values collected by tallyAverages */
-    private int tally999thLatencyCount;
-    
-
-    public StressStatistics(Session client, PrintStream out)
-    {
-        this.client = client;
-        this.output = out;
-
-        tallyOpRateSum = 0;
-        tallyOpRateCount = 0;
-    }
-
-    /**
-     * Collect statistics per-interval
-     */
-    public void addIntervalStats(int totalOperations, int intervalOpRate, 
-                                 int intervalKeyRate, Snapshot latency, 
-                                 long currentTimeInSeconds)
-    {
-        this.tallyAverages(totalOperations, intervalKeyRate, intervalKeyRate, 
-                                latency, currentTimeInSeconds);
-    }
-
-    /**
-     * Collect interval_op_rate and interval_key_rate averages
-     */
-    private void tallyAverages(int totalOperations, int intervalOpRate, 
-                                 int intervalKeyRate, Snapshot latency, 
-                                 long currentTimeInSeconds)
-    {
-        //Skip the first and last 10% of values.
-        //The middle values of the operation are the ones worthwhile
-        //to collect and average:
-        if (totalOperations > (0.10 * client.getNumKeys()) &&
-            totalOperations < (0.90 * client.getNumKeys())) {
-                tallyOpRateSum += intervalOpRate;
-                tallyOpRateCount += 1;
-                tallyKeyRateSum += intervalKeyRate;
-                tallyKeyRateCount += 1;
-                tallyLatencySum += latency.getMedian();
-                tallyLatencyCount += 1;
-                tally95thLatencySum += latency.get95thPercentile();
-                tally95thLatencyCount += 1;
-                tally999thLatencySum += latency.get999thPercentile();
-                tally999thLatencyCount += 1;
-            }
-        durationInSeconds = currentTimeInSeconds;
-    }
-
-    public void printStats()
-    {
-        output.println("\n");
-        if (tallyOpRateCount > 0) {
-            output.println("Averages from the middle 80% of values:");
-            output.println(String.format("interval_op_rate          : %d", 
-                                         (tallyOpRateSum / tallyOpRateCount)));
-            output.println(String.format("interval_key_rate         : %d", 
-                                         (tallyKeyRateSum / tallyKeyRateCount)));
-            output.println(String.format("latency median            : %.1f", 
-                                         (tallyLatencySum / tallyLatencyCount)));
-            output.println(String.format("latency 95th percentile   : %.1f",
-                                         (tally95thLatencySum / tally95thLatencyCount)));
-            output.println(String.format("latency 99.9th percentile : %.1f", 
-                                         (tally999thLatencySum / tally999thLatencyCount)));
-        }
-        output.println("Total operation time      : " + DurationFormatUtils.formatDuration(
-            durationInSeconds*1000, "HH:mm:ss", true));
-    }
-
-}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/StressYaml.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/StressYaml.java
index e42574b..b6efc5e 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressYaml.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,37 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress;
 
+import java.util.List;
+import java.util.Map;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class StressYaml
 {
-    private final ByteBuffer buffer;
+    public String keyspace;
+    public String keyspace_definition;
+    public String table;
+    public String table_definition;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public List<Map<String, Object>> columnspec;
+    public Map<String, QueryDef> queries;
+    public Map<String, String> insert;
+
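+    // a hypothetical sketch of yaml that would bind to these fields (illustrative only, not taken
+    // from this patch):
+    //   keyspace: stresscql
+    //   table: testtable
+    //   columnspec:
+    //     - name: key
+    //   queries:
+    //     simple1:
+    //       cql: select * from testtable where key = ?
+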
+    public static class QueryDef
     {
-        this.buffer = buffer;
+        public String cql;
+        public String fields;
     }
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/Distribution.java b/tools/stress/src/org/apache/cassandra/stress/generate/Distribution.java
new file mode 100644
index 0000000..4662454
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/Distribution.java
@@ -0,0 +1,57 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+
+public abstract class Distribution implements Serializable
+{
+
+    public abstract long next();
+    public abstract double nextDouble();
+    public abstract long inverseCumProb(double cumProb);
+    public abstract void setSeed(long seed);
+
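+    // a minimal, hypothetical usage sketch (illustrative only): a concrete subclass supplies
+    // next()/nextDouble()/inverseCumProb(), and callers typically reseed per work item, e.g.
+    //     Distribution dist = factory.get();
+    //     dist.setSeed(seed);
+    //     long value = dist.next();
+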
+    public long maxValue()
+    {
+        return inverseCumProb(1d);
+    }
+
+    public long minValue()
+    {
+        return inverseCumProb(0d);
+    }
+
+    // approximation of the average; slightly costly to calculate, so should not be invoked frequently
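+    // (samples the inverse CDF at 2% quantile increments and averages the samples)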
+    public long average()
+    {
+        double sum = 0;
+        int count = 0;
+        for (float d = 0 ; d <= 1.0d ; d += 0.02d)
+        {
+            sum += inverseCumProb(d);
+            count += 1;
+        }
+        return (long) (sum / count);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/DistributionBoundApache.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionBoundApache.java
new file mode 100644
index 0000000..23ce3e9
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionBoundApache.java
@@ -0,0 +1,84 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.commons.math3.distribution.AbstractRealDistribution;
+
+public class DistributionBoundApache extends Distribution
+{
+
+    final AbstractRealDistribution delegate;
+    final long min, max;
+
+    public DistributionBoundApache(AbstractRealDistribution delegate, long min, long max)
+    {
+        this.delegate = delegate;
+        this.min = min;
+        this.max = max;
+    }
+
+    @Override
+    public long next()
+    {
+        return bound(min, max, delegate.sample());
+    }
+
+    public double nextDouble()
+    {
+        return boundDouble(min, max, delegate.sample());
+    }
+
+    @Override
+    public long inverseCumProb(double cumProb)
+    {
+        return bound(min, max, delegate.inverseCumulativeProbability(cumProb));
+    }
+
+    public void setSeed(long seed)
+    {
+        delegate.reseedRandomGenerator(seed);
+    }
+
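+    // clamp a sample from the unbounded delegate distribution into [min, max]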
+    private static long bound(long min, long max, double val)
+    {
+        long r = (long) val;
+        if ((r >= min) & (r <= max))
+            return r;
+        if (r < min)
+            return min;
+        if (r > max)
+            return max;
+        throw new IllegalStateException();
+    }
+
+    private static double boundDouble(long min, long max, double r)
+    {
+        if ((r >= min) & (r <= max))
+            return r;
+        if (r < min)
+            return min;
+        if (r > max)
+            return max;
+        throw new IllegalStateException();
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionFactory.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/DistributionFactory.java
index e42574b..d0dfa89 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionFactory.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.generate;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,11 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import java.io.Serializable;
 
-public class ByteBufferOutputStream extends OutputStream
+public interface DistributionFactory extends Serializable
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    Distribution get();
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionFixed.java
similarity index 67%
rename from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
rename to tools/stress/src/org/apache/cassandra/stress/generate/DistributionFixed.java
index e42574b..bbfb894 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionFixed.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.generate;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,34 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class DistributionFixed extends Distribution
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    final long key;
 
-    public void write(int b)
+    public DistributionFixed(long key)
     {
-        buffer.put((byte) b);
+        this.key = key;
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public long next()
     {
-        buffer.put(b, off, len);
+        return key;
+    }
+
+    public double nextDouble()
+    {
+        return key;
+    }
+
+    @Override
+    public long inverseCumProb(double cumProb)
+    {
+        return key;
+    }
+
+    public void setSeed(long seed)
+    {
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/DistributionInverted.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionInverted.java
new file mode 100644
index 0000000..4062b58
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionInverted.java
@@ -0,0 +1,65 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
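+// mirrors a wrapped distribution across its [min, max] range, so values the wrapped distribution
+// makes likely near min become likely near max, and vice versa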
+public class DistributionInverted extends Distribution
+{
+
+    final Distribution wrapped;
+    final long min;
+    final long max;
+
+    public DistributionInverted(Distribution wrapped)
+    {
+        this.wrapped = wrapped;
+        this.min = wrapped.minValue();
+        this.max = wrapped.maxValue();
+    }
+
+    public long next()
+    {
+        return max - (wrapped.next() - min);
+    }
+
+    public double nextDouble()
+    {
+        return max - (wrapped.nextDouble() - min);
+    }
+
+    public long inverseCumProb(double cumProb)
+    {
+        return max - (wrapped.inverseCumProb(cumProb) - min);
+    }
+
+    public void setSeed(long seed)
+    {
+        wrapped.setSeed(seed);
+    }
+
+    public static Distribution invert(Distribution distribution)
+    {
+        if (distribution instanceof DistributionInverted)
+            return ((DistributionInverted) distribution).wrapped;
+        return new DistributionInverted(distribution);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/DistributionOffsetApache.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionOffsetApache.java
new file mode 100644
index 0000000..b0e41eb
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionOffsetApache.java
@@ -0,0 +1,80 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.commons.math3.distribution.AbstractRealDistribution;
+
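+// interprets samples from an unbounded real distribution as offsets from min, clamped to the
+// width (max - min), so results always fall in [min, max]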
+public class DistributionOffsetApache extends Distribution
+{
+
+    final AbstractRealDistribution delegate;
+    final long min, delta;
+
+    public DistributionOffsetApache(AbstractRealDistribution delegate, long min, long max)
+    {
+        this.delegate = delegate;
+        this.min = min;
+        this.delta = max - min;
+    }
+
+    public void setSeed(long seed)
+    {
+        delegate.reseedRandomGenerator(seed);
+    }
+
+    @Override
+    public long next()
+    {
+        return offset(min, delta, delegate.sample());
+    }
+
+    public double nextDouble()
+    {
+        return offsetDouble(min, delta, delegate.sample());
+    }
+
+    @Override
+    public long inverseCumProb(double cumProb)
+    {
+        return offset(min, delta, delegate.inverseCumulativeProbability(cumProb));
+    }
+
+    private long offset(long min, long delta, double val)
+    {
+        long r = (long) val;
+        if (r < 0)
+            r = 0;
+        if (r > delta)
+            r = delta;
+        return min + r;
+    }
+
+    private double offsetDouble(long min, long delta, double r)
+    {
+        if (r < 0)
+            r = 0;
+        if (r > delta)
+            r = delta;
+        return min + r;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/DistributionQuantized.java b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionQuantized.java
new file mode 100644
index 0000000..9771134
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/DistributionQuantized.java
@@ -0,0 +1,90 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.Arrays;
+import java.util.Random;
+
+import org.apache.cassandra.stress.Stress;
+
+public class DistributionQuantized extends Distribution
+{
+
+    final Distribution delegate;
+    final long[] bounds;
+    final Random random = new Random();
+
+    public DistributionQuantized(Distribution delegate, int quantas)
+    {
+        this.delegate = delegate;
+        this.bounds = new long[quantas + 1];
+        bounds[0] = delegate.minValue();
+        bounds[quantas] = delegate.maxValue() + 1;
+        for (int i = 1 ; i < quantas ; i++)
+            bounds[i] = delegate.inverseCumProb(i / (double) quantas);
+    }
+
+    @Override
+    public long next()
+    {
+        int quanta = quanta(delegate.next());
+        return bounds[quanta] + (long) (random.nextDouble() * ((bounds[quanta + 1] - bounds[quanta])));
+    }
+
+    public double nextDouble()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long inverseCumProb(double cumProb)
+    {
+        long val = delegate.inverseCumProb(cumProb);
+        int quanta = quanta(val);
+        if (quanta < 0)
+            return bounds[0];
+        if (quanta >= bounds.length - 1)
+            return bounds[bounds.length - 1] - 1;
+        cumProb -= (quanta / ((double) bounds.length - 1));
+        cumProb *= (double) bounds.length - 1;
+        return bounds[quanta] + (long) (cumProb * (bounds[quanta + 1] - bounds[quanta]));
+    }
+
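+    // map a value to its quantum index; Arrays.binarySearch returns -(insertionPoint) - 1 on a
+    // miss, so -2 - i recovers the index of the bucket whose lower bound precedes val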
+    int quanta(long val)
+    {
+        int i = Arrays.binarySearch(bounds, val);
+        if (i < 0)
+            return -2 - i;
+        return i - 1;
+    }
+
+    public void setSeed(long seed)
+    {
+        delegate.setSeed(seed);
+    }
+
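+    // ad-hoc smoke test: invokes the stress tool's "print" mode on a quantized extreme-value
+    // distribution rather than running a workload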
+    public static void main(String[] args) throws Exception
+    {
+        Stress.main(new String[] { "print", "dist=qextreme(1..1M,2,2)"});
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/FasterRandom.java b/tools/stress/src/org/apache/cassandra/stress/generate/FasterRandom.java
new file mode 100644
index 0000000..455fec4
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/FasterRandom.java
@@ -0,0 +1,116 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.stress.generate;
+
+import java.util.Random;
+
+import org.apache.commons.math3.random.RandomGenerator;
+
+// based on http://en.wikipedia.org/wiki/Xorshift, but we periodically reseed from our stronger random generator
+// note it is also updated non-atomically, so it is only safe for use by a single thread
+public class FasterRandom implements RandomGenerator
+{
+    final Random random = new Random();
+
+    private long seed;
+    private int reseed;
+
+    public void setSeed(int seed)
+    {
+        setSeed((long) seed);
+    }
+
+    public void setSeed(int[] ints)
+    {
+            setSeed(((long) ints[0] << 32) | ints[1]);
+            setSeed (((long) ints[0] << 32) | ints[1]);
+        else
+            setSeed(ints[0]);
+    }
+
+    public void setSeed(long seed)
+    {
+        this.seed = seed;
+        rollover();
+    }
+
+    private void rollover()
+    {
+        this.reseed = 0;
+        random.setSeed(seed);
+        seed = random.nextLong();
+    }
+
+    public void nextBytes(byte[] bytes)
+    {
+        int i = 0;
+        while (i < bytes.length)
+        {
+            long next = nextLong();
+            while (i < bytes.length)
+            {
+                bytes[i++] = (byte) (next & 0xFF);
+                next >>>= 8;
+            }
+        }
+    }
+
+    public int nextInt()
+    {
+        return (int) nextLong();
+    }
+
+    public int nextInt(int i)
+    {
+        return Math.abs((int) nextLong() % i);
+    }
+
+    public long nextLong()
+    {
+        if (++this.reseed == 32)
+            rollover();
+
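+        // xorshift step followed by a 64-bit multiply (the xorshift* construction); every 32 calls
+        // rollover() re-derives the state from the stronger java.util.Random above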
+        long seed = this.seed;
+        seed ^= seed >> 12;
+        seed ^= seed << 25;
+        seed ^= seed >> 27;
+        this.seed = seed;
+        return seed * 2685821657736338717L;
+    }
+
+    public boolean nextBoolean()
+    {
+        return ((int) nextLong() & 1) == 1;
+    }
+
+    public float nextFloat()
+    {
+        return Float.intBitsToFloat((int) nextLong());
+    }
+
+    public double nextDouble()
+    {
+        return Double.longBitsToDouble(nextLong());
+    }
+
+    public double nextGaussian()
+    {
+        return random.nextGaussian();
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/Partition.java b/tools/stress/src/org/apache/cassandra/stress/generate/Partition.java
new file mode 100644
index 0000000..66f8c1d
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/Partition.java
@@ -0,0 +1,554 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Queue;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.stress.generate.values.Generator;
+
+// a partition is re-used to reduce garbage generation, as is its internal RowIterator
+// TODO: we should batch the generation of clustering components so we can bound the time and size necessary to
+// generate huge partitions with only a small number of clustering components; i.e. we should generate seeds for batches
+// of a single component, and then generate the values within those batches as necessary. this will be difficult with
+// generating sorted partitions, and may require generator support (e.g. we may need to support generating prefixes
+// that are extended/suffixed to generate each batch, so that we can sort the prefixes)
+public class Partition
+{
+
+    private long idseed;
+    private Seed seed;
+    private final Object[] partitionKey;
+    private final PartitionGenerator generator;
+    private final RowIterator iterator;
+
+    public Partition(PartitionGenerator generator)
+    {
+        this.generator = generator;
+        this.partitionKey = new Object[generator.partitionKey.size()];
+        if (generator.clusteringComponents.size() > 0)
+            iterator = new MultiRowIterator();
+        else
+            iterator = new SingleRowIterator();
+    }
+
+    void setSeed(Seed seed)
+    {
+        long idseed = 0;
+        for (int i = 0 ; i < partitionKey.length ; i++)
+        {
+            Generator generator = this.generator.partitionKey.get(i);
+            // set the partition key seed based on the current work item we're processing
+            generator.setSeed(seed.seed);
+            Object key = generator.generate();
+            partitionKey[i] = key;
+            // then contribute this value to the data seed
+            idseed = seed(key, generator.type, idseed);
+        }
+        this.seed = seed;
+        this.idseed = idseed;
+    }
+
+    public RowIterator iterator(double useChance, boolean isWrite)
+    {
+        iterator.reset(useChance, 0, 1, isWrite);
+        return iterator;
+    }
+
+    public RowIterator iterator(int targetCount, boolean isWrite)
+    {
+        iterator.reset(Double.NaN, targetCount, 1, isWrite);
+        return iterator;
+    }
+
+    class SingleRowIterator extends RowIterator
+    {
+        boolean done;
+
+        void reset(double useChance, int targetCount, int batches, boolean isWrite)
+        {
+            done = false;
+        }
+
+        public Iterable<Row> next()
+        {
+            if (done)
+                return Collections.emptyList();
+            for (int i = 0 ; i < row.row.length ; i++)
+            {
+                Generator gen = generator.valueComponents.get(i);
+                gen.setSeed(idseed);
+                row.row[i] = gen.generate();
+            }
+            done = true;
+            return Collections.singleton(row);
+        }
+
+        public boolean done()
+        {
+            return done;
+        }
+
+        public void markWriteFinished()
+        {
+            assert done;
+            generator.seeds.markFinished(seed);
+        }
+    }
+
+    public abstract class RowIterator
+    {
+        // we reuse the row object to save garbage
+        final Row row = new Row(partitionKey, new Object[generator.clusteringComponents.size() + generator.valueComponents.size()]);
+
+        public abstract Iterable<Row> next();
+        public abstract boolean done();
+        public abstract void markWriteFinished();
+        abstract void reset(double useChance, int targetCount, int batches, boolean isWrite);
+
+        public Partition partition()
+        {
+            return Partition.this;
+        }
+    }
+
+    // permits iterating a random subset of the procedurally generated rows in this partition; this is the only mechanism for visiting rows.
+    // we maintain a stack of clustering components and their seeds: for each clustering component we visit, we generate all values it takes at that level,
+    // and then, using the average (total) number of children it takes, we randomly choose whether or not to visit its children;
+    // if we do, we generate all possible values the immediate children can take and repeat the process. So at any one time we use space proportional
+    // to C.N, where N is the average number of values each clustering component takes, rather than the N^C total values in the partition.
+    // TODO : guarantee at least one row is always returned
+    // TODO : support first/last row, and constraining reads to rows we know are populated
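+    // (for example, with C = 3 clustering components each averaging N = 10 values, we hold roughly
+    // 30 candidate component values at a time instead of materialising the ~1000 rows of the partition)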
+    class MultiRowIterator extends RowIterator
+    {
+
+        // probability any single row will be generated in this iteration
+        double useChance;
+
+        // the seed used to generate the current values for the clustering components at each depth;
+        // used to save recalculating it for each row, so we only need to recalc from prior row.
+        final long[] clusteringSeeds = new long[generator.clusteringComponents.size()];
+        // the components remaining to be visited for each level of the current stack
+        final Deque<Object>[] clusteringComponents = new ArrayDeque[generator.clusteringComponents.size()];
+
+        // we want our chance of selection to be applied uniformly, so we compound the rolls we make at each level
+        // so that we know the probability with which we reached this level, and adjust our roll at that level accordingly
+        final double[] chancemodifier = new double[generator.clusteringComponents.size()];
+        final double[] rollmodifier = new double[generator.clusteringComponents.size()];
+
+        // track where in the partition we are, and where we are limited to
+        final int[] position = new int[generator.clusteringComponents.size()];
+        final int[] limit = new int[position.length];
+        int batchSize;
+        boolean returnedOne;
+        boolean forceReturnOne;
+
+        // reusable collections for generating unique and sorted clustering components
+        final Set<Object> unique = new HashSet<>();
+        final List<Comparable> tosort = new ArrayList<>();
+        final Random random = new Random();
+
+        MultiRowIterator()
+        {
+            for (int i = 0 ; i < clusteringComponents.length ; i++)
+                clusteringComponents[i] = new ArrayDeque<>();
+            rollmodifier[0] = 1f;
+            chancemodifier[0] = generator.clusteringChildAverages[0];
+        }
+
+        // if we're a write, the expected behaviour is that the requested batch count is compounded with the seed's visit
+        // count to decide how many rows we should return in one iteration
+        void reset(double useChance, int targetCount, int batches, boolean isWrite)
+        {
+            if (this.useChance < 1d)
+            {
+                // reset our prior roll-modifiers if the use chance was previously less than one
+                Arrays.fill(rollmodifier, 1d);
+                Arrays.fill(chancemodifier, 1d);
+            }
+
+            // set the seed for the first clustering component
+            generator.clusteringComponents.get(0).setSeed(idseed);
+            int[] position = seed.position;
+
+            // calculate how many first clustering components we'll generate, and how many total rows this predicts
+            int firstComponentCount = (int) generator.clusteringComponents.get(0).clusteringDistribution.next();
+            int expectedRowCount;
+
+            if (!isWrite && position != null)
+            {
+                expectedRowCount = 0;
+                for (int i = 0 ; i < position.length ; i++)
+                {
+                    expectedRowCount += position[i] * generator.clusteringChildAverages[i];
+                    limit[i] = position[i];
+                }
+            }
+            else
+            {
+                expectedRowCount = firstComponentCount * generator.clusteringChildAverages[0];
+                if (isWrite)
+                    batches *= seed.visits;
+                Arrays.fill(limit, Integer.MAX_VALUE);
+            }
+
+            batchSize = Math.max(1, expectedRowCount / batches);
+            if (Double.isNaN(useChance))
+                useChance = Math.max(0d, Math.min(1d, targetCount / (double) expectedRowCount));
+
+            // clear any remnants of the last iteration, wire up our constants, and fill in the first clustering components
+            this.useChance = useChance;
+            this.returnedOne = false;
+            for (Queue<?> q : clusteringComponents)
+                q.clear();
+            clusteringSeeds[0] = idseed;
+            fill(clusteringComponents[0], firstComponentCount, generator.clusteringComponents.get(0));
+
+            // seek to our start position
+            seek(isWrite ? position : null);
+        }
+
+        // generate the clustering components for the provided depth; requires preceding components
+        // to have been generated and their seeds populated into clusteringSeeds
+        void fill(int depth)
+        {
+            long seed = clusteringSeeds[depth - 1];
+            Generator gen = generator.clusteringComponents.get(depth);
+            gen.setSeed(seed);
+            clusteringSeeds[depth] = seed(clusteringComponents[depth - 1].peek(), generator.clusteringComponents.get(depth - 1).type, seed);
+            fill(clusteringComponents[depth], (int) gen.clusteringDistribution.next(), gen);
+        }
+
+        // generate the clustering components into the queue
+        void fill(Queue<Object> queue, int count, Generator generator)
+        {
+            if (count == 1)
+            {
+                queue.add(generator.generate());
+                return;
+            }
+
+            switch (Partition.this.generator.order)
+            {
+                case SORTED:
+                    if (Comparable.class.isAssignableFrom(generator.clazz))
+                    {
+                        tosort.clear();
+                        for (int i = 0 ; i < count ; i++)
+                            tosort.add((Comparable) generator.generate());
+                        Collections.sort(tosort);
+                        for (int i = 0 ; i < count ; i++)
+                            queue.add(tosort.get(i));
+                        break;
+                    }
+                    else
+                    {
+                        throw new RuntimeException("Generator class is not comparable: "+generator.clazz);
+                    }
+                case ARBITRARY:
+                    unique.clear();
+                    for (int i = 0 ; i < count ; i++)
+                    {
+                        Object next = generator.generate();
+                        if (unique.add(next))
+                            queue.add(next);
+                    }
+                    break;
+                case SHUFFLED:
+                    unique.clear();
+                    tosort.clear();
+                    for (int i = 0 ; i < count ; i++)
+                    {
+                        Object next = generator.generate();
+                        if (unique.add(next))
+                            tosort.add(new RandomOrder(next));
+                    }
+                    Collections.sort(tosort);
+                    for (Object o : tosort)
+                        queue.add(((RandomOrder)o).value);
+                    break;
+                default:
+                    throw new IllegalStateException();
+            }
+        }
+
+        // seek to the provided position (or the first entry if null)
+        private void seek(int[] position)
+        {
+            if (position == null)
+            {
+                this.position[0] = -1;
+                clusteringComponents[0].addFirst(this);
+                advance(0);
+                return;
+            }
+
+            assert position.length == clusteringComponents.length;
+            for (int i = 0 ; i < position.length ; i++)
+            {
+                if (i != 0)
+                    fill(i);
+                for (int c = position[i] ; c > 0 ; c--)
+                    clusteringComponents[i].poll();
+                row.row[i] = clusteringComponents[i].peek();
+            }
+            System.arraycopy(position, 0, this.position, 0, position.length);
+        }
+
+        // normal method for moving the iterator forward; maintains the row object, and delegates to advance(int)
+        // to move the iterator to the next item
+        void advance()
+        {
+            // we are always at the leaf level when this method is invoked
+            // so we calculate the seed for generating the row by combining the seed that generated the clustering components
+            int depth = clusteringComponents.length - 1;
+            long parentSeed = clusteringSeeds[depth];
+            long rowSeed = seed(clusteringComponents[depth].peek(), generator.clusteringComponents.get(depth).type, parentSeed);
+
+            // and then fill the row with the _non-clustering_ values for the position we _were_ at, as this is what we'll deliver
+            for (int i = clusteringSeeds.length ; i < row.row.length ; i++)
+            {
+                Generator gen = generator.valueComponents.get(i - clusteringSeeds.length);
+                gen.setSeed(rowSeed);
+                row.row[i] = gen.generate();
+            }
+            returnedOne = true;
+            forceReturnOne = false;
+
+            // then we advance the leaf level
+            advance(depth);
+        }
+
+        private void advance(int depth)
+        {
+            // advance the leaf component
+            clusteringComponents[depth].poll();
+            position[depth]++;
+            while (true)
+            {
+                if (clusteringComponents[depth].isEmpty())
+                {
+                    // if we've run out of clustering components at this level, ascend
+                    if (depth == 0)
+                        return;
+                    depth--;
+                    clusteringComponents[depth].poll();
+                    position[depth]++;
+                    continue;
+                }
+
+                if (depth == 0 && !returnedOne && clusteringComponents[0].size() == 1)
+                    forceReturnOne = true;
+
+                // the chance of descending is the uniform useChance, multiplied by the number of children
+                // we would on average generate (so if we have a 0.1 use chance, but should generate 10 children
+                // then we will always descend), multiplied by 1/(compound roll), where (compound roll) is the
+                // chance with which we reached this depth, i.e. if we already beat 50/50 odds, we double our
+                // chance of beating this next roll
+                double thischance = useChance * chancemodifier[depth];
+                if (forceReturnOne || thischance > 0.999f || thischance >= random.nextDouble())
+                {
+                    // if we're descending, we fill in our clustering component and increase our depth
+                    row.row[depth] = clusteringComponents[depth].peek();
+                    depth++;
+                    if (depth == clusteringComponents.length)
+                        break;
+                    // if we haven't reached the leaf, we update our probability statistics, fill in all of
+                    // this level's clustering components, and repeat
+                    if (useChance < 1d)
+                    {
+                        rollmodifier[depth] = rollmodifier[depth - 1] / Math.min(1d, thischance);
+                        chancemodifier[depth] = generator.clusteringChildAverages[depth] * rollmodifier[depth];
+                    }
+                    position[depth] = 0;
+                    fill(depth);
+                    continue;
+                }
+
+                // if we don't descend, we remove the clustering suffix we've skipped and continue
+                clusteringComponents[depth].poll();
+                position[depth]++;
+            }
+        }
+
+        public Iterable<Row> next()
+        {
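+            // compute the furthest position this batch may reach: spread batchSize rows across the
+            // clustering levels using the per-level child averages, normalise any carries upwards,
+            // and clamp against the iterator's overall limit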
+            final int[] limit = position.clone();
+            int remainingSize = batchSize;
+            for (int i = 0 ; i < limit.length && remainingSize > 0 ; i++)
+            {
+                limit[i] += remainingSize / generator.clusteringChildAverages[i];
+                remainingSize %= generator.clusteringChildAverages[i];
+            }
+            assert remainingSize == 0;
+            for (int i = limit.length - 1 ; i > 0 ; i--)
+            {
+                if (limit[i] > generator.clusteringChildAverages[i])
+                {
+                    limit[i - 1] += limit[i] / generator.clusteringChildAverages[i];
+                    limit[i] %= generator.clusteringChildAverages[i];
+                }
+            }
+            for (int i = 0 ; i < limit.length ; i++)
+            {
+                if (limit[i] < this.limit[i])
+                    break;
+                limit[i] = Math.min(limit[i], this.limit[i]);
+            }
+            return new Iterable<Row>()
+            {
+                public Iterator<Row> iterator()
+                {
+                    return new Iterator<Row>()
+                    {
+
+                        public boolean hasNext()
+                        {
+                            if (done())
+                                return false;
+                            for (int i = 0 ; i < position.length ; i++)
+                                if (position[i] < limit[i])
+                                    return true;
+                            return false;
+                        }
+
+                        public Row next()
+                        {
+                            advance();
+                            return row;
+                        }
+
+                        public void remove()
+                        {
+                            throw new UnsupportedOperationException();
+                        }
+                    };
+                }
+            };
+        }
+
+        public boolean done()
+        {
+            return clusteringComponents[0].isEmpty();
+        }
+
+        public void markWriteFinished()
+        {
+            if (done())
+                generator.seeds.markFinished(seed);
+            else
+                generator.seeds.markVisited(seed, position.clone());
+        }
+
+        public Partition partition()
+        {
+            return Partition.this;
+        }
+    }
+
+    private static class RandomOrder implements Comparable<RandomOrder>
+    {
+        final int order = ThreadLocalRandom.current().nextInt();
+        final Object value;
+        private RandomOrder(Object value)
+        {
+            this.value = value;
+        }
+
+        public int compareTo(RandomOrder that)
+        {
+            return Integer.compare(this.order, that.order);
+        }
+    }
+
+    // calculate a new seed based on the combination of a parent seed and the generated child, to generate
+    // any children of this child
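+    // (accumulates the child's canonical byte/char/numeric form into the seed with the usual
+    // 31 * seed + x hash step)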
+    static long seed(Object object, AbstractType type, long seed)
+    {
+        if (object instanceof ByteBuffer)
+        {
+            ByteBuffer buf = (ByteBuffer) object;
+            for (int i = buf.position() ; i < buf.limit() ; i++)
+                seed = (31 * seed) + buf.get(i);
+            return seed;
+        }
+        else if (object instanceof String)
+        {
+            String str = (String) object;
+            for (int i = 0 ; i < str.length() ; i++)
+                seed = (31 * seed) + str.charAt(i);
+            return seed;
+        }
+        else if (object instanceof Number)
+        {
+            return (seed * 31) + ((Number) object).longValue();
+        }
+        else if (object instanceof UUID)
+        {
+            return seed * 31 + (((UUID) object).getLeastSignificantBits() ^ ((UUID) object).getMostSignificantBits());
+        }
+        else
+        {
+            return seed(type.decompose(object), BytesType.instance, seed);
+        }
+    }
+
+    public Object getPartitionKey(int i)
+    {
+        return partitionKey[i];
+    }
+
+    public String getKeyAsString()
+    {
+        StringBuilder sb = new StringBuilder();
+        int i = 0;
+        for (Object key : partitionKey)
+        {
+            if (i > 0)
+                sb.append("|");
+            AbstractType type = generator.partitionKey.get(i++).type;
+            sb.append(type.getString(type.decompose(key)));
+        }
+        return sb.toString();
+    }
+
+    // used for thrift smart routing - if it's a multi-part key we don't try to route correctly right now
+    public ByteBuffer getToken()
+    {
+        return generator.partitionKey.get(0).type.decompose(partitionKey[0]);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/PartitionGenerator.java b/tools/stress/src/org/apache/cassandra/stress/generate/PartitionGenerator.java
new file mode 100644
index 0000000..128d2f5
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/PartitionGenerator.java
@@ -0,0 +1,125 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.generate.values.Generator;
+
+public class PartitionGenerator
+{
+
+    public static enum Order
+    {
+        ARBITRARY, SHUFFLED, SORTED
+    }
+
+    public final double maxRowCount;
+    public final double minRowCount;
+    final List<Generator> partitionKey;
+    final List<Generator> clusteringComponents;
+    final List<Generator> valueComponents;
+    final int[] clusteringChildAverages;
+
+    private final Map<String, Integer> indexMap;
+    final Order order;
+    final SeedManager seeds;
+
+    final List<Partition> recyclable = new ArrayList<>();
+    int partitionsInUse = 0;
+
+    public void reset()
+    {
+        partitionsInUse = 0;
+    }
+
+    public PartitionGenerator(List<Generator> partitionKey, List<Generator> clusteringComponents, List<Generator> valueComponents, Order order, SeedManager seeds)
+    {
+        this.partitionKey = partitionKey;
+        this.clusteringComponents = clusteringComponents;
+        this.valueComponents = valueComponents;
+        this.order = order;
+        this.seeds = seeds;
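+        // clusteringChildAverages[i] = expected number of rows beneath a single value of clustering
+        // component i, i.e. the product of the average fan-out of all deeper components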
+        this.clusteringChildAverages = new int[clusteringComponents.size()];
+        for (int i = clusteringChildAverages.length - 1 ; i >= 0 ; i--)
+            clusteringChildAverages[i] = (int) (i < (clusteringChildAverages.length - 1) ? clusteringComponents.get(i + 1).clusteringDistribution.average() * clusteringChildAverages[i + 1] : 1);
+        double maxRowCount = 1d;
+        double minRowCount = 1d;
+        for (Generator component : clusteringComponents)
+        {
+            maxRowCount *= component.clusteringDistribution.maxValue();
+            minRowCount *= component.clusteringDistribution.minValue();
+        }
+        this.maxRowCount = maxRowCount;
+        this.minRowCount = minRowCount;
+        this.indexMap = new HashMap<>();
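+        // partition key columns get negative indexes (-1, -2, ...); clustering and value columns get
+        // non-negative indexes, matching the convention used by convert() and Row.get()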
+        int i = 0;
+        for (Generator generator : partitionKey)
+            indexMap.put(generator.name, --i);
+        i = 0;
+        for (Generator generator : Iterables.concat(clusteringComponents, valueComponents))
+            indexMap.put(generator.name, i++);
+    }
+
+    public boolean permitNulls(int index)
+    {
+        return !(index < 0 || index < clusteringComponents.size());
+    }
+
+    public int indexOf(String name)
+    {
+        Integer i = indexMap.get(name);
+        if (i == null)
+            throw new NoSuchElementException();
+        return i;
+    }
+
+    public Partition generate(Operation op)
+    {
+        if (recyclable.size() <= partitionsInUse || recyclable.get(partitionsInUse) == null)
+            recyclable.add(new Partition(this));
+
+        Seed seed = seeds.next(op);
+        if (seed == null)
+            return null;
+        Partition partition = recyclable.get(partitionsInUse++);
+        partition.setSeed(seed);
+        return partition;
+    }
+
+    public ByteBuffer convert(int c, Object v)
+    {
+        if (c < 0)
+            return partitionKey.get(-1-c).type.decompose(v);
+        if (c < clusteringComponents.size())
+            return clusteringComponents.get(c).type.decompose(v);
+        return valueComponents.get(c - clusteringComponents.size()).type.decompose(v);
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/RatioDistribution.java b/tools/stress/src/org/apache/cassandra/stress/generate/RatioDistribution.java
new file mode 100644
index 0000000..c71945a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/RatioDistribution.java
@@ -0,0 +1,51 @@
+package org.apache.cassandra.stress.generate;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+public class RatioDistribution
+{
+
+    final Distribution distribution;
+    final double divisor;
+
+    public RatioDistribution(Distribution distribution, double divisor)
+    {
+        this.distribution = distribution;
+        this.divisor = divisor;
+    }
+
+    // yields a value between 0 and 1
+    public double next()
+    {
+        return Math.max(0f, Math.min(1f, distribution.nextDouble() / divisor));
+    }
+
+    public double min()
+    {
+        return Math.min(1d, distribution.minValue() / divisor);
+    }
+
+    public double max()
+    {
+        return Math.min(1d, distribution.maxValue() / divisor);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/RatioDistributionFactory.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/RatioDistributionFactory.java
index e42574b..16474d8 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/RatioDistributionFactory.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.generate;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,11 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import java.io.Serializable;
 
-public class ByteBufferOutputStream extends OutputStream
+public interface RatioDistributionFactory extends Serializable
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    RatioDistribution get();
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/Row.java
similarity index 65%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/Row.java
index e42574b..421dbbf 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/Row.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.generate;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,23 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Row
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    final Object[] partitionKey;
+    final Object[] row;
+
+    public Row(Object[] partitionKey, Object[] row)
     {
-        this.buffer = buffer;
+        this.partitionKey = partitionKey;
+        this.row = row;
     }
 
-    public void write(int b)
+    public Object get(int column)
     {
-        buffer.put((byte) b);
+        if (column < 0)
+            return partitionKey[-1-column];
+        return row[column];
     }
 
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
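
A short usage sketch (not part of the patch) of the Row.get() index convention: non-negative indices address the row columns, while negative indices map onto the partition key via -1-column.

    import org.apache.cassandra.stress.generate.Row;

    public class RowIndexingSketch
    {
        public static void main(String[] args)
        {
            Row row = new Row(new Object[]{ "pk0", "pk1" }, new Object[]{ "c0", "c1" });
            System.out.println(row.get(0));   // c0  -- row columns
            System.out.println(row.get(-1));  // pk0 -- partitionKey[-1 - (-1)] = partitionKey[0]
            System.out.println(row.get(-2));  // pk1 -- partitionKey[1]
        }
    }
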
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java b/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java
new file mode 100644
index 0000000..f427608
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java
@@ -0,0 +1,67 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.stress.generate;
+
+import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+
+import org.apache.cassandra.stress.util.DynamicList;
+
+public class Seed implements Comparable<Seed>
+{
+
+    public final long seed;
+    final int visits;
+
+    DynamicList.Node poolNode;
+    volatile int[] position;
+    volatile State state = State.HELD;
+
+    private static final AtomicReferenceFieldUpdater<Seed, Seed.State> stateUpdater = AtomicReferenceFieldUpdater.newUpdater(Seed.class, State.class, "state");
+
+    public int compareTo(Seed that)
+    {
+        return Long.compare(this.seed, that.seed);
+    }
+
+    static enum State
+    {
+        HELD, AVAILABLE
+    }
+
+    Seed(long seed, int visits)
+    {
+        this.seed = seed;
+        this.visits = visits;
+    }
+
+    boolean take()
+    {
+        return stateUpdater.compareAndSet(this, State.AVAILABLE, State.HELD);
+    }
+
+    void yield()
+    {
+        state = State.AVAILABLE;
+    }
+
+    public int[] position()
+    {
+        return position;
+    }
+}
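
A simplified, standalone sketch of the HELD/AVAILABLE hand-off (hypothetical class; the patch uses an AtomicReferenceFieldUpdater on the Seed field itself): a seed is created HELD by its writer, becomes AVAILABLE once yield() is called, and take() claims it back atomically for a revisit.

    import java.util.concurrent.atomic.AtomicReference;

    public class SeedStateSketch
    {
        enum State { HELD, AVAILABLE }

        static final AtomicReference<State> state = new AtomicReference<>(State.HELD);

        static boolean take()      { return state.compareAndSet(State.AVAILABLE, State.HELD); }
        static void    yieldSeed() { state.set(State.AVAILABLE); }

        public static void main(String[] args)
        {
            System.out.println(take());  // false -- still held by the creating writer
            yieldSeed();                 // the manager yields the seed after a visit completes
            System.out.println(take());  // true  -- a later operation can now claim the seed
        }
    }
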
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java b/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java
new file mode 100644
index 0000000..dba721d
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java
@@ -0,0 +1,249 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.stress.generate;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.DynamicList;
+
+public class SeedManager
+{
+
+    final Distribution visits;
+    final Generator writes;
+    final Generator reads;
+    final ConcurrentHashMap<Seed, Seed> managing = new ConcurrentHashMap<>();
+    final DynamicList<Seed> sampleFrom;
+    final Distribution sample;
+
+    public SeedManager(StressSettings settings)
+    {
+        Generator writes, reads;
+        if (settings.generate.sequence != null)
+        {
+            long[] seq = settings.generate.sequence;
+            if (settings.generate.readlookback != null)
+            {
+                LookbackableWriteGenerator series = new LookbackableWriteGenerator(seq[0], seq[1], settings.generate.wrap, settings.generate.readlookback.get());
+                writes = series;
+                reads = series.reads;
+            }
+            else
+            {
+                writes = reads = new SeriesGenerator(seq[0], seq[1], settings.generate.wrap);
+            }
+        }
+        else
+        {
+            writes = reads = new RandomGenerator(settings.generate.distribution.get());
+        }
+        this.visits = settings.insert.visits.get();
+        this.writes = writes;
+        this.reads = reads;
+        this.sample = DistributionInverted.invert(settings.insert.revisit.get());
+        if (sample.maxValue() > Integer.MAX_VALUE || sample.minValue() < 0)
+            throw new IllegalArgumentException();
+        this.sampleFrom = new DynamicList<>((int) sample.maxValue());
+    }
+
+    public Seed next(Operation op)
+    {
+        if (!op.isWrite())
+        {
+            Seed seed = reads.next(-1);
+            if (seed == null)
+                return null;
+            Seed managing = this.managing.get(seed);
+            return managing == null ? seed : managing;
+        }
+
+        while (true)
+        {
+            int index = (int) sample.next();
+            Seed seed = sampleFrom.get(index);
+            if (seed != null && seed.take())
+                return seed;
+
+            seed = writes.next((int) visits.next());
+            if (seed == null)
+                return null;
+            // seeds are created HELD, so if we insert one successfully we hold it exclusively for our write

+            if (managing.putIfAbsent(seed, seed) == null)
+                return seed;
+        }
+    }
+
+    public void markVisited(Seed seed, int[] position)
+    {
+        boolean first = seed.position == null;
+        seed.position = position;
+        finishedWriting(seed, first, false);
+    }
+
+    public void markFinished(Seed seed)
+    {
+        finishedWriting(seed, seed.position == null, true);
+    }
+
+    void finishedWriting(Seed seed, boolean first, boolean completed)
+    {
+        if (!completed)
+        {
+            if (first)
+                seed.poolNode = sampleFrom.append(seed);
+            seed.yield();
+        }
+        else
+        {
+            if (!first)
+                sampleFrom.remove(seed.poolNode);
+            managing.remove(seed);
+        }
+        if (first)
+            writes.finishWrite(seed);
+    }
+
+    private abstract class Generator
+    {
+        abstract Seed next(int visits);
+        void finishWrite(Seed seed) { }
+    }
+
+    private class RandomGenerator extends Generator
+    {
+
+        final Distribution distribution;
+
+        public RandomGenerator(Distribution distribution)
+        {
+            this.distribution = distribution;
+        }
+
+        public Seed next(int visits)
+        {
+            return new Seed(distribution.next(), visits);
+        }
+    }
+
+    private class SeriesGenerator extends Generator
+    {
+
+        final long start;
+        final long totalCount;
+        final boolean wrap;
+        final AtomicLong next = new AtomicLong();
+
+        public SeriesGenerator(long start, long end, boolean wrap)
+        {
+            this.wrap = wrap;
+            if (start > end)
+                throw new IllegalStateException();
+            this.start = start;
+            this.totalCount = 1 + end - start;
+        }
+
+        public Seed next(int visits)
+        {
+            long next = this.next.getAndIncrement();
+            if (!wrap && next >= totalCount)
+                return null;
+            return new Seed(start + (next % totalCount), visits);
+        }
+    }
+
+    private class LookbackableWriteGenerator extends SeriesGenerator
+    {
+
+        final AtomicLong writeCount = new AtomicLong();
+        final ConcurrentSkipListMap<Seed, Seed> afterMin = new ConcurrentSkipListMap<>();
+        final LookbackReadGenerator reads;
+
+        public LookbackableWriteGenerator(long start, long end, boolean wrap, Distribution readLookback)
+        {
+            super(start, end, wrap);
+            this.writeCount.set(0);
+            reads = new LookbackReadGenerator(readLookback);
+        }
+
+        public Seed next(int visits)
+        {
+            long next = this.next.getAndIncrement();
+            if (!wrap && next >= totalCount)
+                return null;
+            return new Seed(start + (next % totalCount), visits);
+        }
+
+        void finishWrite(Seed seed)
+        {
+            if (seed.seed <= writeCount.get())
+                return;
+            afterMin.put(seed, seed);
+            while (true)
+            {
+                Map.Entry<Seed, Seed> head = afterMin.firstEntry();
+                if (head == null)
+                    return;
+                long min = this.writeCount.get();
+                if (head.getKey().seed <= min)
+                    return;
+                if (head.getKey().seed == min + 1 && this.writeCount.compareAndSet(min, min + 1))
+                {
+                    afterMin.remove(head.getKey());
+                    continue;
+                }
+                return;
+            }
+        }
+
+        private class LookbackReadGenerator extends Generator
+        {
+
+            final Distribution lookback;
+
+            public LookbackReadGenerator(Distribution lookback)
+            {
+                this.lookback = lookback;
+                if (lookback.maxValue() > start + totalCount)
+                    throw new IllegalArgumentException("Invalid lookback distribution; max value is " + lookback.maxValue()
+                                                       + ", but series only ranges from " + writeCount + " to " + (start + totalCount));
+            }
+
+            public Seed next(int visits)
+            {
+                long lookback = this.lookback.next();
+                long range = writeCount.get();
+                long startOffset = range - lookback;
+                if (startOffset < 0)
+                {
+                    if (range == totalCount && !wrap)
+                        return null;
+                    startOffset = range == 0 ? 0 : lookback % range;
+                }
+                return new Seed(start + startOffset, visits);
+            }
+        }
+
+    }
+
+}
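
The watermark logic in LookbackableWriteGenerator.finishWrite() only advances writeCount once every lower-numbered seed has finished, so lookback reads never target a seed that might not have been written yet. A simplified, single-threaded sketch of that idea for a series starting at 1 (not the patch's implementation, which keeps a ConcurrentSkipListMap of Seeds):

    import java.util.concurrent.ConcurrentSkipListSet;
    import java.util.concurrent.atomic.AtomicLong;

    public class WatermarkSketch
    {
        static final AtomicLong writeCount = new AtomicLong();
        static final ConcurrentSkipListSet<Long> finishedOutOfOrder = new ConcurrentSkipListSet<>();

        static void finishWrite(long seed)
        {
            finishedOutOfOrder.add(seed);
            while (true)
            {
                Long head = finishedOutOfOrder.pollFirst();  // simplification: assumes a single caller
                if (head == null)
                    return;
                long min = writeCount.get();
                if (head == min + 1)
                {
                    writeCount.compareAndSet(min, min + 1);  // contiguous: advance the watermark
                    continue;
                }
                finishedOutOfOrder.add(head);                // not yet contiguous: put it back and wait
                return;
            }
        }

        public static void main(String[] args)
        {
            finishWrite(2);                       // out of order: watermark stays put
            System.out.println(writeCount.get()); // 0
            finishWrite(1);                       // fills the gap: watermark advances past both
            System.out.println(writeCount.get()); // 2
        }
    }
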
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Booleans.java
similarity index 64%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/Booleans.java
index e42574b..21525af 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Booleans.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,31 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import org.apache.cassandra.db.marshal.BooleanType;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Booleans extends Generator<Boolean>
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public Booleans(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(BooleanType.instance, config, name, Boolean.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public Boolean generate()
     {
-        buffer.put(b, off, len);
+        return identityDistribution.next() % 2 == 0;
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Bytes.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Bytes.java
new file mode 100644
index 0000000..358163c
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Bytes.java
@@ -0,0 +1,55 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.stress.generate.FasterRandom;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+
+public class Bytes extends Generator<ByteBuffer>
+{
+    private final byte[] bytes;
+    private final FasterRandom rand = new FasterRandom();
+
+    public Bytes(String name, GeneratorConfig config)
+    {
+        super(BytesType.instance, config, name, ByteBuffer.class);
+        bytes = new byte[(int) sizeDistribution.maxValue()];
+    }
+
+    @Override
+    public ByteBuffer generate()
+    {
+        long seed = identityDistribution.next();
+        sizeDistribution.setSeed(seed);
+        rand.setSeed(~seed);
+        int size = (int) sizeDistribution.next();
+        for (int i = 0; i < size; )
+            for (long v = rand.nextLong(),
+                 n = Math.min(size - i, Long.SIZE/Byte.SIZE);
+                 n-- > 0; v >>= Byte.SIZE)
+                bytes[i++] = (byte)v;
+        return ByteBuffer.wrap(Arrays.copyOf(bytes, size));
+    }
+}
\ No newline at end of file
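
The nested loop in Bytes.generate() peels up to eight bytes off each random long rather than drawing one byte per call. A standalone sketch of the same loop (java.util.Random stands in here for the patch's FasterRandom):

    import java.util.Arrays;
    import java.util.Random;

    public class LongToBytesSketch
    {
        public static void main(String[] args)
        {
            Random rand = new Random(42);
            int size = 11;
            byte[] bytes = new byte[size];
            // each nextLong() supplies Long.SIZE/Byte.SIZE = 8 bytes, shifted out one at a time
            for (int i = 0; i < size; )
                for (long v = rand.nextLong(), n = Math.min(size - i, Long.SIZE / Byte.SIZE);
                     n-- > 0; v >>= Byte.SIZE)
                    bytes[i++] = (byte) v;
            System.out.println(Arrays.toString(bytes));
        }
    }
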
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Dates.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Dates.java
new file mode 100644
index 0000000..7350f57
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Dates.java
@@ -0,0 +1,47 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import java.util.Date;
+
+import org.apache.cassandra.db.marshal.DateType;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.settings.OptionDistribution;
+
+public class Dates extends Generator<Date>
+{
+    public Dates(String name, GeneratorConfig config)
+    {
+        super(DateType.instance, config, name, Date.class);
+    }
+
+    // TODO: let the range of values generated advance as the stress test progresses
+    @Override
+    public Date generate()
+    {
+        return new Date(identityDistribution.next());
+    }
+
+    DistributionFactory defaultIdentityDistribution()
+    {
+        return OptionDistribution.get("uniform(1.." + Long.toString(50L*365L*24L*60L*60L*1000L) + ")");
+    }
+}
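
For reference, the default identity distribution above spans fifty 365-day years of milliseconds, i.e. epoch-relative timestamps up to roughly late 2019 (leap days are not counted). A quick standalone check:

    public class DateRangeSketch
    {
        public static void main(String[] args)
        {
            long fiftyYearsMillis = 50L * 365L * 24L * 60L * 60L * 1000L;
            System.out.println(fiftyYearsMillis);                      // 1576800000000
            System.out.println(new java.util.Date(fiftyYearsMillis)); // ~ 20 Dec 2019 (local timezone)
        }
    }
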
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Doubles.java
similarity index 64%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/Doubles.java
index e42574b..0f04eb6 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Doubles.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,31 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import org.apache.cassandra.db.marshal.DoubleType;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Doubles extends Generator<Double>
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public Doubles(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(DoubleType.instance, config, name, Double.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public Double generate()
     {
-        buffer.put(b, off, len);
+        return identityDistribution.nextDouble();
     }
 }
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Floats.java
similarity index 64%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/Floats.java
index e42574b..19f449a 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Floats.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,31 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import org.apache.cassandra.db.marshal.FloatType;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Floats extends Generator<Float>
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public Floats(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(FloatType.instance, config, name, Float.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public Float generate()
     {
-        buffer.put(b, off, len);
+        return (float) identityDistribution.nextDouble();
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Generator.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Generator.java
new file mode 100644
index 0000000..00f866a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Generator.java
@@ -0,0 +1,73 @@
+package org.apache.cassandra.stress.generate.values;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.settings.OptionDistribution;
+
+public abstract class Generator<T>
+{
+
+    public final String name;
+    public final AbstractType<T> type;
+    public final Class<T> clazz;
+    final long salt;
+    final Distribution identityDistribution;
+    final Distribution sizeDistribution;
+    public final Distribution clusteringDistribution;
+
+    public Generator(AbstractType<T> type, GeneratorConfig config, String name, Class<T> clazz)
+    {
+        this.type = type;
+        this.name = name;
+        this.clazz = clazz;
+        this.salt = config.salt;
+        this.identityDistribution = config.getIdentityDistribution(defaultIdentityDistribution());
+        this.sizeDistribution = config.getSizeDistribution(defaultSizeDistribution());
+        this.clusteringDistribution = config.getClusteringDistribution(defaultClusteringDistribution());
+    }
+
+    public void setSeed(long seed)
+    {
+        identityDistribution.setSeed(seed ^ salt);
+        clusteringDistribution.setSeed(seed ^ ~salt);
+    }
+
+    public abstract T generate();
+
+    DistributionFactory defaultIdentityDistribution()
+    {
+        return OptionDistribution.get("uniform(1..100B)");
+    }
+
+    DistributionFactory defaultSizeDistribution()
+    {
+        return OptionDistribution.get("uniform(4..8)");
+    }
+
+    DistributionFactory defaultClusteringDistribution()
+    {
+        return OptionDistribution.get("fixed(1)");
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/GeneratorConfig.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/GeneratorConfig.java
new file mode 100644
index 0000000..8f7b2ea
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/GeneratorConfig.java
@@ -0,0 +1,68 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.MurmurHash;
+
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.Map;
+
+public class GeneratorConfig implements Serializable
+{
+    public final long salt;
+
+    private final DistributionFactory clusteringDistributions;
+    private final DistributionFactory sizeDistributions;
+    private final DistributionFactory identityDistributions;
+
+    public GeneratorConfig(String seedStr, DistributionFactory clusteringDistributions, DistributionFactory sizeDistributions, DistributionFactory identityDistributions)
+    {
+        this.clusteringDistributions = clusteringDistributions;
+        this.sizeDistributions = sizeDistributions;
+        this.identityDistributions = identityDistributions;
+        ByteBuffer buf = ByteBufferUtil.bytes(seedStr);
+        long[] hash = new long[2];
+        MurmurHash.hash3_x64_128(buf, buf.position(), buf.remaining(), 0, hash);
+        salt = hash[0];
+    }
+
+    Distribution getClusteringDistribution(DistributionFactory deflt)
+    {
+        return (clusteringDistributions == null ? deflt : clusteringDistributions).get();
+    }
+
+    Distribution getIdentityDistribution(DistributionFactory deflt)
+    {
+        return (identityDistributions == null ? deflt : identityDistributions).get();
+    }
+
+    Distribution getSizeDistribution(DistributionFactory deflt)
+    {
+        return (sizeDistributions == null ? deflt : sizeDistributions).get();
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/HexBytes.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/HexBytes.java
new file mode 100644
index 0000000..19f2cc3
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/HexBytes.java
@@ -0,0 +1,56 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import org.apache.cassandra.db.marshal.BytesType;
+
+public class HexBytes extends Generator<ByteBuffer>
+{
+    private final byte[] bytes;
+
+    public HexBytes(String name, GeneratorConfig config)
+    {
+        super(BytesType.instance, config, name, ByteBuffer.class);
+        bytes = new byte[(int) sizeDistribution.maxValue()];
+    }
+
+    @Override
+    public ByteBuffer generate()
+    {
+        long seed = identityDistribution.next();
+        sizeDistribution.setSeed(seed);
+        int size = (int) sizeDistribution.next();
+        for (int i = 0 ; i < size ; i +=16)
+        {
+            long value = identityDistribution.next();
+            for (int j = 0 ; j < 16 && i + j < size ; j++)
+            {
+                int v = (int) (value & 15);
+                bytes[i + j] = (byte) (v < 10 ? '0' + v : 'A' + v - 10);
+                value >>>= 4;
+            }
+        }
+        return ByteBuffer.wrap(Arrays.copyOf(bytes, size));
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/HexStrings.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/HexStrings.java
new file mode 100644
index 0000000..c811a61
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/HexStrings.java
@@ -0,0 +1,53 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+
+public class HexStrings extends Generator<String>
+{
+    private final char[] chars;
+
+    public HexStrings(String name, GeneratorConfig config)
+    {
+        super(UTF8Type.instance, config, name, String.class);
+        chars = new char[(int) sizeDistribution.maxValue()];
+    }
+
+    @Override
+    public String generate()
+    {
+        long seed = identityDistribution.next();
+        sizeDistribution.setSeed(seed);
+        int size = (int) sizeDistribution.next();
+        for (int i = 0 ; i < size ; i +=16)
+        {
+            long value = identityDistribution.next();
+            for (int j = 0 ; j < 16 && i + j < size ; j++)
+            {
+                int v = (int) (value & 15);
+                chars[i + j] = (char) (v < 10 ? '0' + v : 'A' + v - 10);
+                value >>>= 4;
+            }
+        }
+        return new String(chars, 0, size);
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Inets.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Inets.java
new file mode 100644
index 0000000..107daad
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Inets.java
@@ -0,0 +1,57 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.cassandra.db.marshal.InetAddressType;
+
+
+public class Inets extends Generator<InetAddress>
+{
+    final byte[] buf;
+    public Inets(String name, GeneratorConfig config)
+    {
+        super(InetAddressType.instance, config, name, InetAddress.class);
+        buf = new byte[4];
+    }
+
+    @Override
+    public InetAddress generate()
+    {
+        int val = (int) identityDistribution.next();
+
+        buf[0] = (byte)(val >>> 24);
+        buf[1] = (byte)(val >>> 16);
+        buf[2] = (byte)(val >>> 8);
+        buf[3] = (byte)val;
+
+        try
+        {
+            return InetAddress.getByAddress(buf);
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Integers.java
similarity index 65%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/Integers.java
index e42574b..e05c615 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Integers.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,32 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import org.apache.cassandra.db.marshal.Int32Type;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Integers extends Generator<Integer>
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public Integers(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(Int32Type.instance, config, name, Integer.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public Integer generate()
     {
-        buffer.put(b, off, len);
+        return (int) identityDistribution.next();
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Lists.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Lists.java
new file mode 100644
index 0000000..6480d7a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Lists.java
@@ -0,0 +1,55 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.ListType;
+
+public class Lists extends Generator<List>
+{
+    final Generator valueType;
+    final Object[] buffer;
+
+    public Lists(String name, Generator valueType, GeneratorConfig config)
+    {
+        super(ListType.getInstance(valueType.type), config, name, List.class);
+        this.valueType = valueType;
+        buffer = new Object[(int) sizeDistribution.maxValue()];
+    }
+
+    public void setSeed(long seed)
+    {
+        super.setSeed(seed);
+        valueType.setSeed(seed * 31);
+    }
+
+    @Override
+    public List generate()
+    {
+        int size = (int) sizeDistribution.next();
+        for (int i = 0 ; i < size ; i++)
+            buffer[i] = valueType.generate();
+        return com.google.common.collect.Lists.newArrayList(Arrays.copyOf(buffer, size));
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Longs.java
similarity index 64%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/Longs.java
index e42574b..638ecd0 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Longs.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,31 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import org.apache.cassandra.db.marshal.LongType;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public class Longs extends Generator<Long>
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public Longs(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(LongType.instance, config, name, Long.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public Long generate()
     {
-        buffer.put(b, off, len);
+        return identityDistribution.next();
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Sets.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Sets.java
new file mode 100644
index 0000000..8246286
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Sets.java
@@ -0,0 +1,54 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.cassandra.db.marshal.SetType;
+
+public class Sets extends Generator<Set>
+{
+    final Generator valueType;
+
+    public Sets(String name, Generator valueType, GeneratorConfig config)
+    {
+        super(SetType.getInstance(valueType.type), config, name, Set.class);
+        this.valueType = valueType;
+    }
+
+    public void setSeed(long seed)
+    {
+        super.setSeed(seed);
+        valueType.setSeed(seed * 31);
+    }
+
+    @Override
+    public Set generate()
+    {
+        final Set set = new HashSet();
+        int size = (int) sizeDistribution.next();
+        for (int i = 0 ; i < size ; i++)
+            set.add(valueType.generate());
+        return set;
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/Strings.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/Strings.java
new file mode 100644
index 0000000..71aaae6
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/Strings.java
@@ -0,0 +1,53 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+import java.util.Random;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.stress.generate.FasterRandom;
+
+public class Strings extends Generator<String>
+{
+    private final char[] chars;
+    private final FasterRandom rnd = new FasterRandom();
+
+    public Strings(String name, GeneratorConfig config)
+    {
+        super(UTF8Type.instance, config, name, String.class);
+        chars = new char[(int) sizeDistribution.maxValue()];
+    }
+
+    @Override
+    public String generate()
+    {
+        long seed = identityDistribution.next();
+        sizeDistribution.setSeed(seed);
+        rnd.setSeed(~seed);
+        int size = (int) sizeDistribution.next();
+        for (int i = 0; i < size; )
+            for (long v = rnd.nextLong(),
+                 n = Math.min(size - i, Long.SIZE/Byte.SIZE);
+                 n-- > 0; v >>= Byte.SIZE)
+                chars[i++] = (char) (((v & 127) + 32) & 127);
+        return new String(chars, 0, size);
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/values/TimeUUIDs.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/TimeUUIDs.java
new file mode 100644
index 0000000..efe4b79
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/TimeUUIDs.java
@@ -0,0 +1,51 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.stress.generate.values;
+
+
+import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.utils.UUIDGen;
+
+import java.util.UUID;
+
+public class TimeUUIDs extends Generator<UUID>
+{
+    final Dates dateGen;
+    final long clockSeqAndNode;
+
+    public TimeUUIDs(String name, GeneratorConfig config)
+    {
+        super(TimeUUIDType.instance, config, name, UUID.class);
+        dateGen = new Dates(name, config);
+        clockSeqAndNode = config.salt;
+    }
+
+    public void setSeed(long seed)
+    {
+        dateGen.setSeed(seed);
+    }
+
+    @Override
+    public UUID generate()
+    {
+        return UUIDGen.getTimeUUID(dateGen.generate().getTime(), clockSeqAndNode);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/generate/values/UUIDs.java
similarity index 65%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/generate/values/UUIDs.java
index e42574b..faa58c6 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/values/UUIDs.java
@@ -1,6 +1,5 @@
-package org.apache.cassandra.io.util;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,39 +7,33 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
+package org.apache.cassandra.stress.generate.values;
 
+import java.util.UUID;
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import org.apache.cassandra.db.marshal.UUIDType;
 
-public class ByteBufferOutputStream extends OutputStream
+public class UUIDs extends Generator<UUID>
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    public UUIDs(String name, GeneratorConfig config)
     {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
+        super(UUIDType.instance, config, name, UUID.class);
     }
 
     @Override
-    public void write(byte[] b, int off, int len)
+    public UUID generate()
     {
-        buffer.put(b, off, len);
+        return new UUID(identityDistribution.next(), identityDistribution.next());
     }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CQLOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/CQLOperation.java
deleted file mode 100644
index 54737a4..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CQLOperation.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import java.nio.ByteBuffer;
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.SimpleClient;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.thrift.ThriftConversion;
-
-public abstract class CQLOperation extends Operation
-{
-    public CQLOperation(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected abstract void run(CQLQueryExecutor executor) throws IOException;
-
-    protected abstract boolean validateThriftResult(CqlResult result);
-
-    protected abstract boolean validateNativeResult(ResultMessage result);
-
-    public void run(final CassandraClient client) throws IOException
-    {
-        run(new CQLQueryExecutor()
-        {
-            public boolean execute(String cqlQuery, List<String> queryParams) throws Exception
-            {
-                CqlResult result = null;
-                if (session.usePreparedStatements())
-                {
-                    Integer stmntId = getPreparedStatement(client, cqlQuery);
-                    if (session.cqlVersion.startsWith("3"))
-                        result = client.execute_prepared_cql3_query(stmntId, queryParamsAsByteBuffer(queryParams), session.getConsistencyLevel());
-                    else
-                        result = client.execute_prepared_cql_query(stmntId, queryParamsAsByteBuffer(queryParams));
-                }
-                else
-                {
-                    String formattedQuery = formatCqlQuery(cqlQuery, queryParams);
-                    if (session.cqlVersion.startsWith("3"))
-                        result = client.execute_cql3_query(ByteBuffer.wrap(formattedQuery.getBytes()), Compression.NONE, session.getConsistencyLevel());
-                    else
-                        result = client.execute_cql_query(ByteBuffer.wrap(formattedQuery.getBytes()), Compression.NONE);
-                }
-                return validateThriftResult(result);
-            }
-        });
-    }
-
-    public void run(final SimpleClient client) throws IOException
-    {
-        run(new CQLQueryExecutor()
-        {
-            public boolean execute(String cqlQuery, List<String> queryParams) throws Exception
-            {
-                ResultMessage result = null;
-                if (session.usePreparedStatements())
-                {
-                    byte[] stmntId = getPreparedStatement(client, cqlQuery);
-                    result = client.executePrepared(stmntId, queryParamsAsByteBuffer(queryParams), ThriftConversion.fromThrift(session.getConsistencyLevel()));
-                }
-                else
-                {
-                    String formattedQuery = formatCqlQuery(cqlQuery, queryParams);
-                    result = client.execute(formattedQuery, ThriftConversion.fromThrift(session.getConsistencyLevel()));
-                }
-                return validateNativeResult(result);
-            }
-        });
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CounterAdder.java b/tools/stress/src/org/apache/cassandra/stress/operations/CounterAdder.java
deleted file mode 100644
index ab6ae9d..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CounterAdder.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-public class CounterAdder extends Operation
-{
-    public CounterAdder(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        List<CounterColumn> columns = new ArrayList<CounterColumn>();
-        List<CounterSuperColumn> superColumns = new ArrayList<CounterSuperColumn>();
-
-        // format used for keys
-        String format = "%0" + session.getTotalKeysLength() + "d";
-
-        for (int i = 0; i < session.getColumnsPerKey(); i++)
-        {
-            String columnName = ("C" + Integer.toString(i));
-
-            columns.add(new CounterColumn(ByteBufferUtil.bytes(columnName), 1L));
-        }
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            // supers = [SuperColumn('S' + str(j), columns) for j in xrange(supers_per_key)]
-            for (int i = 0; i < session.getSuperColumns(); i++)
-            {
-                String superColumnName = "S" + Integer.toString(i);
-                superColumns.add(new CounterSuperColumn(ByteBuffer.wrap(superColumnName.getBytes()), columns));
-            }
-        }
-
-        String rawKey = String.format(format, index);
-        Map<ByteBuffer, Map<String, List<Mutation>>> record = new HashMap<ByteBuffer, Map<String, List<Mutation>>>();
-
-        record.put(ByteBufferUtil.bytes(rawKey), session.getColumnFamilyType() == ColumnFamilyType.Super
-                                                                                ? getSuperColumnsMutationMap(superColumns)
-                                                                                : getColumnsMutationMap(columns));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                client.batch_mutate(record, session.getConsistencyLevel());
-                success = true;
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error incrementing key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                rawKey,
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    private Map<String, List<Mutation>> getSuperColumnsMutationMap(List<CounterSuperColumn> superColumns)
-    {
-        List<Mutation> mutations = new ArrayList<Mutation>();
-        Map<String, List<Mutation>> mutationMap = new HashMap<String, List<Mutation>>();
-
-        for (CounterSuperColumn s : superColumns)
-        {
-            ColumnOrSuperColumn cosc = new ColumnOrSuperColumn().setCounter_super_column(s);
-            mutations.add(new Mutation().setColumn_or_supercolumn(cosc));
-        }
-
-        mutationMap.put("SuperCounter1", mutations);
-
-        return mutationMap;
-    }
-
-    private Map<String, List<Mutation>> getColumnsMutationMap(List<CounterColumn> columns)
-    {
-        List<Mutation> mutations = new ArrayList<Mutation>();
-        Map<String, List<Mutation>> mutationMap = new HashMap<String, List<Mutation>>();
-
-        for (CounterColumn c : columns)
-        {
-            ColumnOrSuperColumn cosc = new ColumnOrSuperColumn().setCounter_column(c);
-            mutations.add(new Mutation().setColumn_or_supercolumn(cosc));
-        }
-
-        mutationMap.put("Counter1", mutations);
-
-        return mutationMap;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CounterGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/CounterGetter.java
deleted file mode 100644
index 56ef243..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CounterGetter.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-
-public class CounterGetter extends Operation
-{
-    public CounterGetter(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        SliceRange sliceRange = new SliceRange();
-
-        // start/finish
-        sliceRange.setStart(new byte[] {}).setFinish(new byte[] {});
-
-        // reversed/count
-        sliceRange.setReversed(false).setCount(session.getColumnsPerKey());
-
-        // initialize SlicePredicate with existing SliceRange
-        SlicePredicate predicate = new SlicePredicate().setSlice_range(sliceRange);
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            runSuperCounterGetter(predicate, client);
-        }
-        else
-        {
-            runCounterGetter(predicate, client);
-        }
-    }
-
-    private void runSuperCounterGetter(SlicePredicate predicate, Cassandra.Client client) throws IOException
-    {
-        byte[] rawKey = generateKey();
-        ByteBuffer key = ByteBuffer.wrap(rawKey);
-
-        for (int j = 0; j < session.getSuperColumns(); j++)
-        {
-            String superColumn = 'S' + Integer.toString(j);
-            ColumnParent parent = new ColumnParent("SuperCounter1").setSuper_column(superColumn.getBytes());
-
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    List<ColumnOrSuperColumn> counters;
-                    counters = client.get_slice(key, parent, predicate, session.getConsistencyLevel());
-                    success = (counters.size() != 0);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error reading counter key %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    new String(rawKey),
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            session.operations.getAndIncrement();
-            session.keys.getAndIncrement();
-            context.stop();
-        }
-    }
-
-    private void runCounterGetter(SlicePredicate predicate, Cassandra.Client client) throws IOException
-    {
-        ColumnParent parent = new ColumnParent("Counter1");
-
-        byte[] key = generateKey();
-        ByteBuffer keyBuffer = ByteBuffer.wrap(key);
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                List<ColumnOrSuperColumn> counters;
-                counters = client.get_slice(keyBuffer, parent, predicate, session.getConsistencyLevel());
-                success = (counters.size() != 0);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error reading counter key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                new String(key),
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterAdder.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterAdder.java
deleted file mode 100644
index 31e8371..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterAdder.java
+++ /dev/null
@@ -1,122 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class CqlCounterAdder extends CQLOperation
-{
-    private static String cqlQuery = null;
-
-    public CqlCounterAdder(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (cqlQuery == null)
-        {
-            String counterCF = session.cqlVersion.startsWith("2") ? "Counter1" : "Counter3";
-
-            StringBuilder query = new StringBuilder("UPDATE ").append(wrapInQuotesIfRequired(counterCF));
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel());
-
-            query.append(" SET ");
-
-            for (int i = 0; i < session.getColumnsPerKey(); i++)
-            {
-                if (i > 0)
-                    query.append(",");
-
-                query.append('C').append(i).append("=C").append(i).append("+1");
-            }
-            query.append(" WHERE KEY=?");
-            cqlQuery = query.toString();
-        }
-
-        String key = String.format("%0" + session.getTotalKeysLength() + "d", index);
-        List<String> queryParams = Collections.singletonList(getUnQuotedCqlBlob(key, session.cqlVersion.startsWith("3")));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                success = executor.execute(cqlQuery, queryParams);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error incrementing key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                key,
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        return true;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        return true;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterGetter.java
deleted file mode 100644
index a4d037a..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlCounterGetter.java
+++ /dev/null
@@ -1,120 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.thrift.CqlResultType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class CqlCounterGetter extends CQLOperation
-{
-    private static String cqlQuery = null;
-
-    public CqlCounterGetter(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (cqlQuery == null)
-        {
-            StringBuilder query = new StringBuilder("SELECT ");
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append("FIRST ").append(session.getColumnsPerKey()).append(" ''..''");
-            else
-                query.append("*");
-
-            String counterCF = session.cqlVersion.startsWith("2") ? "Counter1" : "Counter3";
-
-            query.append(" FROM ").append(wrapInQuotesIfRequired(counterCF));
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel().toString());
-
-            cqlQuery = query.append(" WHERE KEY=?").toString();
-        }
-
-        byte[] key = generateKey();
-        List<String> queryParams = Collections.singletonList(getUnQuotedCqlBlob(key, session.cqlVersion.startsWith("3")));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                success = executor.execute(cqlQuery, queryParams);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error reading counter key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                new String(key),
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        return result.rows.get(0).columns.size() != 0;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        return result instanceof ResultMessage.Rows && ((ResultMessage.Rows)result).result.size() != 0;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlIndexedRangeSlicer.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlIndexedRangeSlicer.java
deleted file mode 100644
index bf416cc..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlIndexedRangeSlicer.java
+++ /dev/null
@@ -1,179 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.cql3.ResultSet;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.thrift.CqlRow;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class CqlIndexedRangeSlicer extends CQLOperation
-{
-    private static List<ByteBuffer> values = null;
-    private static String cqlQuery = null;
-
-    private int lastQueryResultSize;
-    private int lastMaxKey;
-
-    public CqlIndexedRangeSlicer(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (values == null)
-            values = generateValues();
-
-        if (cqlQuery == null)
-        {
-            StringBuilder query = new StringBuilder("SELECT ");
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(session.getColumnsPerKey()).append(" ''..''");
-            else
-                query.append("*");
-
-            query.append(" FROM Standard1");
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel());
-
-            query.append(" WHERE C1=").append(getUnQuotedCqlBlob(values.get(1).array(), session.cqlVersion.startsWith("3")))
-                 .append(" AND KEY > ? LIMIT ").append(session.getKeysPerCall());
-
-            cqlQuery = query.toString();
-        }
-
-        String format = "%0" + session.getTotalKeysLength() + "d";
-        String startOffset = String.format(format, 0);
-
-        int expectedPerValue = session.getNumKeys() / values.size(), received = 0;
-
-        while (received < expectedPerValue)
-        {
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-            String formattedQuery = null;
-            List<String> queryParms = Collections.singletonList(getUnQuotedCqlBlob(startOffset, session.cqlVersion.startsWith("3")));
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    success = executor.execute(cqlQuery, queryParms);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error executing indexed range query with offset %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    startOffset,
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            received += lastQueryResultSize;
-
-            // convert max key found back to an integer, and increment it
-            startOffset = String.format(format, (1 + lastMaxKey));
-
-            session.operations.getAndIncrement();
-            session.keys.getAndAdd(lastQueryResultSize);
-            context.stop();
-        }
-    }
-
-    /**
-     * Get maximum key from CqlRow list
-     * @param rows list of the CqlRow objects
-     * @return maximum key value of the list
-     */
-    private int getMaxKey(List<CqlRow> rows)
-    {
-        int maxKey = ByteBufferUtil.toInt(rows.get(0).key);
-
-        for (CqlRow row : rows)
-        {
-            int currentKey = ByteBufferUtil.toInt(row.key);
-            if (currentKey > maxKey)
-                maxKey = currentKey;
-        }
-
-        return maxKey;
-    }
-
-    private int getMaxKey(ResultSet rs)
-    {
-        int maxKey = ByteBufferUtil.toInt(rs.rows.get(0).get(0));
-
-        for (List<ByteBuffer> row : rs.rows)
-        {
-            int currentKey = ByteBufferUtil.toInt(row.get(0));
-            if (currentKey > maxKey)
-                maxKey = currentKey;
-        }
-
-        return maxKey;
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        lastQueryResultSize = result.rows.size();
-        lastMaxKey = getMaxKey(result.rows);
-        return lastQueryResultSize != 0;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        assert result instanceof ResultMessage.Rows;
-        lastQueryResultSize = ((ResultMessage.Rows)result).result.size();
-        lastMaxKey = getMaxKey(((ResultMessage.Rows)result).result);
-        return lastQueryResultSize != 0;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlInserter.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlInserter.java
deleted file mode 100644
index 3572c36..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlInserter.java
+++ /dev/null
@@ -1,146 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.SimpleClient;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.utils.UUIDGen;
-
-public class CqlInserter extends CQLOperation
-{
-    private static List<ByteBuffer> values;
-    private static String cqlQuery = null;
-
-    public CqlInserter(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (values == null)
-            values = generateValues();
-
-        // Construct a query string once.
-        if (cqlQuery == null)
-        {
-            StringBuilder query = new StringBuilder("UPDATE ").append(wrapInQuotesIfRequired("Standard1"));
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel().toString());
-
-            query.append(" SET ");
-
-            for (int i = 0; i < session.getColumnsPerKey(); i++)
-            {
-                if (i > 0)
-                    query.append(',');
-
-                if (session.timeUUIDComparator)
-                {
-                    if (session.cqlVersion.startsWith("3"))
-                        throw new UnsupportedOperationException("Cannot use UUIDs in column names with CQL3");
-
-                    query.append(wrapInQuotesIfRequired(UUIDGen.getTimeUUID().toString()))
-                         .append(" = ?");
-                }
-                else
-                {
-                    query.append(wrapInQuotesIfRequired("C" + i)).append(" = ?");
-                }
-            }
-
-            query.append(" WHERE KEY=?");
-            cqlQuery = query.toString();
-        }
-
-        List<String> queryParms = new ArrayList<String>();
-        for (int i = 0; i < session.getColumnsPerKey(); i++)
-        {
-            // Column value
-            queryParms.add(getUnQuotedCqlBlob(values.get(i % values.size()).array(), session.cqlVersion.startsWith("3")));
-        }
-
-        String key = String.format("%0" + session.getTotalKeysLength() + "d", index);
-        queryParms.add(getUnQuotedCqlBlob(key, session.cqlVersion.startsWith("3")));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                success = executor.execute(cqlQuery, queryParms);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error inserting key %s %s%n with query %s",
-                                index,
-                                session.getRetryTimes(),
-                                key,
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")",
-                                cqlQuery));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        return true;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        return true;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlMultiGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlMultiGetter.java
deleted file mode 100644
index ec645d4..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlMultiGetter.java
+++ /dev/null
@@ -1,47 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.SimpleClient;
-
-public class CqlMultiGetter extends Operation
-{
-    public CqlMultiGetter(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        throw new RuntimeException("Multiget is not implemented for CQL");
-    }
-
-    public void run(SimpleClient client) throws IOException
-    {
-        throw new RuntimeException("Multiget is not implemented for CQL");
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlRangeSlicer.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlRangeSlicer.java
deleted file mode 100644
index c01767b..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlRangeSlicer.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.transport.SimpleClient;
-
-public class CqlRangeSlicer extends CQLOperation
-{
-    private static String cqlQuery = null;
-    private int lastRowCount;
-
-    public CqlRangeSlicer(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (cqlQuery == null)
-        {
-            StringBuilder query = new StringBuilder("SELECT FIRST ").append(session.getColumnsPerKey())
-                    .append(" ''..'' FROM Standard1");
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel().toString());
-
-            cqlQuery = query.append(" WHERE KEY > ?").toString();
-        }
-
-        String key = String.format("%0" +  session.getTotalKeysLength() + "d", index);
-        List<String> queryParams = Collections.singletonList(getUnQuotedCqlBlob(key, session.cqlVersion.startsWith("3")));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                success = executor.execute(cqlQuery, queryParams);
-            }
-            catch (Exception e)
-            {
-                System.err.println(e);
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error executing range slice with offset %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                key,
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndAdd(lastRowCount);
-        context.stop();
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        lastRowCount = result.rows.size();
-        return  lastRowCount != 0;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        assert result instanceof ResultMessage.Rows;
-        lastRowCount = ((ResultMessage.Rows)result).result.size();
-        return lastRowCount != 0;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/CqlReader.java b/tools/stress/src/org/apache/cassandra/stress/operations/CqlReader.java
deleted file mode 100644
index 70273c1..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/CqlReader.java
+++ /dev/null
@@ -1,136 +0,0 @@
-package org.apache.cassandra.stress.operations;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.transport.SimpleClient;
-import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlResult;
-import org.apache.cassandra.thrift.ThriftConversion;
-
-public class CqlReader extends CQLOperation
-{
-    private static String cqlQuery = null;
-
-    public CqlReader(Session client, int idx)
-    {
-        super(client, idx);
-    }
-
-    protected void run(CQLQueryExecutor executor) throws IOException
-    {
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-            throw new RuntimeException("Super columns are not implemented for CQL");
-
-        if (cqlQuery == null)
-        {
-            StringBuilder query = new StringBuilder("SELECT ");
-
-            if (session.columnNames == null)
-            {
-                if (session.cqlVersion.startsWith("2"))
-                    query.append("FIRST ").append(session.getColumnsPerKey()).append(" ''..''");
-                else
-                    query.append("*");
-            }
-            else
-            {
-                for (int i = 0; i < session.columnNames.size(); i++)
-                {
-                    if (i > 0) query.append(",");
-                    query.append('?');
-                }
-            }
-
-            query.append(" FROM ").append(wrapInQuotesIfRequired("Standard1"));
-
-            if (session.cqlVersion.startsWith("2"))
-                query.append(" USING CONSISTENCY ").append(session.getConsistencyLevel().toString());
-            query.append(" WHERE KEY=?");
-
-            cqlQuery = query.toString();
-        }
-
-        List<String> queryParams = new ArrayList<String>();
-        if (session.columnNames != null)
-            for (int i = 0; i < session.columnNames.size(); i++)
-                queryParams.add(getUnQuotedCqlBlob(session.columnNames.get(i).array(), session.cqlVersion.startsWith("3")));
-
-        byte[] key = generateKey();
-        queryParams.add(getUnQuotedCqlBlob(key, session.cqlVersion.startsWith("3")));
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                success = executor.execute(cqlQuery, queryParams);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error reading key %s %s%n with query %s",
-                                index,
-                                session.getRetryTimes(),
-                                new String(key),
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")",
-                                cqlQuery));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    protected boolean validateThriftResult(CqlResult result)
-    {
-        return result.rows.get(0).columns.size() != 0;
-    }
-
-    protected boolean validateNativeResult(ResultMessage result)
-    {
-        return result instanceof ResultMessage.Rows && ((ResultMessage.Rows)result).result.size() != 0;
-    }
-}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/operations/FixedOpDistribution.java
similarity index 66%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/operations/FixedOpDistribution.java
index e42574b..3212795 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/FixedOpDistribution.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.operations;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,26 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import org.apache.cassandra.stress.Operation;
 
-public class ByteBufferOutputStream extends OutputStream
+public class FixedOpDistribution implements OpDistribution
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    final Operation operation;
+
+    public FixedOpDistribution(Operation operation)
     {
-        this.buffer = buffer;
+        this.operation = operation;
     }
 
-    public void write(int b)
+    public Operation next()
     {
-        buffer.put((byte) b);
+        return operation;
     }
 
-    @Override
-    public void write(byte[] b, int off, int len)
+    public int maxBatchSize()
     {
-        buffer.put(b, off, len);
+        return (int) operation.partitionCount.maxValue();
     }
+
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/IndexedRangeSlicer.java b/tools/stress/src/org/apache/cassandra/stress/operations/IndexedRangeSlicer.java
deleted file mode 100644
index b7c72a2..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/IndexedRangeSlicer.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.List;
-
-public class IndexedRangeSlicer extends Operation
-{
-    private static List<ByteBuffer> values = null;
-
-    public IndexedRangeSlicer(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        if (values == null)
-            values = generateValues();
-
-        String format = "%0" + session.getTotalKeysLength() + "d";
-        SlicePredicate predicate = new SlicePredicate().setSlice_range(new SliceRange(ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      false, session.getColumnsPerKey()));
-
-        ColumnParent parent = new ColumnParent("Standard1");
-        int expectedPerValue = session.getNumKeys() / values.size();
-
-        ByteBuffer columnName = ByteBufferUtil.bytes("C1");
-
-        int received = 0;
-
-        String startOffset = String.format(format, 0);
-        ByteBuffer value = values.get(1); // only C1 column is indexed
-
-        IndexExpression expression = new IndexExpression(columnName, IndexOperator.EQ, value);
-
-        while (received < expectedPerValue)
-        {
-            IndexClause clause = new IndexClause(Arrays.asList(expression),
-                                                 ByteBufferUtil.bytes(startOffset),
-                                                 session.getKeysPerCall());
-
-            List<KeySlice> results = null;
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    results = client.get_indexed_slices(parent, clause, predicate, session.getConsistencyLevel());
-                    success = (results.size() != 0);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error on calling get_indexed_slices for offset %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    startOffset,
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            received += results.size();
-
-            // convert max key found back to an integer, and increment it
-            startOffset = String.format(format, (1 + getMaxKey(results)));
-
-            session.operations.getAndIncrement();
-            session.keys.getAndAdd(results.size());
-            context.stop();
-        }
-    }
-
-    /**
-     * Get maximum key from keySlice list
-     * @param keySlices list of the KeySlice objects
-     * @return maximum key value of the list
-     */
-    private int getMaxKey(List<KeySlice> keySlices)
-    {
-        byte[] firstKey = keySlices.get(0).getKey();
-        int maxKey = ByteBufferUtil.toInt(ByteBuffer.wrap(firstKey));
-
-        for (KeySlice k : keySlices)
-        {
-            int currentKey = ByteBufferUtil.toInt(ByteBuffer.wrap(k.getKey()));
-
-            if (currentKey > maxKey)
-            {
-                maxKey = currentKey;
-            }
-        }
-
-        return maxKey;
-    }
-
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/Inserter.java b/tools/stress/src/org/apache/cassandra/stress/operations/Inserter.java
deleted file mode 100644
index cbf6b98..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/Inserter.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-public class Inserter extends Operation
-{
-    private static List<ByteBuffer> values;
-
-    public Inserter(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        if (values == null)
-            values = generateValues();
-
-        List<Column> columns = new ArrayList<Column>(session.getColumnsPerKey());
-        List<SuperColumn> superColumns = null;
-
-        // format used for keys
-        String format = "%0" + session.getTotalKeysLength() + "d";
-
-        for (int i = 0; i < session.getColumnsPerKey(); i++)
-        {
-            columns.add(new Column(columnName(i, session.timeUUIDComparator))
-                            .setValue(values.get(i % values.size()))
-                            .setTimestamp(FBUtilities.timestampMicros()));
-        }
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            superColumns = new ArrayList<SuperColumn>();
-            // supers = [SuperColumn('S' + str(j), columns) for j in xrange(supers_per_key)]
-            for (int i = 0; i < session.getSuperColumns(); i++)
-            {
-                String superColumnName = "S" + Integer.toString(i);
-                superColumns.add(new SuperColumn(ByteBufferUtil.bytes(superColumnName), columns));
-            }
-        }
-
-        String rawKey = String.format(format, index);
-        Map<String, List<Mutation>> row = session.getColumnFamilyType() == ColumnFamilyType.Super
-                                        ? getSuperColumnsMutationMap(superColumns)
-                                        : getColumnsMutationMap(columns);
-        Map<ByteBuffer, Map<String, List<Mutation>>> record = Collections.singletonMap(ByteBufferUtil.bytes(rawKey), row);
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                client.batch_mutate(record, session.getConsistencyLevel());
-                success = true;
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error inserting key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                rawKey,
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    private Map<String, List<Mutation>> getSuperColumnsMutationMap(List<SuperColumn> superColumns)
-    {
-        List<Mutation> mutations = new ArrayList<Mutation>(superColumns.size());
-        for (SuperColumn s : superColumns)
-        {
-            ColumnOrSuperColumn superColumn = new ColumnOrSuperColumn().setSuper_column(s);
-            mutations.add(new Mutation().setColumn_or_supercolumn(superColumn));
-        }
-
-        return Collections.singletonMap("Super1", mutations);
-    }
-
-    private Map<String, List<Mutation>> getColumnsMutationMap(List<Column> columns)
-    {
-        List<Mutation> mutations = new ArrayList<Mutation>(columns.size());
-        for (Column c : columns)
-        {
-            ColumnOrSuperColumn column = new ColumnOrSuperColumn().setColumn(c);
-            mutations.add(new Mutation().setColumn_or_supercolumn(column));
-        }
-
-        return Collections.singletonMap("Standard1", mutations);
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/MultiGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/MultiGetter.java
deleted file mode 100644
index 12a39fb..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/MultiGetter.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-
-public class MultiGetter extends Operation
-{
-    public MultiGetter(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        SlicePredicate predicate = new SlicePredicate().setSlice_range(new SliceRange(ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      false, session.getColumnsPerKey()));
-
-        int offset = index * session.getKeysPerThread();
-        Map<ByteBuffer,List<ColumnOrSuperColumn>> results;
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            List<ByteBuffer> keys = generateKeys(offset, offset + session.getKeysPerCall());
-
-            for (int j = 0; j < session.getSuperColumns(); j++)
-            {
-                ColumnParent parent = new ColumnParent("Super1").setSuper_column(ByteBufferUtil.bytes("S" + j));
-
-                TimerContext context = session.latency.time();
-
-                boolean success = false;
-                String exceptionMessage = null;
-
-                for (int t = 0; t < session.getRetryTimes(); t++)
-                {
-                    if (success)
-                        break;
-
-                    try
-                    {
-                        results = client.multiget_slice(keys, parent, predicate, session.getConsistencyLevel());
-                        success = (results.size() != 0);
-                    }
-                    catch (Exception e)
-                    {
-                        exceptionMessage = getExceptionMessage(e);
-                    }
-                }
-
-                if (!success)
-                {
-                    error(String.format("Operation [%d] retried %d times - error on calling multiget_slice for keys %s %s%n",
-                                        index,
-                                        session.getRetryTimes(),
-                                        keys,
-                                        (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-                }
-
-                session.operations.getAndIncrement();
-                session.keys.getAndAdd(keys.size());
-                context.stop();
-
-                offset += session.getKeysPerCall();
-            }
-        }
-        else
-        {
-            ColumnParent parent = new ColumnParent("Standard1");
-
-            List<ByteBuffer> keys = generateKeys(offset, offset + session.getKeysPerCall());
-
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    results = client.multiget_slice(keys, parent, predicate, session.getConsistencyLevel());
-                    success = (results.size() != 0);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error on calling multiget_slice for keys %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    keys,
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            session.operations.getAndIncrement();
-            session.keys.getAndAdd(keys.size());
-            context.stop();
-
-            offset += session.getKeysPerCall();
-        }
-    }
-
-    private List<ByteBuffer> generateKeys(int start, int limit)
-    {
-        List<ByteBuffer> keys = new ArrayList<ByteBuffer>();
-
-        for (int i = start; i < limit; i++)
-        {
-            keys.add(ByteBuffer.wrap(generateKey()));
-        }
-
-        return keys;
-    }
-}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistribution.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/operations/OpDistribution.java
index e42574b..bcbd0bf 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistribution.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.operations;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,12 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import org.apache.cassandra.stress.Operation;
 
-public class ByteBufferOutputStream extends OutputStream
+public interface OpDistribution
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    Operation next();
+    public int maxBatchSize();
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
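
The OpDistribution interface above is what each stress consumer draws work from: next() hands back the Operation to execute, and maxBatchSize() bounds how many partitions a single batch may touch. A minimal sketch of a conforming implementation, assuming a single fixed Operation whose partitionCount distribution exposes maxValue() (the same field the sampled implementation later in this diff relies on), could look like the following; the tool ships its own FixedOpDistribution, and this sketch only illustrates the contract, it is not that class.

package org.apache.cassandra.stress.operations;

import org.apache.cassandra.stress.Operation;

// Illustrative sketch only: a distribution that always yields the same operation.
public class SingleOpDistribution implements OpDistribution
{
    private final Operation operation;

    public SingleOpDistribution(Operation operation)
    {
        this.operation = operation;
    }

    public Operation next()
    {
        return operation;                                  // every draw runs the same operation
    }

    public int maxBatchSize()
    {
        return (int) operation.partitionCount.maxValue();  // assumes Operation exposes a partitionCount distribution
    }
}
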
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java
index e42574b..afbae7d 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.operations;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,13 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import org.apache.cassandra.stress.util.Timer;
 
-public class ByteBufferOutputStream extends OutputStream
+public interface OpDistributionFactory
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    public OpDistribution get(Timer timer);
+    public String desc();
+    Iterable<OpDistributionFactory> each();
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
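
OpDistributionFactory is the per-run counterpart: get(Timer) builds a fresh OpDistribution for each consumer (so sampling state is not shared across threads), desc() labels the mix for reporting, and each() decomposes a mixed workload into one single-operation factory per component. A hedged sketch of how a caller might enumerate those parts (the method and variable names here are illustrative, not the tool's actual driver code):

import org.apache.cassandra.stress.util.Timer;

// Sketch: enumerating the components of a mixed workload. 'mixed' is any
// OpDistributionFactory (for example the sampled factory added later in this
// diff); the Timer is whatever per-thread timer the caller already owns.
static void describe(OpDistributionFactory mixed, Timer timer)
{
    System.out.println("mixed workload: " + mixed.desc());
    for (OpDistributionFactory single : mixed.each())
    {
        OpDistribution dist = single.get(timer);    // one single-operation distribution per component
        System.out.println(single.desc() + " -> max batch size " + dist.maxBatchSize());
    }
}
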
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/RangeSlicer.java b/tools/stress/src/org/apache/cassandra/stress/operations/RangeSlicer.java
deleted file mode 100644
index f9ba115..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/RangeSlicer.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-public class RangeSlicer extends Operation
-{
-
-    public RangeSlicer(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        String format = "%0" + session.getTotalKeysLength() + "d";
-
-        // initial values
-        int count = session.getColumnsPerKey();
-
-        SlicePredicate predicate = new SlicePredicate().setSlice_range(new SliceRange(ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                                                                                      false,
-                                                                                      count));
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            ByteBuffer start = ByteBufferUtil.bytes(String.format(format, index));
-
-            List<KeySlice> slices = new ArrayList<KeySlice>();
-            KeyRange range = new KeyRange(count).setStart_key(start).setEnd_key(ByteBufferUtil.EMPTY_BYTE_BUFFER);
-
-            for (int i = 0; i < session.getSuperColumns(); i++)
-            {
-                String superColumnName = "S" + Integer.toString(i);
-                ColumnParent parent = new ColumnParent("Super1").setSuper_column(ByteBufferUtil.bytes(superColumnName));
-
-                TimerContext context = session.latency.time();
-
-                boolean success = false;
-                String exceptionMessage = null;
-
-                for (int t = 0; t < session.getRetryTimes(); t++)
-                {
-                    try
-                    {
-                        slices = client.get_range_slices(parent, predicate, range, session.getConsistencyLevel());
-                        success = (slices.size() != 0);
-                    }
-                    catch (Exception e)
-                    {
-                        exceptionMessage = getExceptionMessage(e);
-                        success = false;
-                    }
-                }
-
-                if (!success)
-                {
-                    error(String.format("Operation [%d] retried %d times - error on calling get_range_slices for range offset %s %s%n",
-                                        index,
-                                        session.getRetryTimes(),
-                                        ByteBufferUtil.string(start),
-                                        (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-                }
-
-                session.operations.getAndIncrement();
-                context.stop();
-            }
-
-            session.keys.getAndAdd(slices.size());
-        }
-        else
-        {
-            ColumnParent parent = new ColumnParent("Standard1");
-
-            ByteBuffer start = ByteBufferUtil.bytes(String.format(format, index));
-
-            List<KeySlice> slices = new ArrayList<KeySlice>();
-            KeyRange range = new KeyRange(count).setStart_key(start).setEnd_key(ByteBufferUtil.EMPTY_BYTE_BUFFER);
-
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    slices = client.get_range_slices(parent, predicate, range, session.getConsistencyLevel());
-                    success = (slices.size() != 0);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error on calling get_indexed_slices for range offset %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    ByteBufferUtil.string(start),
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            session.operations.getAndIncrement();
-            session.keys.getAndAdd(slices.size());
-            context.stop();
-        }
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/Reader.java b/tools/stress/src/org/apache/cassandra/stress/operations/Reader.java
deleted file mode 100644
index 72d09b4..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/operations/Reader.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.operations;
-
-import com.yammer.metrics.core.TimerContext;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.util.CassandraClient;
-import org.apache.cassandra.stress.util.Operation;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.thrift.*;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-public class Reader extends Operation
-{
-    public Reader(Session client, int index)
-    {
-        super(client, index);
-    }
-
-    public void run(CassandraClient client) throws IOException
-    {
-        // initialize SlicePredicate with existing SliceRange
-        SlicePredicate predicate = new SlicePredicate();
-
-        if (session.columnNames == null)
-            predicate.setSlice_range(getSliceRange());
-        else // see CASSANDRA-3064 about why this is useful
-            predicate.setColumn_names(session.columnNames);
-
-        if (session.getColumnFamilyType() == ColumnFamilyType.Super)
-        {
-            runSuperColumnReader(predicate, client);
-        }
-        else
-        {
-            runColumnReader(predicate, client);
-        }
-    }
-
-    private void runSuperColumnReader(SlicePredicate predicate, Cassandra.Client client) throws IOException
-    {
-        byte[] rawKey = generateKey();
-        ByteBuffer key = ByteBuffer.wrap(rawKey);
-
-        for (int j = 0; j < session.getSuperColumns(); j++)
-        {
-            String superColumn = 'S' + Integer.toString(j);
-            ColumnParent parent = new ColumnParent("Super1").setSuper_column(superColumn.getBytes(UTF_8));
-
-            TimerContext context = session.latency.time();
-
-            boolean success = false;
-            String exceptionMessage = null;
-
-            for (int t = 0; t < session.getRetryTimes(); t++)
-            {
-                if (success)
-                    break;
-
-                try
-                {
-                    List<ColumnOrSuperColumn> columns;
-                    columns = client.get_slice(key, parent, predicate, session.getConsistencyLevel());
-                    success = (columns.size() != 0);
-                }
-                catch (Exception e)
-                {
-                    exceptionMessage = getExceptionMessage(e);
-                    success = false;
-                }
-            }
-
-            if (!success)
-            {
-                error(String.format("Operation [%d] retried %d times - error reading key %s %s%n",
-                                    index,
-                                    session.getRetryTimes(),
-                                    new String(rawKey),
-                                    (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-            }
-
-            session.operations.getAndIncrement();
-            session.keys.getAndIncrement();
-            context.stop();
-        }
-    }
-
-    private void runColumnReader(SlicePredicate predicate, Cassandra.Client client) throws IOException
-    {
-        ColumnParent parent = new ColumnParent("Standard1");
-
-        byte[] key = generateKey();
-        ByteBuffer keyBuffer = ByteBuffer.wrap(key);
-
-        TimerContext context = session.latency.time();
-
-        boolean success = false;
-        String exceptionMessage = null;
-
-        for (int t = 0; t < session.getRetryTimes(); t++)
-        {
-            if (success)
-                break;
-
-            try
-            {
-                List<ColumnOrSuperColumn> columns;
-                columns = client.get_slice(keyBuffer, parent, predicate, session.getConsistencyLevel());
-                success = (columns.size() != 0);
-            }
-            catch (Exception e)
-            {
-                exceptionMessage = getExceptionMessage(e);
-                success = false;
-            }
-        }
-
-        if (!success)
-        {
-            error(String.format("Operation [%d] retried %d times - error reading key %s %s%n",
-                                index,
-                                session.getRetryTimes(),
-                                new String(key),
-                                (exceptionMessage == null) ? "" : "(" + exceptionMessage + ")"));
-        }
-
-        session.operations.getAndIncrement();
-        session.keys.getAndIncrement();
-        context.stop();
-    }
-
-    private SliceRange getSliceRange()
-    {
-        return new SliceRange()
-                    .setStart(new byte[] {})
-                    .setFinish(new byte[] {})
-                    .setReversed(false)
-                    .setCount(session.getColumnsPerKey());
-    }
-}
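
MultiGetter, RangeSlicer and Reader above all share the legacy shape this merge removes: start a TimerContext, retry the Thrift call up to session.getRetryTimes() times, count the operation, and report an error if no attempt returned rows. Reduced to its essentials, with the multiget_slice / get_range_slices / get_slice call stubbed out as a boolean-returning callable, the pattern they repeated is roughly:

import java.util.concurrent.Callable;

// Simplified restatement of the retry loop the deleted operations shared;
// 'attempt' returns true when the call succeeded and produced at least one row.
static boolean retryUntilSuccess(Callable<Boolean> attempt, int retryTimes)
{
    boolean success = false;
    String exceptionMessage = null;
    for (int t = 0; t < retryTimes && !success; t++)
    {
        try
        {
            success = attempt.call();
        }
        catch (Exception e)
        {
            exceptionMessage = e.getMessage();
        }
    }
    if (!success)
        System.err.printf("operation failed after %d attempts %s%n",
                          retryTimes, exceptionMessage == null ? "" : "(" + exceptionMessage + ")");
    return success;
}
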
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistribution.java b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistribution.java
new file mode 100644
index 0000000..0bd64c5
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistribution.java
@@ -0,0 +1,62 @@
+package org.apache.cassandra.stress.operations;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import org.apache.commons.math3.distribution.EnumeratedDistribution;
+import org.apache.commons.math3.util.Pair;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.generate.Distribution;
+
+public class SampledOpDistribution implements OpDistribution
+{
+
+    final EnumeratedDistribution<Operation> operations;
+    final Distribution clustering;
+    private Operation cur;
+    private long remaining;
+
+    public SampledOpDistribution(EnumeratedDistribution<Operation> operations, Distribution clustering)
+    {
+        this.operations = operations;
+        this.clustering = clustering;
+    }
+
+    public int maxBatchSize()
+    {
+        int max = 1;
+        for (Pair<Operation, Double> pair : operations.getPmf())
+            max = Math.max(max, (int) pair.getFirst().partitionCount.maxValue());
+        return max;
+    }
+
+    public Operation next()
+    {
+        while (remaining == 0)
+        {
+            remaining = clustering.next();
+            cur = operations.sample();
+        }
+        remaining--;
+        return cur;
+    }
+}
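
SampledOpDistribution combines two sources of randomness: commons-math3's EnumeratedDistribution picks which operation to run according to the configured ratios, and the clustering Distribution decides how many consecutive next() calls reuse that pick before re-sampling. The weighted-choice half can be exercised in isolation; a small standalone example, using plain strings in place of Operation instances, would be:

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.math3.distribution.EnumeratedDistribution;
import org.apache.commons.math3.util.Pair;

public class SampleDemo
{
    public static void main(String[] args)
    {
        // 80% reads, 20% writes -- the same shape of pmf the sampled factory builds from its ratios map
        List<Pair<String, Double>> pmf = new ArrayList<>();
        pmf.add(new Pair<>("read", 0.8));
        pmf.add(new Pair<>("write", 0.2));

        EnumeratedDistribution<String> ops = new EnumeratedDistribution<>(pmf);

        int reads = 0, writes = 0;
        for (int i = 0; i < 10000; i++)
        {
            if (ops.sample().equals("read")) reads++;
            else writes++;
        }
        System.out.printf("reads=%d writes=%d%n", reads, writes);   // roughly 8000 / 2000
    }
}
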
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java
new file mode 100644
index 0000000..9e1a5e8
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java
@@ -0,0 +1,94 @@
+package org.apache.cassandra.stress.operations;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.math3.distribution.EnumeratedDistribution;
+import org.apache.commons.math3.util.Pair;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.util.Timer;
+
+public abstract class SampledOpDistributionFactory<T> implements OpDistributionFactory
+{
+
+    final Map<T, Double> ratios;
+    final DistributionFactory clustering;
+    protected SampledOpDistributionFactory(Map<T, Double> ratios, DistributionFactory clustering)
+    {
+        this.ratios = ratios;
+        this.clustering = clustering;
+    }
+
+    protected abstract Operation get(Timer timer, PartitionGenerator generator, T key);
+    protected abstract PartitionGenerator newGenerator();
+
+    public OpDistribution get(Timer timer)
+    {
+        PartitionGenerator generator = newGenerator();
+        List<Pair<Operation, Double>> operations = new ArrayList<>();
+        for (Map.Entry<T, Double> ratio : ratios.entrySet())
+            operations.add(new Pair<>(get(timer, generator, ratio.getKey()), ratio.getValue()));
+        return new SampledOpDistribution(new EnumeratedDistribution<>(operations), clustering.get());
+    }
+
+    public String desc()
+    {
+        List<T> keys = new ArrayList<>();
+        for (Map.Entry<T, Double> ratio : ratios.entrySet())
+            keys.add(ratio.getKey());
+        return keys.toString();
+    }
+
+    public Iterable<OpDistributionFactory> each()
+    {
+        List<OpDistributionFactory> out = new ArrayList<>();
+        for (final Map.Entry<T, Double> ratio : ratios.entrySet())
+        {
+            out.add(new OpDistributionFactory()
+            {
+                public OpDistribution get(Timer timer)
+                {
+                    return new FixedOpDistribution(SampledOpDistributionFactory.this.get(timer, newGenerator(), ratio.getKey()));
+                }
+
+                public String desc()
+                {
+                    return ratio.getKey().toString();
+                }
+
+                public Iterable<OpDistributionFactory> each()
+                {
+                    return Collections.<OpDistributionFactory>singleton(this);
+                }
+            });
+        }
+        return out;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterAdder.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterAdder.java
new file mode 100644
index 0000000..b7d1ee7
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterAdder.java
@@ -0,0 +1,89 @@
+package org.apache.cassandra.stress.operations.predefined;
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+
+public class CqlCounterAdder extends CqlOperation<Integer>
+{
+
+    final Distribution counteradd;
+    public CqlCounterAdder(DistributionFactory counteradd, Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.COUNTER_WRITE, timer, generator, settings);
+        this.counteradd = counteradd.get();
+    }
+
+    @Override
+    protected String buildQuery()
+    {
+        String counterCF = isCql2() ? type.table : "Counter3";
+
+        StringBuilder query = new StringBuilder("UPDATE ").append(wrapInQuotesIfRequired(counterCF));
+
+        if (isCql2())
+            query.append(" USING CONSISTENCY ").append(settings.command.consistencyLevel);
+
+        query.append(" SET ");
+
+        // TODO : increment distribution subset of columns
+        for (int i = 0; i < settings.columns.maxColumnsPerKey; i++)
+        {
+            if (i > 0)
+                query.append(",");
+
+            query.append('C').append(i).append("=C").append(i).append("+?");
+        }
+        query.append(" WHERE KEY=?");
+        return query.toString();
+    }
+
+    @Override
+    protected List<Object> getQueryParameters(byte[] key)
+    {
+        final List<Object> list = new ArrayList<>();
+        for (int i = 0; i < settings.columns.maxColumnsPerKey; i++)
+            list.add(counteradd.next());
+        list.add(ByteBuffer.wrap(key));
+        return list;
+    }
+
+    @Override
+    protected CqlRunOp<Integer> buildRunOp(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+    {
+        return new CqlRunOpAlwaysSucceed(client, query, queryId, params, key, 1);
+    }
+
+    public boolean isWrite()
+    {
+        return true;
+    }
+}
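
buildQuery() above produces the same statement text for a given column count; only the bind values change between calls. As a standalone illustration of what the CQL3 path generates, assuming three columns per key purely for the example:

// Standalone restatement of CqlCounterAdder.buildQuery() for the CQL3 path.
public class CounterQueryDemo
{
    public static void main(String[] args)
    {
        int columnsPerKey = 3;
        StringBuilder query = new StringBuilder("UPDATE \"Counter3\" SET ");
        for (int i = 0; i < columnsPerKey; i++)
        {
            if (i > 0)
                query.append(",");
            query.append('C').append(i).append("=C").append(i).append("+?");
        }
        query.append(" WHERE KEY=?");
        // prints: UPDATE "Counter3" SET C0=C0+?,C1=C1+?,C2=C2+? WHERE KEY=?
        System.out.println(query);
    }
}
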
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterGetter.java
new file mode 100644
index 0000000..94c8faf
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlCounterGetter.java
@@ -0,0 +1,74 @@
+package org.apache.cassandra.stress.operations.predefined;
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+
+public class CqlCounterGetter extends CqlOperation<Integer>
+{
+
+    public CqlCounterGetter(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.COUNTER_READ, timer, generator, settings);
+    }
+
+    @Override
+    protected List<Object> getQueryParameters(byte[] key)
+    {
+        return Collections.<Object>singletonList(ByteBuffer.wrap(key));
+    }
+
+    @Override
+    protected String buildQuery()
+    {
+        StringBuilder query = new StringBuilder("SELECT ");
+
+        // TODO: obey slice/noslice option (instead of always slicing)
+        if (isCql2())
+            query.append("FIRST ").append(settings.columns.maxColumnsPerKey).append(" ''..''");
+        else
+            query.append("*");
+
+        String counterCF = isCql2() ? type.table : "Counter3";
+
+        query.append(" FROM ").append(wrapInQuotesIfRequired(counterCF));
+
+        if (isCql2())
+            query.append(" USING CONSISTENCY ").append(settings.command.consistencyLevel);
+
+        return query.append(" WHERE KEY=?").toString();
+    }
+
+    @Override
+    protected CqlRunOp<Integer> buildRunOp(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+    {
+        return new CqlRunOpTestNonEmpty(client, query, queryId, params, key);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlInserter.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlInserter.java
new file mode 100644
index 0000000..622eb14
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlInserter.java
@@ -0,0 +1,84 @@
+package org.apache.cassandra.stress.operations.predefined;
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.utils.UUIDGen;
+
+public class CqlInserter extends CqlOperation<Integer>
+{
+
+    public CqlInserter(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.WRITE, timer, generator, settings);
+    }
+
+    @Override
+    protected String buildQuery()
+    {
+        StringBuilder query = new StringBuilder("UPDATE ").append(wrapInQuotesIfRequired(type.table));
+
+        if (isCql2())
+            query.append(" USING CONSISTENCY ").append(settings.command.consistencyLevel);
+
+        query.append(" SET ");
+
+        for (int i = 0 ; i < settings.columns.maxColumnsPerKey ; i++)
+        {
+            if (i > 0)
+                query.append(',');
+
+            query.append(wrapInQuotesIfRequired(settings.columns.namestrs.get(i))).append(" = ?");
+        }
+
+        query.append(" WHERE KEY=?");
+        return query.toString();
+    }
+
+    @Override
+    protected List<Object> getQueryParameters(byte[] key)
+    {
+        final ArrayList<Object> queryParams = new ArrayList<>();
+        List<ByteBuffer> values = getColumnValues();
+        queryParams.addAll(values);
+        queryParams.add(ByteBuffer.wrap(key));
+        return queryParams;
+    }
+
+    @Override
+    protected CqlRunOp<Integer> buildRunOp(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+    {
+        return new CqlRunOpAlwaysSucceed(client, query, queryId, params, key, 1);
+    }
+
+    public boolean isWrite()
+    {
+        return true;
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlOperation.java
new file mode 100644
index 0000000..0264cd1
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlOperation.java
@@ -0,0 +1,714 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.google.common.base.Function;
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.StressMetrics;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.ConnectionStyle;
+import org.apache.cassandra.stress.settings.CqlVersion;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.Compression;
+import org.apache.cassandra.thrift.CqlResult;
+import org.apache.cassandra.thrift.CqlRow;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.cassandra.transport.SimpleClient;
+import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.thrift.TException;
+
+public abstract class CqlOperation<V> extends PredefinedOperation
+{
+
+    protected abstract List<Object> getQueryParameters(byte[] key);
+    protected abstract String buildQuery();
+    protected abstract CqlRunOp<V> buildRunOp(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key);
+
+    public CqlOperation(Command type, Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(type, timer, generator, settings);
+        if (settings.columns.variableColumnCount)
+            throw new IllegalStateException("Variable column counts are not implemented for CQL");
+    }
+
+    protected CqlRunOp<V> run(final ClientWrapper client, final List<Object> queryParams, final ByteBuffer key) throws IOException
+    {
+        final CqlRunOp<V> op;
+        if (settings.mode.style == ConnectionStyle.CQL_PREPARED)
+        {
+            final Object id;
+            Object idobj = getCqlCache();
+            if (idobj == null)
+            {
+                try
+                {
+                    id = client.createPreparedStatement(buildQuery());
+                } catch (TException e)
+                {
+                    throw new RuntimeException(e);
+                }
+                storeCqlCache(id);
+            }
+            else
+                id = idobj;
+
+            op = buildRunOp(client, null, id, queryParams, key);
+        }
+        else
+        {
+            final String query;
+            Object qobj = getCqlCache();
+            if (qobj == null)
+                storeCqlCache(query = buildQuery());
+            else
+                query = qobj.toString();
+
+            op = buildRunOp(client, query, null, queryParams, key);
+        }
+
+        timeWithRetry(op);
+        return op;
+    }
+
+    protected void run(final ClientWrapper client) throws IOException
+    {
+        final byte[] key = getKey().array();
+        final List<Object> queryParams = getQueryParameters(key);
+        run(client, queryParams, ByteBuffer.wrap(key));
+    }
+
+    // Classes to process Cql results
+
+    // Always succeeds so long as the query executes without error; provides a keyCount to increment on instantiation
+    protected final class CqlRunOpAlwaysSucceed extends CqlRunOp<Integer>
+    {
+
+        final int keyCount;
+
+        protected CqlRunOpAlwaysSucceed(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key, int keyCount)
+        {
+            super(client, query, queryId, RowCountHandler.INSTANCE, params, key);
+            this.keyCount = keyCount;
+        }
+
+        @Override
+        public boolean validate(Integer result)
+        {
+            return true;
+        }
+
+        @Override
+        public int partitionCount()
+        {
+            return keyCount;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return keyCount;
+        }
+    }
+
+    // Succeeds so long as the result set is nonempty, and the query executes without error
+    protected final class CqlRunOpTestNonEmpty extends CqlRunOp<Integer>
+    {
+
+        protected CqlRunOpTestNonEmpty(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+        {
+            super(client, query, queryId, RowCountHandler.INSTANCE, params, key);
+        }
+
+        @Override
+        public boolean validate(Integer result)
+        {
+            return result > 0;
+        }
+
+        @Override
+        public int partitionCount()
+        {
+            return result;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return result;
+        }
+    }
+
+    // Requires a custom validate() method, but fetches and stores the keys from the result set for further processing
+    protected abstract class CqlRunOpFetchKeys extends CqlRunOp<byte[][]>
+    {
+
+        protected CqlRunOpFetchKeys(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+        {
+            super(client, query, queryId, KeysHandler.INSTANCE, params, key);
+        }
+
+        @Override
+        public int partitionCount()
+        {
+            return result.length;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return result.length;
+        }
+    }
+
+    protected final class CqlRunOpMatchResults extends CqlRunOp<ByteBuffer[][]>
+    {
+
+        final List<List<ByteBuffer>> expect;
+
+        // a null value for an item in expect means we just check the row is present
+        protected CqlRunOpMatchResults(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key, List<List<ByteBuffer>> expect)
+        {
+            super(client, query, queryId, RowsHandler.INSTANCE, params, key);
+            this.expect = expect;
+        }
+
+        @Override
+        public int partitionCount()
+        {
+            return result == null ? 0 : result.length;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return result == null ? 0 : result.length;
+        }
+
+        public boolean validate(ByteBuffer[][] result)
+        {
+            if (result.length != expect.size())
+                return false;
+            for (int i = 0 ; i < result.length ; i++)
+                if (expect.get(i) != null && !expect.get(i).equals(Arrays.asList(result[i])))
+                    return false;
+            return true;
+        }
+    }
+
+    // Cql
+    protected abstract class CqlRunOp<V> implements RunOp
+    {
+
+        final ClientWrapper client;
+        final String query;
+        final Object queryId;
+        final List<Object> params;
+        final ByteBuffer key;
+        final ResultHandler<V> handler;
+        V result;
+
+        private CqlRunOp(ClientWrapper client, String query, Object queryId, ResultHandler<V> handler, List<Object> params, ByteBuffer key)
+        {
+            this.client = client;
+            this.query = query;
+            this.queryId = queryId;
+            this.handler = handler;
+            this.params = params;
+            this.key = key;
+        }
+
+        @Override
+        public boolean run() throws Exception
+        {
+            return queryId != null
+            ? validate(result = client.execute(queryId, key, params, handler))
+            : validate(result = client.execute(query, key, params, handler));
+        }
+
+        public abstract boolean validate(V result);
+
+    }
+
+
+    /// LOTS OF WRAPPING/UNWRAPPING NONSENSE
+
+
+    @Override
+    public void run(final ThriftClient client) throws IOException
+    {
+        run(wrap(client));
+    }
+
+    @Override
+    public void run(SimpleClient client) throws IOException
+    {
+        run(wrap(client));
+    }
+
+    @Override
+    public void run(JavaDriverClient client) throws IOException
+    {
+        run(wrap(client));
+    }
+
+    public ClientWrapper wrap(ThriftClient client)
+    {
+        return isCql3()
+                ? new Cql3CassandraClientWrapper(client)
+                : new Cql2CassandraClientWrapper(client);
+
+    }
+
+    public ClientWrapper wrap(JavaDriverClient client)
+    {
+        return new JavaDriverWrapper(client);
+    }
+
+    public ClientWrapper wrap(SimpleClient client)
+    {
+        return new SimpleClientWrapper(client);
+    }
+
+    protected interface ClientWrapper
+    {
+        Object createPreparedStatement(String cqlQuery) throws TException;
+        <V> V execute(Object preparedStatementId, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException;
+        <V> V execute(String query, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException;
+    }
+
+    private final class JavaDriverWrapper implements ClientWrapper
+    {
+        final JavaDriverClient client;
+        private JavaDriverWrapper(JavaDriverClient client)
+        {
+            this.client = client;
+        }
+
+        @Override
+        public <V> V execute(String query, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler)
+        {
+            String formattedQuery = formatCqlQuery(query, queryParams, isCql3());
+            return handler.javaDriverHandler().apply(client.execute(formattedQuery, ThriftConversion.fromThrift(settings.command.consistencyLevel)));
+        }
+
+        @Override
+        public <V> V execute(Object preparedStatementId, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler)
+        {
+            return handler.javaDriverHandler().apply(
+                    client.executePrepared(
+                            (PreparedStatement) preparedStatementId,
+                            queryParams,
+                            ThriftConversion.fromThrift(settings.command.consistencyLevel)));
+        }
+
+        @Override
+        public Object createPreparedStatement(String cqlQuery)
+        {
+            return client.prepare(cqlQuery);
+        }
+    }
+
+    private final class SimpleClientWrapper implements ClientWrapper
+    {
+        final SimpleClient client;
+        private SimpleClientWrapper(SimpleClient client)
+        {
+            this.client = client;
+        }
+
+        @Override
+        public <V> V execute(String query, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler)
+        {
+            String formattedQuery = formatCqlQuery(query, queryParams, isCql3());
+            return handler.thriftHandler().apply(client.execute(formattedQuery, ThriftConversion.fromThrift(settings.command.consistencyLevel)));
+        }
+
+        @Override
+        public <V> V execute(Object preparedStatementId, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler)
+        {
+            return handler.thriftHandler().apply(
+                    client.executePrepared(
+                            (byte[]) preparedStatementId,
+                            toByteBufferParams(queryParams),
+                            ThriftConversion.fromThrift(settings.command.consistencyLevel)));
+        }
+
+        @Override
+        public Object createPreparedStatement(String cqlQuery)
+        {
+            return client.prepare(cqlQuery).statementId.bytes;
+        }
+    }
+
+    // client wrapper for Cql3
+    private final class Cql3CassandraClientWrapper implements ClientWrapper
+    {
+        final ThriftClient client;
+        private Cql3CassandraClientWrapper(ThriftClient client)
+        {
+            this.client = client;
+        }
+
+        @Override
+        public <V> V execute(String query, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException
+        {
+            String formattedQuery = formatCqlQuery(query, queryParams, true);
+            return handler.simpleNativeHandler().apply(
+                    client.execute_cql3_query(formattedQuery, key, Compression.NONE, settings.command.consistencyLevel)
+            );
+        }
+
+        @Override
+        public <V> V execute(Object preparedStatementId, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException
+        {
+            Integer id = (Integer) preparedStatementId;
+            return handler.simpleNativeHandler().apply(
+                    client.execute_prepared_cql3_query(id, key, toByteBufferParams(queryParams), settings.command.consistencyLevel)
+            );
+        }
+
+        @Override
+        public Object createPreparedStatement(String cqlQuery) throws TException
+        {
+            return client.prepare_cql3_query(cqlQuery, Compression.NONE);
+        }
+    }
+
+    // client wrapper for Cql2
+    private final class Cql2CassandraClientWrapper implements ClientWrapper
+    {
+        final ThriftClient client;
+        private Cql2CassandraClientWrapper(ThriftClient client)
+        {
+            this.client = client;
+        }
+
+        @Override
+        public <V> V execute(String query, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException
+        {
+            String formattedQuery = formatCqlQuery(query, queryParams, false);
+            return handler.simpleNativeHandler().apply(
+                    client.execute_cql_query(formattedQuery, key, Compression.NONE)
+            );
+        }
+
+        @Override
+        public <V> V execute(Object preparedStatementId, ByteBuffer key, List<Object> queryParams, ResultHandler<V> handler) throws TException
+        {
+            Integer id = (Integer) preparedStatementId;
+            return handler.simpleNativeHandler().apply(
+                    client.execute_prepared_cql_query(id, key, toByteBufferParams(queryParams))
+            );
+        }
+
+        @Override
+        public Object createPreparedStatement(String cqlQuery) throws TException
+        {
+            return client.prepare_cql_query(cqlQuery, Compression.NONE);
+        }
+    }
+
+    // interface for building functions to standardise results from each client
+    protected static interface ResultHandler<V>
+    {
+        Function<ResultSet, V> javaDriverHandler();
+        Function<ResultMessage, V> thriftHandler();
+        Function<CqlResult, V> simpleNativeHandler();
+    }
+
+    protected static class RowCountHandler implements ResultHandler<Integer>
+    {
+        static final RowCountHandler INSTANCE = new RowCountHandler();
+
+        @Override
+        public Function<ResultSet, Integer> javaDriverHandler()
+        {
+            return new Function<ResultSet, Integer>()
+            {
+                @Override
+                public Integer apply(ResultSet rows)
+                {
+                    if (rows == null)
+                        return 0;
+                    return rows.all().size();
+                }
+            };
+        }
+
+        @Override
+        public Function<ResultMessage, Integer> thriftHandler()
+        {
+            return new Function<ResultMessage, Integer>()
+            {
+                @Override
+                public Integer apply(ResultMessage result)
+                {
+                    return result instanceof ResultMessage.Rows ? ((ResultMessage.Rows) result).result.size() : 0;
+                }
+            };
+        }
+
+        @Override
+        public Function<CqlResult, Integer> simpleNativeHandler()
+        {
+            return new Function<CqlResult, Integer>()
+            {
+
+                @Override
+                public Integer apply(CqlResult result)
+                {
+                    switch (result.getType())
+                    {
+                        case ROWS:
+                            return result.getRows().size();
+                        default:
+                            return 1;
+                    }
+                }
+            };
+        }
+
+    }
+
+    // Processes results from each client into a two-dimensional array of the column values returned for each row
+    protected static final class RowsHandler implements ResultHandler<ByteBuffer[][]>
+    {
+        static final RowsHandler INSTANCE = new RowsHandler();
+
+        @Override
+        public Function<ResultSet, ByteBuffer[][]> javaDriverHandler()
+        {
+            return new Function<ResultSet, ByteBuffer[][]>()
+            {
+
+                @Override
+                public ByteBuffer[][] apply(ResultSet result)
+                {
+                    if (result == null)
+                        return new ByteBuffer[0][];
+                    List<Row> rows = result.all();
+
+                    ByteBuffer[][] r = new ByteBuffer[rows.size()][];
+                    for (int i = 0 ; i < r.length ; i++)
+                    {
+                        Row row = rows.get(i);
+                        r[i] = new ByteBuffer[row.getColumnDefinitions().size()];
+                        for (int j = 0 ; j < row.getColumnDefinitions().size() ; j++)
+                            r[i][j] = row.getBytes(j);
+                    }
+                    return r;
+                }
+            };
+        }
+
+        @Override
+        public Function<ResultMessage, ByteBuffer[][]> thriftHandler()
+        {
+            return new Function<ResultMessage, ByteBuffer[][]>()
+            {
+
+                @Override
+                public ByteBuffer[][] apply(ResultMessage result)
+                {
+                    if (!(result instanceof ResultMessage.Rows))
+                        return new ByteBuffer[0][];
+
+                    ResultMessage.Rows rows = ((ResultMessage.Rows) result);
+                    ByteBuffer[][] r = new ByteBuffer[rows.result.size()][];
+                    for (int i = 0 ; i < r.length ; i++)
+                    {
+                        List<ByteBuffer> row = rows.result.rows.get(i);
+                        r[i] = new ByteBuffer[row.size()];
+                        for (int j = 0 ; j < row.size() ; j++)
+                            r[i][j] = row.get(j);
+                    }
+                    return r;
+                }
+            };
+        }
+
+        @Override
+        public Function<CqlResult, ByteBuffer[][]> simpleNativeHandler()
+        {
+            return new Function<CqlResult, ByteBuffer[][]>()
+            {
+
+                @Override
+                public ByteBuffer[][] apply(CqlResult result)
+                {
+                    ByteBuffer[][] r = new ByteBuffer[result.getRows().size()][];
+                    for (int i = 0 ; i < r.length ; i++)
+                    {
+                        CqlRow row = result.getRows().get(i);
+                        r[i] = new ByteBuffer[row.getColumns().size()];
+                        for (int j = 0 ; j < r[i].length ; j++)
+                            r[i][j] = ByteBuffer.wrap(row.getColumns().get(j).getValue());
+                    }
+                    return r;
+                }
+            };
+        }
+
+    }
+    // Processes results from each client into an array of all key bytes returned
+    protected static final class KeysHandler implements ResultHandler<byte[][]>
+    {
+        static final KeysHandler INSTANCE = new KeysHandler();
+
+        @Override
+        public Function<ResultSet, byte[][]> javaDriverHandler()
+        {
+            return new Function<ResultSet, byte[][]>()
+            {
+
+                @Override
+                public byte[][] apply(ResultSet result)
+                {
+
+                    if (result == null)
+                        return new byte[0][];
+                    List<Row> rows = result.all();
+                    byte[][] r = new byte[rows.size()][];
+                    for (int i = 0 ; i < r.length ; i++)
+                        r[i] = rows.get(i).getBytes(0).array();
+                    return r;
+                }
+            };
+        }
+
+        @Override
+        public Function<ResultMessage, byte[][]> thriftHandler()
+        {
+            return new Function<ResultMessage, byte[][]>()
+            {
+
+                @Override
+                public byte[][] apply(ResultMessage result)
+                {
+                    if (result instanceof ResultMessage.Rows)
+                    {
+                        ResultMessage.Rows rows = ((ResultMessage.Rows) result);
+                        byte[][] r = new byte[rows.result.size()][];
+                        for (int i = 0 ; i < r.length ; i++)
+                            r[i] = rows.result.rows.get(i).get(0).array();
+                        return r;
+                    }
+                    return null;
+                }
+            };
+        }
+
+        @Override
+        public Function<CqlResult, byte[][]> simpleNativeHandler()
+        {
+            return new Function<CqlResult, byte[][]>()
+            {
+
+                @Override
+                public byte[][] apply(CqlResult result)
+                {
+                    byte[][] r = new byte[result.getRows().size()][];
+                    for (int i = 0 ; i < r.length ; i++)
+                        r[i] = result.getRows().get(i).getKey();
+                    return r;
+                }
+            };
+        }
+
+    }
+
+    private static String getUnQuotedCqlBlob(ByteBuffer term, boolean isCQL3)
+    {
+        return isCQL3
+                ? "0x" + ByteBufferUtil.bytesToHex(term)
+                : ByteBufferUtil.bytesToHex(term);
+    }
+
+    /**
+     * Constructs a CQL query string by replacing each instance of the
+     * '?' placeholder character with the corresponding parameter value.
+     *
+     * @param query base query string to format
+     * @param parms query parameter values (ByteBuffer or Long) to inline as literals
+     * @return formatted CQL query string
+     */
+    private static String formatCqlQuery(String query, List<Object> parms, boolean isCql3)
+    {
+        int marker, position = 0;
+        StringBuilder result = new StringBuilder();
+
+        if (-1 == (marker = query.indexOf('?')) || parms.size() == 0)
+            return query;
+
+        for (Object parm : parms)
+        {
+            result.append(query.substring(position, marker));
+            if (parm instanceof ByteBuffer)
+                result.append(getUnQuotedCqlBlob((ByteBuffer) parm, isCql3));
+            else if (parm instanceof Long)
+                result.append(parm.toString());
+            else throw new AssertionError();
+
+            position = marker + 1;
+            if (-1 == (marker = query.indexOf('?', position)))
+                break;
+        }
+
+        if (position < query.length())
+            result.append(query.substring(position));
+
+        return result.toString();
+    }
+
+    private static List<ByteBuffer> toByteBufferParams(List<Object> params)
+    {
+        List<ByteBuffer> r = new ArrayList<>();
+        for (Object param : params)
+        {
+            if (param instanceof ByteBuffer)
+                r.add((ByteBuffer) param);
+            else if (param instanceof Long)
+                r.add(ByteBufferUtil.bytes((Long) param));
+            else throw new AssertionError();
+        }
+        return r;
+    }
+
+    protected String wrapInQuotesIfRequired(String string)
+    {
+        return settings.mode.cqlVersion == CqlVersion.CQL3
+                ? "\"" + string + "\""
+                : string;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlReader.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlReader.java
new file mode 100644
index 0000000..3a7f75a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/CqlReader.java
@@ -0,0 +1,87 @@
+package org.apache.cassandra.stress.operations.predefined;
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class CqlReader extends CqlOperation<ByteBuffer[][]>
+{
+
+    public CqlReader(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.READ, timer, generator, settings);
+    }
+
+    @Override
+    protected String buildQuery()
+    {
+        StringBuilder query = new StringBuilder("SELECT ");
+
+        if (settings.columns.slice)
+        {
+            if (isCql2())
+                query.append("FIRST ").append(settings.columns.maxColumnsPerKey).append(" ''..''");
+            else
+                query.append("*");
+        }
+        else
+        {
+            for (int i = 0; i < settings.columns.maxColumnsPerKey ; i++)
+            {
+                if (i > 0)
+                    query.append(",");
+                query.append(wrapInQuotesIfRequired(settings.columns.namestrs.get(i)));
+            }
+        }
+
+        query.append(" FROM ").append(wrapInQuotesIfRequired(type.table));
+
+        if (isCql2())
+            query.append(" USING CONSISTENCY ").append(settings.command.consistencyLevel);
+        query.append(" WHERE KEY=?");
+        return query.toString();
+    }
+
+    @Override
+    protected List<Object> getQueryParameters(byte[] key)
+    {
+        return Collections.<Object>singletonList(ByteBuffer.wrap(key));
+    }
+
+    @Override
+    protected CqlRunOp<ByteBuffer[][]> buildRunOp(ClientWrapper client, String query, Object queryId, List<Object> params, ByteBuffer key)
+    {
+        List<ByteBuffer> expectRow = getColumnValues();
+        return new CqlRunOpMatchResults(client, query, queryId, params, key, Arrays.asList(expectRow));
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java
new file mode 100644
index 0000000..dba2e51
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.StressMetrics;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.DistributionFixed;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.CqlVersion;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.ColumnParent;
+import org.apache.cassandra.thrift.SlicePredicate;
+import org.apache.cassandra.thrift.SliceRange;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public abstract class PredefinedOperation extends Operation
+{
+    public final Command type;
+    private final Distribution columnCount;
+    private Object cqlCache;
+
+    public PredefinedOperation(Command type, Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(timer, generator, settings, new DistributionFixed(1));
+        this.type = type;
+        this.columnCount = settings.columns.countDistribution.get();
+    }
+
+    public boolean isCql3()
+    {
+        return settings.mode.cqlVersion == CqlVersion.CQL3;
+    }
+    public boolean isCql2()
+    {
+        return settings.mode.cqlVersion == CqlVersion.CQL2;
+    }
+    public Object getCqlCache()
+    {
+        return cqlCache;
+    }
+    public void storeCqlCache(Object val)
+    {
+        cqlCache = val;
+    }
+
+    protected ByteBuffer getKey()
+    {
+        return (ByteBuffer) partitions.get(0).getPartitionKey(0);
+    }
+
+    final class ColumnSelection
+    {
+        final int[] indices;
+        final int lb, ub;
+        private ColumnSelection(int[] indices, int lb, int ub)
+        {
+            this.indices = indices;
+            this.lb = lb;
+            this.ub = ub;
+        }
+
+        public <V> List<V> select(List<V> in)
+        {
+            List<V> out = new ArrayList<>();
+            if (indices != null)
+            {
+                for (int i : indices)
+                    out.add(in.get(i));
+            }
+            else
+            {
+                out.addAll(in.subList(lb, ub));
+            }
+            return out;
+        }
+
+        int count()
+        {
+            return indices != null ? indices.length : ub - lb;
+        }
+
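+        // Builds the Thrift predicate for this selection: a contiguous slice range when no
+        // explicit indices were chosen, otherwise the explicitly selected column names.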
+        SlicePredicate predicate()
+        {
+            final SlicePredicate predicate = new SlicePredicate();
+            if (indices == null)
+            {
+                predicate.setSlice_range(new SliceRange()
+                                         .setStart(settings.columns.names.get(lb))
+                                         .setFinish(new byte[] {})
+                                         .setReversed(false)
+                                         .setCount(count())
+                );
+            }
+            else
+                predicate.setColumn_names(select(settings.columns.names));
+            return predicate;
+
+        }
+    }
+
+    public String toString()
+    {
+        return type.toString();
+    }
+
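+    // Picks the columns this operation will touch: a random contiguous slice when slicing
+    // is enabled, otherwise a random scattering of distinct column indices.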
+    ColumnSelection select()
+    {
+        if (settings.columns.slice)
+        {
+            int count = (int) columnCount.next();
+            int start;
+            if (count == settings.columns.maxColumnsPerKey)
+                start = 0;
+            else
+                start = 1 + ThreadLocalRandom.current().nextInt(settings.columns.maxColumnsPerKey - count);
+            return new ColumnSelection(null, start, start + count);
+        }
+
+        int count = (int) columnCount.next();
+        int totalCount = settings.columns.names.size();
+        if (count == settings.columns.names.size())
+            return new ColumnSelection(null, 0, count);
+        ThreadLocalRandom rnd = ThreadLocalRandom.current();
+        int[] indices = new int[count];
+        int c = 0, o = 0;
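+        // spread the chosen indices randomly across the column range; o accumulates the
+        // number of columns skipped so far, so each selected index is (skipped + chosen)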
+        while (c < count && count + o < totalCount)
+        {
+            int leeway = totalCount - (count + o);
+            int spreadover = count - c;
+            o += Math.round(rnd.nextDouble() * (leeway / (double) spreadover));
+            indices[c] = o + c;
+            c++;
+        }
+        while (c < count)
+        {
+            indices[c] = o + c;
+            c++;
+        }
+        return new ColumnSelection(indices, 0, 0);
+    }
+
+    protected List<ByteBuffer> getColumnValues()
+    {
+        return getColumnValues(new ColumnSelection(null, 0, settings.columns.names.size()));
+    }
+
+    protected List<ByteBuffer> getColumnValues(ColumnSelection columns)
+    {
+        Row row = partitions.get(0).iterator(1, false).next().iterator().next();
+        ByteBuffer[] r = new ByteBuffer[columns.count()];
+        int c = 0;
+        if (columns.indices != null)
+            for (int i : columns.indices)
+                r[c++] = (ByteBuffer) row.get(i);
+        else
+            for (int i = columns.lb ; i < columns.ub ; i++)
+                r[c++] = (ByteBuffer) row.get(i);
+        return Arrays.asList(r);
+    }
+
+    public static Operation operation(Command type, Timer timer, PartitionGenerator generator, StressSettings settings, DistributionFactory counteradd)
+    {
+        switch (type)
+        {
+            case READ:
+                switch(settings.mode.style)
+                {
+                    case THRIFT:
+                        return new ThriftReader(timer, generator, settings);
+                    case CQL:
+                    case CQL_PREPARED:
+                        return new CqlReader(timer, generator, settings);
+                    default:
+                        throw new UnsupportedOperationException();
+                }
+
+
+            case COUNTER_READ:
+                switch(settings.mode.style)
+                {
+                    case THRIFT:
+                        return new ThriftCounterGetter(timer, generator, settings);
+                    case CQL:
+                    case CQL_PREPARED:
+                        return new CqlCounterGetter(timer, generator, settings);
+                    default:
+                        throw new UnsupportedOperationException();
+                }
+
+            case WRITE:
+
+                switch(settings.mode.style)
+                {
+                    case THRIFT:
+                        return new ThriftInserter(timer, generator, settings);
+                    case CQL:
+                    case CQL_PREPARED:
+                        return new CqlInserter(timer, generator, settings);
+                    default:
+                        throw new UnsupportedOperationException();
+                }
+
+            case COUNTER_WRITE:
+                switch(settings.mode.style)
+                {
+                    case THRIFT:
+                        return new ThriftCounterAdder(counteradd, timer, generator, settings);
+                    case CQL:
+                    case CQL_PREPARED:
+                        return new CqlCounterAdder(counteradd, timer, generator, settings);
+                    default:
+                        throw new UnsupportedOperationException();
+                }
+
+        }
+
+        throw new UnsupportedOperationException();
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterAdder.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterAdder.java
new file mode 100644
index 0000000..4ee42e9
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterAdder.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.*;
+
+public class ThriftCounterAdder extends PredefinedOperation
+{
+
+    final Distribution counteradd;
+    public ThriftCounterAdder(DistributionFactory counteradd, Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.COUNTER_WRITE, timer, generator, settings);
+        this.counteradd = counteradd.get();
+    }
+
+    public boolean isWrite()
+    {
+        return true;
+    }
+
+    public void run(final ThriftClient client) throws IOException
+    {
+        List<CounterColumn> columns = new ArrayList<>();
+        for (ByteBuffer name : select().select(settings.columns.names))
+            columns.add(new CounterColumn(name, counteradd.next()));
+
+        List<Mutation> mutations = new ArrayList<>(columns.size());
+        for (CounterColumn c : columns)
+        {
+            ColumnOrSuperColumn cosc = new ColumnOrSuperColumn().setCounter_column(c);
+            mutations.add(new Mutation().setColumn_or_supercolumn(cosc));
+        }
+        Map<String, List<Mutation>> row = Collections.singletonMap(type.table, mutations);
+
+        final ByteBuffer key = getKey();
+        final Map<ByteBuffer, Map<String, List<Mutation>>> record = Collections.singletonMap(key, row);
+
+        timeWithRetry(new RunOp()
+        {
+            @Override
+            public boolean run() throws Exception
+            {
+                client.batch_mutate(record, settings.command.consistencyLevel);
+                return true;
+            }
+
+            @Override
+            public int partitionCount()
+            {
+                return 1;
+            }
+
+            @Override
+            public int rowCount()
+            {
+                return 1;
+            }
+        });
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterGetter.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterGetter.java
new file mode 100644
index 0000000..10c6aab
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftCounterGetter.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.ColumnParent;
+import org.apache.cassandra.thrift.SlicePredicate;
+
+public class ThriftCounterGetter extends PredefinedOperation
+{
+    public ThriftCounterGetter(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.COUNTER_READ, timer, generator, settings);
+    }
+
+    public void run(final ThriftClient client) throws IOException
+    {
+        final SlicePredicate predicate = select().predicate();
+        final ByteBuffer key = getKey();
+        timeWithRetry(new RunOp()
+        {
+            @Override
+            public boolean run() throws Exception
+            {
+                List<?> r = client.get_slice(key, new ColumnParent(type.table), predicate, settings.command.consistencyLevel);
+                return r != null && r.size() > 0;
+            }
+
+            @Override
+            public int partitionCount()
+            {
+                return 1;
+            }
+
+            @Override
+            public int rowCount()
+            {
+                return 1;
+            }
+        });
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftInserter.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftInserter.java
new file mode 100644
index 0000000..d6adbf9
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftInserter.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+public final class ThriftInserter extends PredefinedOperation
+{
+
+    public ThriftInserter(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.WRITE, timer, generator, settings);
+    }
+
+    public boolean isWrite()
+    {
+        return true;
+    }
+
+    public void run(final ThriftClient client) throws IOException
+    {
+        final ByteBuffer key = getKey();
+        final List<Column> columns = getColumns();
+
+        List<Mutation> mutations = new ArrayList<>(columns.size());
+        for (Column c : columns)
+        {
+            ColumnOrSuperColumn column = new ColumnOrSuperColumn().setColumn(c);
+            mutations.add(new Mutation().setColumn_or_supercolumn(column));
+        }
+        Map<String, List<Mutation>> row = Collections.singletonMap(type.table, mutations);
+
+        final Map<ByteBuffer, Map<String, List<Mutation>>> record = Collections.singletonMap(key, row);
+
+        timeWithRetry(new RunOp()
+        {
+            @Override
+            public boolean run() throws Exception
+            {
+                client.batch_mutate(record, settings.command.consistencyLevel);
+                return true;
+            }
+
+            @Override
+            public int partitionCount()
+            {
+                return 1;
+            }
+
+            @Override
+            public int rowCount()
+            {
+                return 1;
+            }
+        });
+    }
+
+    protected List<Column> getColumns()
+    {
+        final ColumnSelection selection = select();
+        final List<ByteBuffer> values = getColumnValues(selection);
+        final List<Column> columns = new ArrayList<>(values.size());
+        final List<ByteBuffer> names = selection.select(settings.columns.names);
+        for (int i = 0 ; i < values.size() ; i++)
+            columns.add(new Column(names.get(i))
+                        .setValue(values.get(i))
+                        .setTimestamp(settings.columns.timestamp != null
+                                      ? Long.parseLong(settings.columns.timestamp)
+                                      : FBUtilities.timestampMicros()));
+        return columns;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftReader.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftReader.java
new file mode 100644
index 0000000..276d8c5
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/ThriftReader.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.operations.predefined;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.settings.Command;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.ColumnOrSuperColumn;
+import org.apache.cassandra.thrift.ColumnParent;
+import org.apache.cassandra.thrift.SlicePredicate;
+import org.apache.cassandra.thrift.SuperColumn;
+
+public final class ThriftReader extends PredefinedOperation
+{
+
+    public ThriftReader(Timer timer, PartitionGenerator generator, StressSettings settings)
+    {
+        super(Command.READ, timer, generator, settings);
+    }
+
+    public void run(final ThriftClient client) throws IOException
+    {
+        final ColumnSelection select = select();
+        final ByteBuffer key = getKey();
+        final List<ByteBuffer> expect = getColumnValues(select);
+        timeWithRetry(new RunOp()
+        {
+            @Override
+            public boolean run() throws Exception
+            {
+                List<ColumnOrSuperColumn> row = client.get_slice(key, new ColumnParent(type.table), select.predicate(), settings.command.consistencyLevel);
+                if (expect == null)
+                    return !row.isEmpty();
+                if (row == null)
+                    return false;
+                if (row.size() != expect.size())
+                    return false;
+                for (int i = 0 ; i < row.size() ; i++)
+                    if (!row.get(i).getColumn().bufferForValue().equals(expect.get(i)))
+                        return false;
+                return true;
+            }
+
+            @Override
+            public int partitionCount()
+            {
+                return 1;
+            }
+
+            @Override
+            public int rowCount()
+            {
+                return 1;
+            }
+        });
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java
new file mode 100644
index 0000000..8e20ab3
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java
@@ -0,0 +1,170 @@
+package org.apache.cassandra.stress.operations.userdefined;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.datastax.driver.core.BatchStatement;
+import com.datastax.driver.core.BoundStatement;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.Statement;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.Partition;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.RatioDistribution;
+import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.settings.ValidationType;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+
+public class SchemaInsert extends SchemaStatement
+{
+
+    private final BatchStatement.Type batchType;
+    private final RatioDistribution selectChance;
+
+    public SchemaInsert(Timer timer, PartitionGenerator generator, StressSettings settings, Distribution batchSize, RatioDistribution selectChance, Integer thriftId, PreparedStatement statement, ConsistencyLevel cl, BatchStatement.Type batchType)
+    {
+        super(timer, generator, settings, batchSize, statement, thriftId, cl, ValidationType.NOT_FAIL);
+        this.batchType = batchType;
+        this.selectChance = selectChance;
+    }
+
+    private class JavaDriverRun extends Runner
+    {
+        final JavaDriverClient client;
+
+        private JavaDriverRun(JavaDriverClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            Partition.RowIterator[] iterators = new Partition.RowIterator[partitions.size()];
+            for (int i = 0 ; i < iterators.length ; i++)
+                iterators[i] = partitions.get(i).iterator(selectChance.next(), true);
+            List<BoundStatement> stmts = new ArrayList<>();
+            partitionCount = partitions.size();
+
+            for (Partition.RowIterator iterator : iterators)
+            {
+                if (iterator.done())
+                    continue;
+
+                for (Row row : iterator.next())
+                    stmts.add(bindRow(row));
+            }
+            rowCount += stmts.size();
+
+            // 65535 is max number of stmts per batch, so if we have more, we need to manually batch them
+            for (int j = 0 ; j < stmts.size() ; j += 65535)
+            {
+                List<BoundStatement> substmts = stmts.subList(j, Math.min(stmts.size(), j + 65535));
+                Statement stmt;
+                if (substmts.size() == 1)
+                {
+                    stmt = substmts.get(0);
+                }
+                else
+                {
+                    BatchStatement batch = new BatchStatement(batchType);
+                    batch.setConsistencyLevel(JavaDriverClient.from(cl));
+                    batch.addAll(substmts);
+                    stmt = batch;
+                }
+
+                try
+                {
+                    validate(client.getSession().execute(stmt));
+                }
+                catch (ClassCastException e)
+                {
+                    e.printStackTrace();
+                }
+            }
+
+            for (Partition.RowIterator iterator : iterators)
+                iterator.markWriteFinished();
+
+            return true;
+        }
+    }
+
+    private class ThriftRun extends Runner
+    {
+        final ThriftClient client;
+
+        private ThriftRun(ThriftClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            Partition.RowIterator[] iterators = new Partition.RowIterator[partitions.size()];
+            for (int i = 0 ; i < iterators.length ; i++)
+                iterators[i] = partitions.get(i).iterator(selectChance.next(), true);
+            partitionCount = partitions.size();
+
+            for (Partition.RowIterator iterator : iterators)
+            {
+                if (iterator.done())
+                    continue;
+
+                for (Row row : iterator.next())
+                {
+                    validate(client.execute_prepared_cql3_query(thriftId, iterator.partition().getToken(), thriftRowArgs(row), settings.command.consistencyLevel));
+                    rowCount += 1;
+                }
+            }
+
+            for (Partition.RowIterator iterator : iterators)
+                iterator.markWriteFinished();
+
+            return true;
+        }
+    }
+
+    @Override
+    public void run(JavaDriverClient client) throws IOException
+    {
+        timeWithRetry(new JavaDriverRun(client));
+    }
+
+    public boolean isWrite()
+    {
+        return true;
+    }
+
+    @Override
+    public void run(ThriftClient client) throws IOException
+    {
+        timeWithRetry(new ThriftRun(client));
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java
new file mode 100644
index 0000000..866f6ab
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java
@@ -0,0 +1,174 @@
+package org.apache.cassandra.stress.operations.userdefined;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import com.datastax.driver.core.BoundStatement;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.stress.generate.Partition;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.settings.OptionDistribution;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.settings.ValidationType;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.CqlResult;
+import org.apache.cassandra.thrift.ThriftConversion;
+
+public class SchemaQuery extends SchemaStatement
+{
+
+    public static enum ArgSelect
+    {
+        MULTIROW, SAMEROW;
+        //TODO: FIRSTROW, LASTROW
+    }
+
+    final ArgSelect argSelect;
+    final Object[][] randomBuffer;
+    final Random random = new Random();
+
+    public SchemaQuery(Timer timer, PartitionGenerator generator, StressSettings settings, Integer thriftId, PreparedStatement statement, ConsistencyLevel cl, ValidationType validationType, ArgSelect argSelect)
+    {
+        super(timer, generator, settings, OptionDistribution.get("fixed(1)").get(), statement, thriftId, cl, validationType);
+        this.argSelect = argSelect;
+        randomBuffer = new Object[argumentIndex.length][argumentIndex.length];
+    }
+
+    private class JavaDriverRun extends Runner
+    {
+        final JavaDriverClient client;
+
+        private JavaDriverRun(JavaDriverClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            ResultSet rs = client.getSession().execute(bindArgs(partitions.get(0)));
+            validate(rs);
+            rowCount = rs.all().size();
+            partitionCount = Math.min(1, rowCount);
+            return true;
+        }
+    }
+
+    private class ThriftRun extends Runner
+    {
+        final ThriftClient client;
+
+        private ThriftRun(ThriftClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            CqlResult rs = client.execute_prepared_cql3_query(thriftId, partitions.get(0).getToken(), thriftArgs(partitions.get(0)), ThriftConversion.toThrift(cl));
+            validate(rs);
+            rowCount = rs.getRowsSize();
+            partitionCount = Math.min(1, rowCount);
+            return true;
+        }
+    }
+
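+    // Fills randomBuffer with the argument values of rows drawn from the partition and
+    // returns how many rows were buffered, retrying until at least one row is seen.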
+    private int fillRandom(Partition partition)
+    {
+        int c = 0;
+        while (c == 0)
+        {
+            for (Row row : partition.iterator(randomBuffer.length, false).next())
+            {
+                Object[] randomRow = randomBuffer[c++];
+                for (int i = 0 ; i < argumentIndex.length ; i++)
+                    randomRow[i] = row.get(argumentIndex[i]);
+                if (c >= randomBuffer.length)
+                    break;
+            }
+        }
+        return c;
+    }
+
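+    // Binds the statement's arguments either from several randomly chosen rows of the
+    // partition (MULTIROW) or from a single generated row (SAMEROW).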
+    BoundStatement bindArgs(Partition partition)
+    {
+        switch (argSelect)
+        {
+            case MULTIROW:
+                int c = fillRandom(partition);
+                for (int i = 0 ; i < argumentIndex.length ; i++)
+                {
+                    int argIndex = argumentIndex[i];
+                    bindBuffer[i] = randomBuffer[argIndex < 0 ? 0 : random.nextInt(c)][i];
+                }
+                return statement.bind(bindBuffer);
+            case SAMEROW:
+                for (Row row : partition.iterator(1, false).next())
+                    return bindRow(row);
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    List<ByteBuffer> thriftArgs(Partition partition)
+    {
+        switch (argSelect)
+        {
+            case MULTIROW:
+                List<ByteBuffer> args = new ArrayList<>();
+                int c = fillRandom(partition);
+                for (int i = 0 ; i < argumentIndex.length ; i++)
+                {
+                    int argIndex = argumentIndex[i];
+                    args.add(generator.convert(argIndex, randomBuffer[argIndex < 0 ? 0 : random.nextInt(c)][i]));
+                }
+                return args;
+            case SAMEROW:
+                for (Row row : partition.iterator(1, false).next())
+                    return thriftRowArgs(row);
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    @Override
+    public void run(JavaDriverClient client) throws IOException
+    {
+        timeWithRetry(new JavaDriverRun(client));
+    }
+
+    @Override
+    public void run(ThriftClient client) throws IOException
+    {
+        timeWithRetry(new ThriftRun(client));
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java
new file mode 100644
index 0000000..1f7ed80
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java
@@ -0,0 +1,146 @@
+package org.apache.cassandra.stress.operations.userdefined;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import com.datastax.driver.core.BoundStatement;
+import com.datastax.driver.core.ColumnDefinitions;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.Partition;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.settings.ValidationType;
+import org.apache.cassandra.stress.util.Timer;
+import org.apache.cassandra.thrift.CqlResult;
+import org.apache.cassandra.transport.SimpleClient;
+
+public abstract class SchemaStatement extends Operation
+{
+
+    final PartitionGenerator generator;
+    final PreparedStatement statement;
+    final Integer thriftId;
+    final ConsistencyLevel cl;
+    final ValidationType validationType;
+    final int[] argumentIndex;
+    final Object[] bindBuffer;
+
+    public SchemaStatement(Timer timer, PartitionGenerator generator, StressSettings settings, Distribution partitionCount,
+                           PreparedStatement statement, Integer thriftId, ConsistencyLevel cl, ValidationType validationType)
+    {
+        super(timer, generator, settings, partitionCount);
+        this.generator = generator;
+        this.statement = statement;
+        this.thriftId = thriftId;
+        this.cl = cl;
+        this.validationType = validationType;
+        argumentIndex = new int[statement.getVariables().size()];
+        bindBuffer = new Object[argumentIndex.length];
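+        // map each bind variable of the prepared statement to the index of the
+        // corresponding column in the row generator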
+        int i = 0;
+        for (ColumnDefinitions.Definition definition : statement.getVariables())
+            argumentIndex[i++] = generator.indexOf(definition.getName());
+    }
+
+    BoundStatement bindRow(Row row)
+    {
+        for (int i = 0 ; i < argumentIndex.length ; i++)
+        {
+            bindBuffer[i] = row.get(argumentIndex[i]);
+            if (bindBuffer[i] == null && !generator.permitNulls(argumentIndex[i]))
+                throw new IllegalStateException();
+        }
+        return statement.bind(bindBuffer);
+    }
+
+    List<ByteBuffer> thriftRowArgs(Row row)
+    {
+        List<ByteBuffer> args = new ArrayList<>();
+        for (int i : argumentIndex)
+            args.add(generator.convert(i, row.get(i)));
+        return args;
+    }
+
+    void validate(ResultSet rs)
+    {
+        switch (validationType)
+        {
+            case NOT_FAIL:
+                return;
+            case NON_ZERO:
+                if (rs.all().size() == 0)
+                    throw new IllegalStateException("Expected non-zero results");
+                break;
+            default:
+                throw new IllegalStateException("Unsupported validation type");
+        }
+    }
+
+    void validate(CqlResult rs)
+    {
+        switch (validationType)
+        {
+            case NOT_FAIL:
+                return;
+            case NON_ZERO:
+                if (rs.getRowsSize() == 0)
+                    throw new IllegalStateException("Expected non-zero results");
+                break;
+            default:
+                throw new IllegalStateException("Unsupported validation type");
+        }
+    }
+
+    @Override
+    public void run(SimpleClient client) throws IOException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    abstract class Runner implements RunOp
+    {
+        int partitionCount;
+        int rowCount;
+
+        @Override
+        public int partitionCount()
+        {
+            return partitionCount;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return rowCount;
+        }
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/server/StressThread.java b/tools/stress/src/org/apache/cassandra/stress/server/StressThread.java
deleted file mode 100644
index 158a09f..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/server/StressThread.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.server;
-
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.StressAction;
-
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.PrintStream;
-import java.net.Socket;
-
-public class StressThread extends Thread
-{
-    private final Socket socket;
-
-    public StressThread(Socket client)
-    {
-        this.socket = client;
-    }
-
-    public void run()
-    {
-        try
-        {
-            ObjectInputStream in = new ObjectInputStream(socket.getInputStream());
-            PrintStream out = new PrintStream(socket.getOutputStream());
-
-            StressAction action = new StressAction((Session) in.readObject(), out);
-            action.start();
-
-            while (action.isAlive())
-            {
-                try
-                {
-                    if (in.readInt() == 1)
-                    {
-                        action.stopAction();
-                        break;
-                    }
-                }
-                catch (Exception e)
-                {
-                    // continue without problem
-                }
-            }
-
-            out.close();
-            in.close();
-            socket.close();
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e.getMessage(), e);
-        }
-        catch (Exception e)
-        {
-            e.printStackTrace();
-        }
-    }
-
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java b/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java
new file mode 100644
index 0000000..4d7c039
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java
@@ -0,0 +1,80 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+public enum CliOption
+{
+    POP("Population distribution and intra-partition visit order", SettingsPopulation.helpPrinter()),
+    INSERT("Insert specific options relating to various methods for batching and splitting partition updates", SettingsInsert.helpPrinter()),
+    COL("Column details such as size and count distribution, data generator, names, comparator and if super columns should be used", SettingsColumn.helpPrinter()),
+    RATE("Thread count, rate limit or automatic mode (default is auto)", SettingsRate.helpPrinter()),
+    MODE("Thrift or CQL with options", SettingsMode.helpPrinter()),
+    SCHEMA("Replication settings, compression, compaction, etc.", SettingsSchema.helpPrinter()),
+    NODE("Nodes to connect to", SettingsNode.helpPrinter()),
+    LOG("Where to log progress to, and the interval at which to do it", SettingsLog.helpPrinter()),
+    TRANSPORT("Custom transport factories", SettingsTransport.helpPrinter()),
+    PORT("The port to connect to cassandra nodes on", SettingsPort.helpPrinter()),
+    SENDTO("-send-to", "Specify a stress server to send this command to", SettingsMisc.sendToDaemonHelpPrinter())
+    ;
+
+    private static final Map<String, CliOption> LOOKUP;
+    static
+    {
+        final Map<String, CliOption> lookup = new HashMap<>();
+        for (CliOption cmd : values())
+        {
+            lookup.put("-" + cmd.toString().toLowerCase(), cmd);
+            if (cmd.extraName != null)
+                lookup.put(cmd.extraName, cmd);
+        }
+        LOOKUP = lookup;
+    }
+
+    public static CliOption get(String command)
+    {
+        return LOOKUP.get(command.toLowerCase());
+    }
+
+    public final String extraName;
+    public final String description;
+    private final Runnable helpPrinter;
+
+    private CliOption(String description, Runnable helpPrinter)
+    {
+        this(null, description, helpPrinter);
+    }
+    private CliOption(String extraName, String description, Runnable helpPrinter)
+    {
+        this.extraName = extraName;
+        this.description = description;
+        this.helpPrinter = helpPrinter;
+    }
+
+    public void printHelp()
+    {
+        helpPrinter.run();
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/Command.java b/tools/stress/src/org/apache/cassandra/stress/settings/Command.java
new file mode 100644
index 0000000..7138cbb
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/Command.java
@@ -0,0 +1,140 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.*;
+
+import com.google.common.collect.ImmutableList;
+
+public enum Command
+{
+
+    READ(false, "Standard1", "Super1",
+            "Multiple concurrent reads - the cluster must first be populated by a write test",
+            CommandCategory.BASIC
+    ),
+    WRITE(true, "Standard1", "Super1",
+            "insert",
+            "Multiple concurrent writes against the cluster",
+            CommandCategory.BASIC
+    ),
+    MIXED(true, null, null,
+            "Interleaving of any basic commands, with configurable ratio and distribution - the cluster must first be populated by a write test",
+            CommandCategory.MIXED
+    ),
+    COUNTER_WRITE(true, "Counter1", "SuperCounter1",
+            "counter_add",
+            "Multiple concurrent updates of counters.",
+            CommandCategory.BASIC
+    ),
+    COUNTER_READ(false, "Counter1", "SuperCounter1",
+            "counter_get",
+            "Multiple concurrent reads of counters. The cluster must first be populated by a counterwrite test.",
+            CommandCategory.BASIC
+    ),
+    USER(true, null, null,
+          "Interleaving of user provided queries, with configurable ratio and distribution",
+          CommandCategory.USER
+    ),
+
+    HELP(false, null, null, "-?", "Print help for a command or option", null),
+    PRINT(false, null, null, "Inspect the output of a distribution definition", null),
+    LEGACY(false, null, null, "Legacy support mode", null)
+
+    ;
+
+    private static final Map<String, Command> LOOKUP;
+    static
+    {
+        final Map<String, Command> lookup = new HashMap<>();
+        for (Command cmd : values())
+        {
+            for (String name : cmd.names)
+                lookup.put(name, cmd);
+        }
+        LOOKUP = lookup;
+    }
+
+    public static Command get(String command)
+    {
+        return LOOKUP.get(command.toLowerCase());
+    }
+
+    public final boolean updates;
+    public final CommandCategory category;
+    public final List<String> names;
+    public final String description;
+    public final String table;
+    public final String supertable;
+
+    Command(boolean updates, String table, String supertable, String description, CommandCategory category)
+    {
+        this(updates, table, supertable, null, description, category);
+    }
+
+    Command(boolean updates, String table, String supertable, String extra, String description, CommandCategory category)
+    {
+        this.table = table;
+        this.supertable = supertable;
+        this.updates = updates;
+        this.category = category;
+        List<String> names = new ArrayList<>();
+        names.add(this.toString().toLowerCase());
+        names.add(this.toString().replaceAll("_", "").toLowerCase());
+        if (extra != null)
+        {
+            names.add(extra.toLowerCase());
+            names.add(extra.replaceAll("_", "").toLowerCase());
+        }
+        this.names = ImmutableList.copyOf(names);
+        this.description = description;
+    }
+
+    public void printHelp()
+    {
+        helpPrinter().run();
+    }
+
+    public final Runnable helpPrinter()
+    {
+        switch (this)
+        {
+            case PRINT:
+                return SettingsMisc.printHelpPrinter();
+            case HELP:
+                return SettingsMisc.helpHelpPrinter();
+            case LEGACY:
+                return Legacy.helpPrinter();
+        }
+        switch (category)
+        {
+            case USER:
+                return SettingsCommandUser.helpPrinter();
+            case BASIC:
+                return SettingsCommandPreDefined.helpPrinter(this);
+            case MIXED:
+                return SettingsCommandPreDefinedMixed.helpPrinter();
+        }
+        throw new AssertionError();
+    }
+
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/settings/CommandCategory.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/settings/CommandCategory.java
index e42574b..e9dd946 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/CommandCategory.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.settings;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,9 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public enum CommandCategory
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
+    BASIC,
+    MIXED,
+    USER
 }
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/settings/ConnectionAPI.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/settings/ConnectionAPI.java
index e42574b..942250f 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/ConnectionAPI.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.settings;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,8 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public enum ConnectionAPI
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
+    THRIFT, THRIFT_SMART, SIMPLE_NATIVE, JAVA_DRIVER_NATIVE
 }
+
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/settings/ConnectionStyle.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/settings/ConnectionStyle.java
index e42574b..6b408a9 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/ConnectionStyle.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.settings;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,10 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public enum ConnectionStyle
 {
-    private final ByteBuffer buffer;
-
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
-
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
+    CQL,
+    CQL_PREPARED,
+    THRIFT
 }
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/CqlVersion.java b/tools/stress/src/org/apache/cassandra/stress/settings/CqlVersion.java
new file mode 100644
index 0000000..d7d09f6
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/CqlVersion.java
@@ -0,0 +1,69 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+public enum CqlVersion
+{
+
+    NOCQL(null),
+    CQL2("2.0.0"),
+    CQL3("3.0.0");
+
+    public final String connectVersion;
+
+    private CqlVersion(String connectVersion)
+    {
+        this.connectVersion = connectVersion;
+    }
+
+    static CqlVersion get(String version)
+    {
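+        // only the major version digit matters: "2.x" selects CQL2, "3.x" selects CQL3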
+        if (version == null)
+            return NOCQL;
+        switch(version.charAt(0))
+        {
+            case '2':
+                return CQL2;
+            case '3':
+                return CQL3;
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    public boolean isCql()
+    {
+        return this != NOCQL;
+    }
+
+    public boolean isCql2()
+    {
+        return this == CQL2;
+    }
+
+    public boolean isCql3()
+    {
+        return this == CQL3;
+    }
+
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/GroupedOptions.java b/tools/stress/src/org/apache/cassandra/stress/settings/GroupedOptions.java
new file mode 100644
index 0000000..8bbba15
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/GroupedOptions.java
@@ -0,0 +1,141 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.PrintStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableList;
+
+public abstract class GroupedOptions
+{
+
+    int accepted = 0;
+
+    public boolean accept(String param)
+    {
+        for (Option option : options())
+        {
+            if (option.accept(param))
+            {
+                accepted++;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public boolean happy()
+    {
+        for (Option option : options())
+            if (!option.happy())
+                return false;
+        return true;
+    }
+
+    public abstract List<? extends Option> options();
+
+    // hands each parameter to every option grouping; returns the first grouping that
+    // is happy() and that accepted all of the parameters, or null if none qualifies
+    public static <G extends GroupedOptions> G select(String[] params, G... groupings)
+    {
+        for (String param : params)
+        {
+            boolean accepted = false;
+            for (GroupedOptions grouping : groupings)
+                accepted |= grouping.accept(param);
+            if (!accepted)
+                throw new IllegalArgumentException("Invalid parameter " + param);
+        }
+        for (G grouping : groupings)
+            if (grouping.happy() && grouping.accepted == params.length)
+                return grouping;
+        return null;
+    }
+
+    // pretty prints all of the option groupings
+    public static void printOptions(PrintStream out, String command, GroupedOptions... groupings)
+    {
+        out.println();
+        boolean firstRow = true;
+        for (GroupedOptions grouping : groupings)
+        {
+            if (!firstRow)
+            {
+                out.println(" OR ");
+            }
+            firstRow = false;
+
+            StringBuilder sb = new StringBuilder("Usage: " + command);
+            for (Option option : grouping.options())
+            {
+                sb.append(" ");
+                sb.append(option.shortDisplay());
+            }
+            out.println(sb.toString());
+        }
+        out.println();
+        final Set<Option> printed = new HashSet<>();
+        for (GroupedOptions grouping : groupings)
+        {
+            for (Option option : grouping.options())
+            {
+                if (printed.add(option))
+                {
+                    if (option.longDisplay() != null)
+                    {
+                        out.println("  " + option.longDisplay());
+                        for (String row : option.multiLineDisplay())
+                            out.println("      " + row);
+                    }
+                }
+            }
+        }
+    }
+
+    public static List<? extends Option> merge(List<? extends Option> ... optionss)
+    {
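+        // required simple options are added first, followed by all remaining options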
+        ImmutableList.Builder<Option> builder = ImmutableList.builder();
+        for (List<? extends Option> options : optionss)
+            for (Option option : options)
+                if (option instanceof OptionSimple && ((OptionSimple) option).isRequired())
+                    builder.add(option);
+        for (List<? extends Option> options : optionss)
+            for (Option option : options)
+                if (!(option instanceof OptionSimple && ((OptionSimple) option).isRequired()))
+                    builder.add(option);
+        return builder.build();
+    }
+
+    public static String formatLong(String longDisplay, String description)
+    {
+        return String.format("%-40s %s", longDisplay, description);
+    }
+
+    public static String formatMultiLine(String longDisplay, String description)
+    {
+        return String.format("%-36s %s", longDisplay, description);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/Legacy.java b/tools/stress/src/org/apache/cassandra/stress/settings/Legacy.java
new file mode 100644
index 0000000..7f17893
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/Legacy.java
@@ -0,0 +1,364 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.settings;
+
+import java.io.Serializable;
+import java.util.*;
+
+import org.apache.commons.cli.*;
+import org.apache.commons.cli.Option;
+
+public class Legacy implements Serializable
+{
+
+    // command line options
+    public static final Options availableOptions = new Options();
+
+    private static final String SSL_TRUSTSTORE = "truststore";
+    private static final String SSL_TRUSTSTORE_PW = "truststore-password";
+    private static final String SSL_PROTOCOL = "ssl-protocol";
+    private static final String SSL_ALGORITHM = "ssl-alg";
+    private static final String SSL_STORE_TYPE = "store-type";
+    private static final String SSL_CIPHER_SUITES = "ssl-ciphers";
+
+    static
+    {
+        availableOptions.addOption("h",  "help",                 false,  "Show this help message and exit");
+        availableOptions.addOption("n",  "num-keys",             true,   "Number of keys, default:1000000");
+        availableOptions.addOption("F",  "num-different-keys",   true,   "Number of different keys (if < NUM-KEYS, the same key will re-used multiple times), default:NUM-KEYS");
+        availableOptions.addOption("t",  "threadCount",              true,   "Number of threadCount to use, default:50");
+        availableOptions.addOption("c",  "columns",              true,   "Number of columns per key, default:5");
+        availableOptions.addOption("S",  "column-size",          true,   "Size of column values in bytes, default:34");
+        availableOptions.addOption("C",  "unique columns",       true,   "Max number of unique columns per key, default:50");
+        availableOptions.addOption("RC", "unique rows",          true,   "Max number of unique rows, default:50");
+        availableOptions.addOption("d",  "nodes",                true,   "Host nodes (comma separated), default:locahost");
+        availableOptions.addOption("D",  "nodesfile",            true,   "File containing host nodes (one per line)");
+        availableOptions.addOption("s",  "stdev",                true,   "Standard Deviation for gaussian read key generation, default:0.1");
+        availableOptions.addOption("r",  "random",               false,  "Use random key generator for read key generation (STDEV will have no effect), default:false");
+        availableOptions.addOption("f",  "file",                 true,   "Write output to given file");
+        availableOptions.addOption("p",  "port",                 true,   "Thrift port, default:9160");
+        availableOptions.addOption("o",  "operation",            true,   "Operation to perform (WRITE, READ, READWRITE, RANGE_SLICE, INDEXED_RANGE_SLICE, MULTI_GET, COUNTERWRITE, COUNTER_GET), default:WRITE");
+        availableOptions.addOption("u",  "supercolumns",         true,   "Number of super columns per key, default:1");
+        availableOptions.addOption("y",  "family-type",          true,   "Column Family Type (Super, Standard), default:Standard");
+        availableOptions.addOption("K",  "keep-trying",          true,   "Retry on-going operation N times (in case of failure). positive integer, default:10");
+        availableOptions.addOption("k",  "keep-going",           false,  "Ignore errors inserting or reading (when set, --keep-trying has no effect), default:false");
+        availableOptions.addOption("i",  "progress-interval",    true,   "Progress Report Interval (seconds), default:10");
+        availableOptions.addOption("g",  "keys-per-call",        true,   "Number of keys to get_range_slices or multiget per call, default:1000");
+        availableOptions.addOption("l",  "replication-factor",   true,   "Replication Factor to use when creating needed column families, default:1");
+        availableOptions.addOption("L",  "enable-cql",           false,  "Perform queries using CQL2 (Cassandra Query Language v 2.0.0)");
+        availableOptions.addOption("L3", "enable-cql3",          false,  "Perform queries using CQL3 (Cassandra Query Language v 3.0.0)");
+        availableOptions.addOption("b",  "enable-native-protocol",  false,  "Use the binary native protocol (only work along with -L3)");
+        availableOptions.addOption("P",  "use-prepared-statements", false, "Perform queries using prepared statements (only applicable to CQL).");
+        availableOptions.addOption("e",  "consistency-level",    true,   "Consistency Level to use (ONE, QUORUM, LOCAL_QUORUM, EACH_QUORUM, ALL, ANY), default:ONE");
+        availableOptions.addOption("x",  "create-index",         true,   "Type of index to create on needed column families (KEYS)");
+        availableOptions.addOption("R",  "replication-strategy", true,   "Replication strategy to use (only on insert if keyspace does not exist), default:org.apache.cassandra.locator.SimpleStrategy");
+        availableOptions.addOption("O",  "strategy-properties",  true,   "Replication strategy properties in the following format <dc_name>:<num>,<dc_name>:<num>,...");
+        availableOptions.addOption("V",  "average-size-values",  false,  "Generate column values of average rather than specific size");
+        availableOptions.addOption("T",  "send-to",              true,   "Send this as a request to the stress daemon at specified address.");
+        availableOptions.addOption("I",  "compression",          true,   "Specify the compression to use for sstable, default:no compression");
+        availableOptions.addOption("Q",  "query-names",          true,   "Comma-separated list of column names to retrieve from each row.");
+        availableOptions.addOption("Z",  "compaction-strategy",  true,   "CompactionStrategy to use.");
+        availableOptions.addOption("U",  "comparator",           true,   "Column Comparator to use. Currently supported types are: TimeUUIDType, AsciiType, UTF8Type.");
+        availableOptions.addOption("tf", "transport-factory",    true,   "Fully-qualified TTransportFactory class name for creating a connection. Note: For Thrift over SSL, use org.apache.cassandra.stress.SSLTransportFactory.");
+        availableOptions.addOption("ns", "no-statistics",        false,  "Turn off the aggegate statistics that is normally output after completion.");
+        availableOptions.addOption("ts", SSL_TRUSTSTORE,         true, "SSL: full path to truststore");
+        availableOptions.addOption("tspw", SSL_TRUSTSTORE_PW,    true, "SSL: full path to truststore");
+        availableOptions.addOption("prtcl", SSL_PROTOCOL,        true, "SSL: connections protocol to use (default: TLS)");
+        availableOptions.addOption("alg", SSL_ALGORITHM,         true, "SSL: algorithm (default: SunX509)");
+        availableOptions.addOption("st", SSL_STORE_TYPE,         true, "SSL: type of store");
+        availableOptions.addOption("ciphers", SSL_CIPHER_SUITES, true, "SSL: comma-separated list of encryption suites to use");
+        availableOptions.addOption("th",  "throttle",            true,   "Throttle the total number of operations per second to a maximum amount.");
+    }
+
+    public static StressSettings build(String[] arguments)
+    {
+        CommandLineParser parser = new PosixParser();
+
+        final Converter r = new Converter();
+        try
+        {
+            CommandLine cmd = parser.parse(availableOptions, arguments);
+
+            if (cmd.getArgs().length > 0)
+            {
+                System.err.println("Application does not allow arbitrary arguments: " + cmd.getArgList());
+                System.exit(1);
+            }
+
+            if (cmd.hasOption("h"))
+                printHelpMessage();
+
+            if (cmd.hasOption("C"))
+                System.out.println("Ignoring deprecated option -C");
+
+            if (cmd.hasOption("o"))
+                r.setCommand(cmd.getOptionValue("o").toLowerCase());
+            else
+                r.setCommand("insert");
+
+            if (cmd.hasOption("K"))
+                r.add("command", "tries=" + cmd.getOptionValue("K"));
+
+            if (cmd.hasOption("k"))
+            {
+                if (!cmd.hasOption("K"))
+                    r.add("command", "retry=1");
+                r.add("command", "ignore_errors");
+            }
+
+            if (cmd.hasOption("g"))
+                r.add("command", "at-once=" + cmd.getOptionValue("g"));
+
+            if (cmd.hasOption("e"))
+                r.add("command", "cl=" + cmd.getOptionValue("e"));
+
+            String numKeys;
+            if (cmd.hasOption("n"))
+                numKeys = cmd.getOptionValue("n");
+            else
+                numKeys = "1000000";
+            r.add("command", "n=" + numKeys);
+
+            String uniqueKeys;
+            if (cmd.hasOption("F"))
+                uniqueKeys = cmd.getOptionValue("F");
+            else
+                uniqueKeys = numKeys;
+
+            if (r.opts.containsKey("write") || r.opts.containsKey("counterwrite"))
+            {
+                if (!uniqueKeys.equals(numKeys))
+                    r.add("-key", "populate=1.." + uniqueKeys);
+            }
+            else if (cmd.hasOption("r"))
+            {
+                r.add("-key", "dist=uniform(1.." + uniqueKeys + ")");
+            }
+            else
+            {
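+                // the legacy -s option presumably expressed the stdev as a fraction of the key
+                // range; gauss(min..max,n) takes n = stdevs from the mean to the range edge,
+                // so n = 0.5 / stdev (the 0.1 default maps to the gauss(...,5) default above)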
+                if (!cmd.hasOption("s"))
+                    r.add("-key", "dist=gauss(1.." + uniqueKeys + ",5)");
+                else
+                    r.add("-key", String.format("dist=gauss(1..%s,%.2f)", uniqueKeys,
+                            0.5 / Float.parseFloat(cmd.getOptionValue("s"))));
+            }
+
+            String colCount;
+            if (cmd.hasOption("c"))
+                colCount = cmd.getOptionValue("c");
+            else
+                colCount = "5";
+
+            String colSize;
+            if (cmd.hasOption("S"))
+                colSize = cmd.getOptionValue("S");
+            else
+                colSize = "34";
+
+            r.add("-col", "n=fixed(" + colCount + ")");
+            if (cmd.hasOption("V"))
+            {
+                r.add("-col", "size=uniform(1.." + Integer.parseInt(colSize) * 2 + ")");
+                r.add("-col", "data=rand()");
+            }
+            else
+            {
+                r.add("-col", "size=fixed(" + colSize + ")");
+                r.add("-col", "data=repeat(1)");
+            }
+            if (cmd.hasOption("Q"))
+                r.add("-col", "names=" + cmd.getOptionValue("Q"));
+
+            if (cmd.hasOption("U"))
+                r.add("-col", "comparator=" + cmd.getOptionValue("U"));
+
+            if (cmd.hasOption("y") && cmd.getOptionValue("y").equals("Super"))
+                r.add("-col", "super=" + (cmd.hasOption("u") ? cmd.getOptionValue("u") : "1"));
+
+            if (cmd.hasOption("t"))
+                r.add("-rate", "threads=" + cmd.getOptionValue("t"));
+            else
+                r.add("-rate", "threads=50");
+
+            if (cmd.hasOption("th"))
+                r.add("-rate", "limit=" + cmd.getOptionValue("th") + "/s");
+
+            if (cmd.hasOption("f"))
+                r.add("-log", "file=" + cmd.getOptionValue("f"));
+
+            if (cmd.hasOption("p"))
+                r.add("-port", cmd.getOptionValue("p"));
+
+            if (cmd.hasOption("i"))
+                r.add("-log", "interval=" + cmd.getOptionValue("i"));
+            else
+                r.add("-log", "interval=10");
+
+            if (cmd.hasOption("x"))
+                r.add("-schema", "index=" + cmd.getOptionValue("x"));
+
+            if (cmd.hasOption("R") || cmd.hasOption("l") || cmd.hasOption("O"))
+            {
+                StringBuilder rep = new StringBuilder();
+                if (cmd.hasOption("R"))
+                    rep.append("strategy=" + cmd.getOptionValue("R"));
+                if (cmd.hasOption("l"))
+                {
+                    if (rep.length() > 0)
+                        rep.append(",");
+                    rep.append("factor=" + cmd.getOptionValue("l"));
+                }
+                if (cmd.hasOption("O"))
+                {
+                    if (rep.length() > 0)
+                        rep.append(",");
+                    rep.append(cmd.getOptionValue("O").replace(':','='));
+                }
+                r.add("-schema", "replication(" + rep + ")");
+            }
+
+            if (cmd.hasOption("L"))
+                r.add("-mode", cmd.hasOption("P") ? "prepared cql2" : "cql2");
+            else if (cmd.hasOption("L3"))
+                r.add("-mode", (cmd.hasOption("P") ? "prepared" : "") + (cmd.hasOption("b") ? "native" : "") +  "cql3");
+            else
+                r.add("-mode", "thrift");
+
+            if (cmd.hasOption("I"))
+                r.add("-schema", "compression=" + cmd.getOptionValue("I"));
+
+            if (cmd.hasOption("d"))
+                r.add("-node", cmd.getOptionValue("d"));
+
+            if (cmd.hasOption("D"))
+                r.add("-node", "file=" + cmd.getOptionValue("D"));
+
+
+            if (cmd.hasOption("send-to"))
+                r.add("-send-to", cmd.getOptionValue("send-to"));
+
+            if (cmd.hasOption("Z"))
+                r.add("-schema", "compaction=" + cmd.getOptionValue("Z"));
+
+            if (cmd.hasOption("ns"))
+                r.add("-log", "no-summary");
+
+            if (cmd.hasOption("tf"))
+                r.add("-transport", "factory=" + cmd.getOptionValue("tf"));
+
+            if (cmd.hasOption(SSL_TRUSTSTORE))
+                r.add("-transport", "truststore=" + cmd.getOptionValue(SSL_TRUSTSTORE));
+
+            if (cmd.hasOption(SSL_TRUSTSTORE_PW))
+                r.add("-transport", "truststore-password=" + cmd.getOptionValue(SSL_TRUSTSTORE_PW));
+
+            if (cmd.hasOption(SSL_PROTOCOL))
+                r.add("-transport", "ssl-protocol=" + cmd.getOptionValue(SSL_PROTOCOL));
+
+            if (cmd.hasOption(SSL_ALGORITHM))
+                r.add("-transport", "ssl-alg=" + cmd.getOptionValue(SSL_ALGORITHM));
+
+            if (cmd.hasOption(SSL_STORE_TYPE))
+                r.add("-transport", "store-type=" + cmd.getOptionValue(SSL_STORE_TYPE));
+
+            if (cmd.hasOption(SSL_CIPHER_SUITES))
+                r.add("-transport", "ssl-ciphers=" + cmd.getOptionValue(SSL_CIPHER_SUITES));
+
+        }
+        catch (ParseException e)
+        {
+            printHelpMessage();
+            System.exit(1);
+        }
+
+        r.printNewCommand();
+        return r.get();
+    }
+
+    private static final class Converter
+    {
+        private Map<String, List<String>> opts = new LinkedHashMap<>();
+        List<String> command;
+        public void add(String option, String suboption)
+        {
+            if (option.equals("command"))
+            {
+                command.add(suboption);
+                return;
+            }
+            List<String> params = opts.get(option);
+            if (params == null)
+                opts.put(option, params = new ArrayList<>());
+            params.add(suboption);
+        }
+        StressSettings get()
+        {
+            Map<String, String[]> clArgs = new HashMap<>();
+            for (Map.Entry<String, List<String>> e : opts.entrySet())
+                clArgs.put(e.getKey(), e.getValue().toArray(new String[0]));
+            return StressSettings.get(clArgs);
+        }
+        void setCommand(String command)
+        {
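+            // normalise the legacy operation name (e.g. "counter_get" -> "counter_read")
+            // to the canonical Command enum name used by the new settings parser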
+            command = Command.get(command).toString().toLowerCase();
+            opts.put(command, this.command = new ArrayList<>());
+        }
+        void printNewCommand()
+        {
+            StringBuilder sb = new StringBuilder("stress");
+            for (Map.Entry<String, List<String>> e : opts.entrySet())
+            {
+                sb.append(" ");
+                sb.append(e.getKey());
+                for (String opt : e.getValue())
+                {
+                    sb.append(" ");
+                    sb.append(opt);
+                }
+            }
+            System.out.println("Running in legacy support mode. Translating command to: ");
+            System.out.println(sb.toString());
+        }
+    }
+
+    public static void printHelpMessage()
+    {
+        System.out.println("Usage: ./bin/cassandra-stress legacy [options]\n\nOptions:");
+        System.out.println("THIS IS A LEGACY SUPPORT MODE");
+
+        for(Object o : availableOptions.getOptions())
+        {
+            Option option = (Option) o;
+            String upperCaseName = option.getLongOpt().toUpperCase();
+            System.out.println(String.format("-%s%s, --%s%s%n\t\t%s%n", option.getOpt(), (option.hasArg()) ? " "+upperCaseName : "",
+                    option.getLongOpt(), (option.hasArg()) ? "="+upperCaseName : "", option.getDescription()));
+        }
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelpMessage();
+            }
+        };
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/settings/Option.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/settings/Option.java
index e42574b..a9e669c 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/Option.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.settings;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,26 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
+import java.util.List;
 
-public class ByteBufferOutputStream extends OutputStream
+abstract class Option
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
+    abstract boolean accept(String param);
+    abstract boolean happy();
+    abstract String shortDisplay();
+    abstract String longDisplay();
+    abstract List<String> multiLineDisplay();
+    abstract boolean setByUser();
+
+    public int hashCode()
     {
-        this.buffer = buffer;
+        return getClass().hashCode();
     }
 
-    public void write(int b)
+    public boolean equals(Object that)
     {
-        buffer.put((byte) b);
+        return this.getClass() == that.getClass();
     }
 
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionAnyProbabilities.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionAnyProbabilities.java
new file mode 100644
index 0000000..b685294
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionAnyProbabilities.java
@@ -0,0 +1,96 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+
+
+public final class OptionAnyProbabilities extends OptionMulti
+{
+    public OptionAnyProbabilities(String name, String description)
+    {
+        super(name, description, false);
+    }
+
+    final CollectRatios ratios = new CollectRatios();
+
+    private static final class CollectRatios extends Option
+    {
+        Map<String, Double> options = new LinkedHashMap<>();
+
+        boolean accept(String param)
+        {
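+            // each parameter is a "name=weight" pair; specifying the same name twice is an error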
+            String[] args = param.split("=");
+            if (args.length == 2 && args[1].length() > 0 && args[0].length() > 0)
+            {
+                if (options.put(args[0], Double.parseDouble(args[1])) != null)
+                    throw new IllegalArgumentException(args[0] + " set twice");
+                return true;
+            }
+            return false;
+        }
+
+        boolean happy()
+        {
+            return !options.isEmpty();
+        }
+
+        String shortDisplay()
+        {
+            return null;
+        }
+
+        String longDisplay()
+        {
+            return null;
+        }
+
+        List<String> multiLineDisplay()
+        {
+            return Collections.emptyList();
+        }
+
+        boolean setByUser()
+        {
+            return !options.isEmpty();
+        }
+    }
+
+
+    @Override
+    public List<? extends Option> options()
+    {
+        return Arrays.asList(ratios);
+    }
+
+    Map<String, Double> ratios()
+    {
+        return ratios.options;
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionCompaction.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionCompaction.java
new file mode 100644
index 0000000..11d5403
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionCompaction.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.settings;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Function;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+/**
+ * For specifying compaction options
+ */
+class OptionCompaction extends OptionMulti
+{
+
+    private final OptionSimple strategy = new OptionSimple("strategy=", new StrategyAdapter(), null, "The compaction strategy to use", false);
+
+    public OptionCompaction()
+    {
+        super("compaction", "Define the compaction strategy and any parameters", true);
+    }
+
+    public String getStrategy()
+    {
+        return strategy.value();
+    }
+
+    public Map<String, String> getOptions()
+    {
+        return extraOptions();
+    }
+
+    protected List<? extends Option> options()
+    {
+        return Arrays.asList(strategy);
+    }
+
+    @Override
+    public boolean happy()
+    {
+        return true;
+    }
+
+    private static final class StrategyAdapter implements Function<String, String>
+    {
+
+        public String apply(String name)
+        {
+            try
+            {
+                CFMetaData.createCompactionStrategy(name);
+            } catch (ConfigurationException e)
+            {
+                throw new IllegalArgumentException("Invalid compaction strategy: " + name);
+            }
+            return name;
+        }
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionDistribution.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionDistribution.java
new file mode 100644
index 0000000..ef3dbb1
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionDistribution.java
@@ -0,0 +1,465 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Function;
+
+import org.apache.cassandra.stress.generate.*;
+import org.apache.commons.math3.distribution.ExponentialDistribution;
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.UniformRealDistribution;
+import org.apache.commons.math3.distribution.WeibullDistribution;
+import org.apache.commons.math3.random.JDKRandomGenerator;
+
+/**
+ * For selecting a mathematical distribution
+ */
+public class OptionDistribution extends Option
+{
+
+    public static final Function<String, DistributionFactory> BUILDER = new Function<String, DistributionFactory>()
+    {
+        public DistributionFactory apply(String s)
+        {
+            return get(s);
+        }
+    };
+
+    private static final Pattern FULL = Pattern.compile("(~?)([A-Z]+)\\((.+)\\)", Pattern.CASE_INSENSITIVE);
+    private static final Pattern ARGS = Pattern.compile("[^,]+");
+
+    final String prefix;
+    private String spec;
+    private final String defaultSpec;
+    private final String description;
+    private final boolean required;
+
+    public OptionDistribution(String prefix, String defaultSpec, String description)
+    {
+        this(prefix, defaultSpec, description, defaultSpec == null);
+    }
+
+    public OptionDistribution(String prefix, String defaultSpec, String description, boolean required)
+    {
+        this.prefix = prefix;
+        this.defaultSpec = defaultSpec;
+        this.description = description;
+        this.required = required;
+    }
+
+    @Override
+    public boolean accept(String param)
+    {
+        if (!param.toLowerCase().startsWith(prefix))
+            return false;
+        spec = param.substring(prefix.length());
+        return true;
+    }
+
+    public static DistributionFactory get(String spec)
+    {
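+        // a spec looks like "~gaussian(1..10,2)": group(1) is the optional "~" inversion
+        // flag, group(2) the distribution name, group(3) the comma-separated arguments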
+        Matcher m = FULL.matcher(spec);
+        if (!m.matches())
+            throw new IllegalArgumentException("Illegal distribution specification: " + spec);
+        boolean inverse = m.group(1).equals("~");
+        String name = m.group(2);
+        Impl impl = LOOKUP.get(name.toLowerCase());
+        if (impl == null)
+            throw new IllegalArgumentException("Illegal distribution type: " + name);
+        List<String> params = new ArrayList<>();
+        m = ARGS.matcher(m.group(3));
+        while (m.find())
+            params.add(m.group());
+        DistributionFactory factory = impl.getFactory(params);
+        return inverse ? new InverseFactory(factory) : factory;
+    }
+
+    public DistributionFactory get()
+    {
+        return spec != null ? get(spec) : defaultSpec != null ? get(defaultSpec) : null;
+    }
+
+    @Override
+    public boolean happy()
+    {
+        return !required || spec != null;
+    }
+
+    public String longDisplay()
+    {
+        return shortDisplay() + ": " + description;
+    }
+
+    @Override
+    public List<String> multiLineDisplay()
+    {
+        return Arrays.asList(
+                GroupedOptions.formatMultiLine("EXP(min..max)", "An exponential distribution over the range [min..max]"),
+                GroupedOptions.formatMultiLine("EXTREME(min..max,shape)", "An extreme value (Weibull) distribution over the range [min..max]"),
+                GroupedOptions.formatMultiLine("QEXTREME(min..max,shape,quantas)", "An extreme value, split into quantas, within which the chance of selection is uniform"),
+                GroupedOptions.formatMultiLine("GAUSSIAN(min..max,stdvrng)", "A gaussian/normal distribution, where mean=(min+max)/2, and stdev is (mean-min)/stdvrng"),
+                GroupedOptions.formatMultiLine("GAUSSIAN(min..max,mean,stdev)", "A gaussian/normal distribution, with explicitly defined mean and stdev"),
+                GroupedOptions.formatMultiLine("UNIFORM(min..max)", "A uniform distribution over the range [min, max]"),
+                GroupedOptions.formatMultiLine("FIXED(val)", "A fixed distribution, always returning the same value"),
+                "Preceding the name with ~ will invert the distribution, e.g. ~exp(1..10) will yield 10 most, instead of least, often",
+                "Aliases: extr, qextr, gauss, normal, norm, weibull"
+        );
+    }
+
+    boolean setByUser()
+    {
+        return spec != null;
+    }
+
+    @Override
+    public String shortDisplay()
+    {
+        return (defaultSpec != null ? "[" : "") + prefix + "DIST(?)" + (defaultSpec != null ? "]" : "");
+    }
+
+    private static final Map<String, Impl> LOOKUP;
+    static
+    {
+        final Map<String, Impl> lookup = new HashMap<>();
+        lookup.put("exp", new ExponentialImpl());
+        lookup.put("extr", new ExtremeImpl());
+        lookup.put("qextr", new QuantizedExtremeImpl());
+        lookup.put("extreme", lookup.get("extr"));
+        lookup.put("qextreme", lookup.get("qextr"));
+        lookup.put("weibull", lookup.get("weibull"));
+        lookup.put("gaussian", new GaussianImpl());
+        lookup.put("normal", lookup.get("gaussian"));
+        lookup.put("gauss", lookup.get("gaussian"));
+        lookup.put("norm", lookup.get("gaussian"));
+        lookup.put("uniform", new UniformImpl());
+        lookup.put("fixed", new FixedImpl());
+        LOOKUP = lookup;
+    }
+
+    // factory builders
+
+    private static interface Impl
+    {
+        public DistributionFactory getFactory(List<String> params);
+    }
+
+    public static long parseLong(String value)
+    {
+        long multiplier = 1;
+        value = value.trim().toLowerCase();
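+        // intentional fall-through: 'b' multiplies by 1000 three times (10^9),
+        // 'm' twice (10^6), 'k' once (10^3)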
+        switch (value.charAt(value.length() - 1))
+        {
+            case 'b':
+                multiplier *= 1000;
+            case 'm':
+                multiplier *= 1000;
+            case 'k':
+                multiplier *= 1000;
+                value = value.substring(0, value.length() - 1);
+        }
+        return Long.parseLong(value) * multiplier;
+    }
+
+    private static final class GaussianImpl implements Impl
+    {
+
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() > 3 || params.size() < 1)
+                throw new IllegalArgumentException("Invalid parameter list for gaussian distribution: " + params);
+            try
+            {
+                String[] bounds = params.get(0).split("\\.\\.+");
+                final long min = parseLong(bounds[0]);
+                final long max = parseLong(bounds[1]);
+                final double mean, stdev;
+                if (params.size() == 3)
+                {
+                    mean = Double.parseDouble(params.get(1));
+                    stdev = Double.parseDouble(params.get(2));
+                }
+                else
+                {
+                    final double stdevsToEdge = params.size() == 1 ? 3d : Double.parseDouble(params.get(1));
+                    mean = (min + max) / 2d;
+                    stdev = ((max - min) / 2d) / stdevsToEdge;
+                }
+                return new GaussianFactory(min, max, mean, stdev);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            }
+        }
+    }
+
+    private static final class ExponentialImpl implements Impl
+    {
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() != 1)
+                throw new IllegalArgumentException("Invalid parameter list for gaussian distribution: " + params);
+            try
+            {
+                String[] bounds = params.get(0).split("\\.\\.+");
+                final long min = parseLong(bounds[0]);
+                final long max = parseLong(bounds[1]);
+                ExponentialDistribution findBounds = new ExponentialDistribution(1d);
+                // max probability should be roughly equal to accuracy of (max-min) to ensure all values are visitable,
+                // over entire range, but this results in overly skewed distribution, so take sqrt
+                final double mean = (max - min) / findBounds.inverseCumulativeProbability(1d - Math.sqrt(1d/(max-min)));
+                return new ExpFactory(min, max, mean);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            }
+        }
+    }
+
+    private static final class ExtremeImpl implements Impl
+    {
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() != 2)
+                throw new IllegalArgumentException("Invalid parameter list for extreme (Weibull) distribution: " + params);
+            try
+            {
+                String[] bounds = params.get(0).split("\\.\\.+");
+                final long min = parseLong(bounds[0]);
+                final long max = parseLong(bounds[1]);
+                final double shape = Double.parseDouble(params.get(1));
+                WeibullDistribution findBounds = new WeibullDistribution(shape, 1d);
+                // max probability should be roughly equal to accuracy of (max-min) to ensure all values are visitable,
+                // over entire range, but this results in overly skewed distribution, so take sqrt
+                final double scale = (max - min) / findBounds.inverseCumulativeProbability(1d - Math.sqrt(1d/(max-min)));
+                return new ExtremeFactory(min, max, shape, scale);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for extreme (Weibull) distribution: " + params);
+            }
+        }
+    }
+
+    private static final class QuantizedExtremeImpl implements Impl
+    {
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() != 3)
+                throw new IllegalArgumentException("Invalid parameter list for quantized extreme (Weibull) distribution: " + params);
+            try
+            {
+                String[] bounds = params.get(0).split("\\.\\.+");
+                final long min = parseLong(bounds[0]);
+                final long max = parseLong(bounds[1]);
+                final double shape = Double.parseDouble(params.get(1));
+                final int quantas = Integer.parseInt(params.get(2));
+                WeibullDistribution findBounds = new WeibullDistribution(shape, 1d);
+                // max probability should be roughly equal to accuracy of (max-min) to ensure all values are visitable,
+                // over entire range, but this results in overly skewed distribution, so take sqrt
+                final double scale = (max - min) / findBounds.inverseCumulativeProbability(1d - Math.sqrt(1d/(max-min)));
+                return new QuantizedExtremeFactory(min, max, shape, scale, quantas);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for quantized extreme (Weibull) distribution: " + params);
+            }
+        }
+    }
+
+    private static final class UniformImpl implements Impl
+    {
+
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() != 1)
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            try
+            {
+                String[] bounds = params.get(0).split("\\.\\.+");
+                final long min = parseLong(bounds[0]);
+                final long max = parseLong(bounds[1]);
+                return new UniformFactory(min, max);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            }
+        }
+    }
+
+    private static final class FixedImpl implements Impl
+    {
+
+        @Override
+        public DistributionFactory getFactory(List<String> params)
+        {
+            if (params.size() != 1)
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            try
+            {
+                final long key = parseLong(params.get(0));
+                return new FixedFactory(key);
+            } catch (Exception _)
+            {
+                throw new IllegalArgumentException("Invalid parameter list for uniform distribution: " + params);
+            }
+        }
+    }
+
+    private static final class InverseFactory implements DistributionFactory
+    {
+        final DistributionFactory wrapped;
+        private InverseFactory(DistributionFactory wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public Distribution get()
+        {
+            return new DistributionInverted(wrapped.get());
+        }
+    }
+
+    // factories
+
+    private static final class ExpFactory implements DistributionFactory
+    {
+        final long min, max;
+        final double mean;
+        private ExpFactory(long min, long max, double mean)
+        {
+            this.min = min;
+            this.max = max;
+            this.mean = mean;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionOffsetApache(new ExponentialDistribution(new JDKRandomGenerator(), mean, ExponentialDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY), min, max);
+        }
+    }
+
+    private static class ExtremeFactory implements DistributionFactory
+    {
+        final long min, max;
+        final double shape, scale;
+        private ExtremeFactory(long min, long max, double shape, double scale)
+        {
+            this.min = min;
+            this.max = max;
+            this.shape = shape;
+            this.scale = scale;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionOffsetApache(new WeibullDistribution(new JDKRandomGenerator(), shape, scale, WeibullDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY), min, max);
+        }
+    }
+
+    private static final class QuantizedExtremeFactory extends ExtremeFactory
+    {
+        final int quantas;
+        private QuantizedExtremeFactory(long min, long max, double shape, double scale, int quantas)
+        {
+            super(min, max, shape, scale);
+            this.quantas = quantas;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionQuantized(new DistributionOffsetApache(new WeibullDistribution(new JDKRandomGenerator(), shape, scale, WeibullDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY), min, max), quantas);
+        }
+    }
+
+    private static final class GaussianFactory implements DistributionFactory
+    {
+        final long min, max;
+        final double mean, stdev;
+        private GaussianFactory(long min, long max, double mean, double stdev)
+        {
+            this.min = min;
+            this.max = max;
+            this.stdev = stdev;
+            this.mean = mean;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionBoundApache(new NormalDistribution(new JDKRandomGenerator(), mean, stdev, NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY), min, max);
+        }
+    }
+
+    private static final class UniformFactory implements DistributionFactory
+    {
+        final long min, max;
+        private UniformFactory(long min, long max)
+        {
+            this.min = min;
+            this.max = max;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionBoundApache(new UniformRealDistribution(new JDKRandomGenerator(), min, max + 1), min, max);
+        }
+    }
+
+    private static final class FixedFactory implements DistributionFactory
+    {
+        final long key;
+        private FixedFactory(long key)
+        {
+            this.key = key;
+        }
+
+        @Override
+        public Distribution get()
+        {
+            return new DistributionFixed(key);
+        }
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return prefix.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object that)
+    {
+        return super.equals(that) && ((OptionDistribution) that).prefix.equals(this.prefix);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionEnumProbabilities.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionEnumProbabilities.java
new file mode 100644
index 0000000..71d7a8d
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionEnumProbabilities.java
@@ -0,0 +1,84 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+
+
+public final class OptionEnumProbabilities<T> extends OptionMulti
+{
+    final List<OptMatcher<T>> options;
+
+    public static class Opt<T>
+    {
+        final T option;
+        final String defaultValue;
+
+        public Opt(T option, String defaultValue)
+        {
+            this.option = option;
+            this.defaultValue = defaultValue;
+        }
+    }
+
+    private static final class OptMatcher<T> extends OptionSimple
+    {
+        final T opt;
+        OptMatcher(T opt, String defaultValue)
+        {
+            super(opt.toString().toLowerCase() + "=", "[0-9]+(\\.[0-9]+)?", defaultValue, "Performs this many " + opt + " operations out of total", false);
+            this.opt = opt;
+        }
+    }
+
+    public OptionEnumProbabilities(List<Opt<T>> universe, String name, String description)
+    {
+        super(name, description, false);
+        List<OptMatcher<T>> options = new ArrayList<>();
+        for (Opt<T> option : universe)
+            options.add(new OptMatcher<T>(option.option, option.defaultValue));
+        this.options = options;
+    }
+
+    @Override
+    public List<? extends Option> options()
+    {
+        return options;
+    }
+
+    Map<T, Double> ratios()
+    {
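+        // if the user supplied any ratios, use only those; otherwise fall back to the declared defaults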
+        List<? extends Option> ratiosIn = setByUser() ? optionsSetByUser() : defaultOptions();
+        Map<T, Double> ratiosOut = new HashMap<>();
+        for (Option opt : ratiosIn)
+        {
+            OptMatcher<T> optMatcher = (OptMatcher<T>) opt;
+            double d = Double.parseDouble(optMatcher.value());
+            ratiosOut.put(optMatcher.opt, d);
+        }
+        return ratiosOut;
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionMulti.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionMulti.java
new file mode 100644
index 0000000..32bfc65
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionMulti.java
@@ -0,0 +1,218 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * For specifying multiple grouped sub-options in the form: group(arg1=,arg2,arg3) etc.
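+ * e.g. replication(strategy=org.apache.cassandra.locator.SimpleStrategy,factor=3)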
+ */
+abstract class OptionMulti extends Option
+{
+
+    private static final Pattern ARGS = Pattern.compile("([^,]+)", Pattern.CASE_INSENSITIVE);
+
+    private final class Delegate extends GroupedOptions
+    {
+        @Override
+        public List<? extends Option> options()
+        {
+            if (collectAsMap == null)
+                return OptionMulti.this.options();
+
+            List<Option> options = new ArrayList<>(OptionMulti.this.options());
+            options.add(collectAsMap);
+            return options;
+        }
+    }
+
+    protected abstract List<? extends Option> options();
+
+    public Map<String, String> extraOptions()
+    {
+        return collectAsMap == null ? new HashMap<String, String>() : collectAsMap.options;
+    }
+
+    private final String name;
+    private final Pattern pattern;
+    private final String description;
+    private final Delegate delegate = new Delegate();
+    private final CollectAsMap collectAsMap;
+
+    public OptionMulti(String name, String description, boolean collectExtraOptionsInMap)
+    {
+        this.name = name;
+        pattern = Pattern.compile(name + "\\((.*)\\)", Pattern.CASE_INSENSITIVE);
+        this.description = description;
+        this.collectAsMap = collectExtraOptionsInMap ? new CollectAsMap() : null;
+    }
+
+    @Override
+    public boolean accept(String param)
+    {
+        Matcher m = pattern.matcher(param);
+        if (!m.matches())
+            return false;
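+        // the argument matched "<name>(<args>)"; walk the comma-separated args, handing each to the grouped
+        // sub-options and rejecting anything empty or unrecognised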
+        m = ARGS.matcher(m.group(1));
+        int last = -1;
+        while (m.find())
+        {
+            if (m.start() != last + 1)
+                throw new IllegalArgumentException("Invalid " + name + " specification: " + param);
+            last = m.end();
+            if (!delegate.accept(m.group()))
+                throw new IllegalArgumentException("Invalid " + name + " specification: " + m.group());
+        }
+        return true;
+    }
+
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(name);
+        sb.append("(");
+        for (Option option : delegate.options())
+        {
+            sb.append(option);
+            sb.append(",");
+        }
+        sb.append(")");
+        return sb.toString();
+    }
+
+    @Override
+    public String shortDisplay()
+    {
+        return (happy() ? "[" : "") + name + "(?)" + (happy() ? "]" : "");
+    }
+
+    @Override
+    public String longDisplay()
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(name);
+        sb.append("(");
+        for (Option opt : delegate.options())
+        {
+            sb.append(opt.shortDisplay());
+        }
+        sb.append("): ");
+        sb.append(description);
+        return sb.toString();
+    }
+
+    @Override
+    public List<String> multiLineDisplay()
+    {
+        final List<String> r = new ArrayList<>();
+        for (Option option : options())
+            r.add(option.longDisplay());
+        return r;
+    }
+
+    @Override
+    boolean happy()
+    {
+        return delegate.happy();
+    }
+
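+    // soaks up any key=value arguments that none of the declared sub-options accept, when enabled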
+    private static final class CollectAsMap extends Option
+    {
+
+        static final String description = "Extra options";
+        Map<String, String> options = new LinkedHashMap<>();
+
+        boolean accept(String param)
+        {
+            String[] args = param.split("=");
+            if (args.length == 2 && args[1].length() > 0 && args[0].length() > 0)
+            {
+                if (options.put(args[0], args[1]) != null)
+                    throw new IllegalArgumentException(args[0] + " set twice");
+                return true;
+            }
+            return false;
+        }
+
+        boolean happy()
+        {
+            return true;
+        }
+
+        String shortDisplay()
+        {
+            return "[<option 1..N>=?]";
+        }
+
+        String longDisplay()
+        {
+            return GroupedOptions.formatLong(shortDisplay(), description);
+        }
+
+        List<String> multiLineDisplay()
+        {
+            return Collections.emptyList();
+        }
+
+        boolean setByUser()
+        {
+            return !options.isEmpty();
+        }
+    }
+
+    List<Option> optionsSetByUser()
+    {
+        List<Option> r = new ArrayList<>();
+        for (Option option : delegate.options())
+            if (option.setByUser())
+                r.add(option);
+        return r;
+    }
+
+    List<Option> defaultOptions()
+    {
+        List<Option> r = new ArrayList<>();
+        for (Option option : delegate.options())
+            if (!option.setByUser() && option.happy())
+                r.add(option);
+        return r;
+    }
+
+    boolean setByUser()
+    {
+        for (Option option : delegate.options())
+            if (option.setByUser())
+                return true;
+        return false;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java
new file mode 100644
index 0000000..756536f
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java
@@ -0,0 +1,178 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Function;
+import org.apache.commons.math3.distribution.ExponentialDistribution;
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.UniformRealDistribution;
+import org.apache.commons.math3.distribution.WeibullDistribution;
+import org.apache.commons.math3.random.JDKRandomGenerator;
+
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.DistributionBoundApache;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.DistributionFixed;
+import org.apache.cassandra.stress.generate.DistributionInverted;
+import org.apache.cassandra.stress.generate.DistributionOffsetApache;
+import org.apache.cassandra.stress.generate.RatioDistribution;
+import org.apache.cassandra.stress.generate.RatioDistributionFactory;
+
+/**
+ * For selecting a ratio distribution: a mathematical distribution whose generated values are divided by a
+ * fixed divisor, e.g. FIXED(1)/2
+ */
+public class OptionRatioDistribution extends Option
+{
+
+    public static final Function<String, RatioDistributionFactory> BUILDER = new Function<String, RatioDistributionFactory>()
+    {
+        public RatioDistributionFactory apply(String s)
+        {
+            return get(s);
+        }
+    };
+
+    private static final Pattern FULL = Pattern.compile("(.*)/([0-9]+[KMB]?)", Pattern.CASE_INSENSITIVE);
+
+    final OptionDistribution delegate;
+    private double divisor;
+    final String defaultSpec;
+
+    public OptionRatioDistribution(String prefix, String defaultSpec, String description)
+    {
+        this(prefix, defaultSpec, description, defaultSpec != null);
+    }
+
+    public OptionRatioDistribution(String prefix, String defaultSpec, String description, boolean required)
+    {
+        delegate = new OptionDistribution(prefix, null, description, required);
+        this.defaultSpec = defaultSpec;
+    }
+
+    @Override
+    public boolean accept(String param)
+    {
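+        // a ratio spec has the form "<distribution>/<divisor>"; the distribution part is handled by the
+        // wrapped OptionDistribution and the divisor scales the generated values down to ratios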
+        Matcher m = FULL.matcher(param);
+        if (!m.matches() || !delegate.accept(m.group(1)))
+            return false;
+        divisor = OptionDistribution.parseLong(m.group(2));
+        return true;
+    }
+
+    public static RatioDistributionFactory get(String spec)
+    {
+        OptionRatioDistribution opt = new OptionRatioDistribution("", "", "", true);
+        if (!opt.accept(spec))
+            throw new IllegalArgumentException("Invalid ratio definition: "+spec);
+        return opt.get();
+    }
+
+    public RatioDistributionFactory get()
+    {
+        if (delegate.setByUser())
+            return new DelegateFactory(delegate.get(), divisor);
+        if (defaultSpec == null)
+            return null;
+        OptionRatioDistribution sub = new OptionRatioDistribution(delegate.prefix, null, null, true);
+        if (!sub.accept(defaultSpec))
+            throw new IllegalStateException("Invalid default spec: " + defaultSpec);
+        return sub.get();
+    }
+
+    @Override
+    public boolean happy()
+    {
+        return delegate.happy();
+    }
+
+    public String longDisplay()
+    {
+        return delegate.longDisplay();
+    }
+
+    @Override
+    public List<String> multiLineDisplay()
+    {
+        return Arrays.asList(
+                GroupedOptions.formatMultiLine("EXP(min..max)/divisor", "An exponential ratio distribution over the range [min..max]/divisor"),
+                GroupedOptions.formatMultiLine("EXTREME(min..max,shape)/divisor", "An extreme value (Weibull) ratio distribution over the range [min..max]/divisor"),
+                GroupedOptions.formatMultiLine("GAUSSIAN(min..max,stdvrng)/divisor", "A gaussian/normal ratio distribution, where mean=(min+max)/2, and stdev is ((mean-min)/stdvrng)/divisor"),
+                GroupedOptions.formatMultiLine("GAUSSIAN(min..max,mean,stdev)/divisor", "A gaussian/normal ratio distribution, with explicitly defined mean and stdev"),
+                GroupedOptions.formatMultiLine("UNIFORM(min..max)/divisor", "A uniform ratio distribution over the range [min, max]/divisor"),
+                GroupedOptions.formatMultiLine("FIXED(val)/divisor", "A fixed ratio distribution, always returning the same value"),
+                "Preceding the name with ~ will invert the distribution, e.g. ~exp(1..10)/10 will yield 0.1 least, instead of most, often",
+                "Aliases: extr, gauss, normal, norm, weibull"
+        );
+    }
+
+    boolean setByUser()
+    {
+        return delegate.setByUser();
+    }
+
+    @Override
+    public String shortDisplay()
+    {
+        return delegate.shortDisplay();
+    }
+
+    // factories
+
+    private static final class DelegateFactory implements RatioDistributionFactory
+    {
+        final DistributionFactory delegate;
+        final double divisor;
+
+        private DelegateFactory(DistributionFactory delegate, double divisor)
+        {
+            this.delegate = delegate;
+            this.divisor = divisor;
+        }
+
+        @Override
+        public RatioDistribution get()
+        {
+            return new RatioDistribution(delegate.get(), divisor);
+        }
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return delegate.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object that)
+    {
+        return super.equals(that) && ((OptionRatioDistribution) that).delegate.equals(this.delegate);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionReplication.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionReplication.java
new file mode 100644
index 0000000..8b65587
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionReplication.java
@@ -0,0 +1,94 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Function;
+
+import org.apache.cassandra.locator.AbstractReplicationStrategy;
+
+/**
+ * For specifying replication options
+ */
+class OptionReplication extends OptionMulti
+{
+
+    private final OptionSimple strategy = new OptionSimple("strategy=", new StrategyAdapter(), "org.apache.cassandra.locator.SimpleStrategy", "The replication strategy to use", false);
+    private final OptionSimple factor = new OptionSimple("factor=", "[0-9]+", "1", "The number of replicas", false);
+
+    public OptionReplication()
+    {
+        super("replication", "Define the replication strategy and any parameters", true);
+    }
+
+    public String getStrategy()
+    {
+        return strategy.value();
+    }
+
+    public Map<String, String> getOptions()
+    {
+        Map<String, String> options = extraOptions();
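+        // replication_factor is filled in automatically for SimpleStrategy, or when factor= was given explicitly,
+        // unless it was already supplied as an extra option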
+        if (!options.containsKey("replication_factor") && (strategy.value().equals("org.apache.cassandra.locator.SimpleStrategy") || factor.setByUser()))
+            options.put("replication_factor", factor.value());
+        return options;
+    }
+
+    protected List<? extends Option> options()
+    {
+        return Arrays.asList(strategy, factor);
+    }
+
+    @Override
+    public boolean happy()
+    {
+        return true;
+    }
+
+    private static final class StrategyAdapter implements Function<String, String>
+    {
+        public String apply(String name)
+        {
+            String strategy = null;
+            for (String fullname : new String[] { name, "org.apache.cassandra.locator." + name })
+            {
+                try
+                {
+                    Class<?> clazz = Class.forName(fullname);
+                    if (!AbstractReplicationStrategy.class.isAssignableFrom(clazz))
+                        throw new IllegalArgumentException(clazz + " is not a replication strategy");
+                    strategy = fullname;
+                    break;
+                }
+                catch (Exception e)
+                {
+                    // not a usable replication strategy under this name; try the next candidate
+                }
+            }
+            if (strategy == null)
+                throw new IllegalArgumentException("Invalid replication strategy: " + name);
+            return strategy;
+        }
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionSimple.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionSimple.java
new file mode 100644
index 0000000..ba26c2a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionSimple.java
@@ -0,0 +1,181 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Function;
+
+/**
+ * For parsing a simple (sub)option for a command/major option
+ */
+class OptionSimple extends Option implements Serializable
+{
+
+    final String displayPrefix;
+    private final Pattern matchPrefix;
+    private final String defaultValue;
+    private final Function<String, String> valueAdapter;
+    private final String description;
+    private final boolean required;
+    private String value;
+
+    private static final class ValueMatcher implements Function<String, String>, Serializable
+    {
+        final Pattern pattern;
+        private ValueMatcher(Pattern pattern)
+        {
+            this.pattern = pattern;
+        }
+        public String apply(String s)
+        {
+            if (!pattern.matcher(s).matches())
+                throw new IllegalArgumentException("Invalid value " + s + "; must match pattern " + pattern);
+            return s;
+        }
+    }
+
+    public OptionSimple(String prefix, String valuePattern, String defaultValue, String description, boolean required)
+    {
+        this(prefix, Pattern.compile(Pattern.quote(prefix), Pattern.CASE_INSENSITIVE),
+             Pattern.compile(valuePattern, Pattern.CASE_INSENSITIVE), defaultValue, description, required);
+    }
+
+    public OptionSimple(String prefix, Function<String, String> valueAdapter, String defaultValue, String description, boolean required)
+    {
+        this(prefix, Pattern.compile(Pattern.quote(prefix), Pattern.CASE_INSENSITIVE), valueAdapter, defaultValue, description, required);
+    }
+
+    public OptionSimple(String displayPrefix, Pattern matchPrefix, Pattern valuePattern, String defaultValue, String description, boolean required)
+    {
+        this(displayPrefix, matchPrefix, new ValueMatcher(valuePattern), defaultValue, description, required);
+    }
+
+    public OptionSimple(String displayPrefix, Pattern matchPrefix, Function<String, String> valueAdapter, String defaultValue, String description, boolean required)
+    {
+        this.displayPrefix = displayPrefix;
+        this.matchPrefix = matchPrefix;
+        this.valueAdapter = valueAdapter;
+        this.defaultValue = defaultValue;
+        this.description = description;
+        this.required = required;
+    }
+
+    public boolean setByUser()
+    {
+        return value != null;
+    }
+
+    public boolean isRequired()
+    {
+        return required;
+    }
+
+    public boolean present()
+    {
+        return value != null || defaultValue != null;
+    }
+
+    public String value()
+    {
+        return value != null ? value : defaultValue;
+    }
+
+    public boolean accept(String param)
+    {
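+        // a parameter is accepted when it starts with this option's prefix; the remainder is validated
+        // (or transformed) by the value adapter before being stored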
+        if (matchPrefix.matcher(param).lookingAt())
+        {
+            if (value != null)
+                throw new IllegalArgumentException("Suboption " + displayPrefix + " has been specified more than once");
+            String v = param.substring(displayPrefix.length());
+            value = valueAdapter.apply(v);
+            assert value != null;
+            return true;
+        }
+        return false;
+    }
+
+    @Override
+    public boolean happy()
+    {
+        return !required || value != null;
+    }
+
+    public String shortDisplay()
+    {
+        StringBuilder sb = new StringBuilder();
+        if (!required)
+            sb.append("[");
+        sb.append(displayPrefix);
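+        // a prefix ending in '=', '<' or '>' takes a value, shown as '?' in the usage summary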
+        if (displayPrefix.endsWith("="))
+            sb.append("?");
+        if (displayPrefix.endsWith("<"))
+            sb.append("?");
+        if (displayPrefix.endsWith(">"))
+            sb.append("?");
+        if (!required)
+            sb.append("]");
+        return sb.toString();
+    }
+
+    public String longDisplay()
+    {
+        if (description.equals("") && defaultValue == null
+            && (valueAdapter instanceof ValueMatcher && ((ValueMatcher) valueAdapter).pattern.pattern().equals("")))
+            return null;
+        StringBuilder sb = new StringBuilder();
+        sb.append(displayPrefix);
+        if (displayPrefix.endsWith("="))
+            sb.append("?");
+        if (displayPrefix.endsWith("<"))
+            sb.append("?");
+        if (displayPrefix.endsWith(">"))
+            sb.append("?");
+        if (defaultValue != null)
+        {
+            sb.append(" (default=");
+            sb.append(defaultValue);
+            sb.append(")");
+        }
+        return GroupedOptions.formatLong(sb.toString(), description);
+    }
+
+    public List<String> multiLineDisplay()
+    {
+        return Collections.emptyList();
+    }
+
+    public int hashCode()
+    {
+        return displayPrefix.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object that)
+    {
+        return that instanceof OptionSimple && ((OptionSimple) that).displayPrefix.equals(this.displayPrefix);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsColumn.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsColumn.java
new file mode 100644
index 0000000..ecdef29
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsColumn.java
@@ -0,0 +1,238 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.stress.generate.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * For parsing column options
+ */
+public class SettingsColumn implements Serializable
+{
+
+    public final int maxColumnsPerKey;
+    public transient List<ByteBuffer> names;
+    public final List<String> namestrs;
+    public final String comparator;
+    public final String timestamp;
+    public final boolean variableColumnCount;
+    public final boolean slice;
+    public final DistributionFactory sizeDistribution;
+    public final DistributionFactory countDistribution;
+
+    public SettingsColumn(GroupedOptions options)
+    {
+        this((Options) options,
+                options instanceof NameOptions ? (NameOptions) options : null,
+                options instanceof CountOptions ? (CountOptions) options : null
+        );
+    }
+
+    public SettingsColumn(Options options, NameOptions name, CountOptions count)
+    {
+        sizeDistribution = options.size.get();
+        {
+            timestamp = options.timestamp.value();
+            comparator = options.comparator.value();
+            AbstractType parsed = null;
+
+            try
+            {
+                parsed = TypeParser.parse(comparator);
+            }
+            catch (Exception e)
+            {
+                System.err.println(e.getMessage());
+                System.exit(1);
+            }
+
+            if (!(parsed instanceof TimeUUIDType || parsed instanceof AsciiType || parsed instanceof UTF8Type))
+            {
+                System.err.println("Currently supported types are: TimeUUIDType, AsciiType, UTF8Type.");
+                System.exit(1);
+            }
+        }
+        if (name != null)
+        {
+            assert count == null;
+
+            AbstractType comparator;
+            try
+            {
+                comparator = TypeParser.parse(this.comparator);
+            } catch (Exception e)
+            {
+                throw new IllegalArgumentException(this.comparator + " is not a valid type");
+            }
+
+            final String[] names = name.name.value().split(",");
+            this.names = new ArrayList<>(names.length);
+
+            for (String columnName : names)
+                this.names.add(comparator.fromString(columnName));
+            Collections.sort(this.names, BytesType.instance);
+            this.namestrs = new ArrayList<>();
+            for (ByteBuffer columnName : this.names)
+                this.namestrs.add(comparator.getString(columnName));
+
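+            // explicit names imply a fixed per-operation column count equal to the number of names supplied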
+            final int nameCount = this.names.size();
+            countDistribution = new DistributionFactory()
+            {
+                @Override
+                public Distribution get()
+                {
+                    return new DistributionFixed(nameCount);
+                }
+            };
+        }
+        else
+        {
+            this.countDistribution = count.count.get();
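+            // without explicit names, synthesise names C0..C<n-1>, where n is the maximum of the count distribution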
+            ByteBuffer[] names = new ByteBuffer[(int) countDistribution.get().maxValue()];
+            String[] namestrs = new String[(int) countDistribution.get().maxValue()];
+            for (int i = 0 ; i < names.length ; i++)
+                names[i] = ByteBufferUtil.bytes("C" + i);
+            Arrays.sort(names, BytesType.instance);
+            try
+            {
+                for (int i = 0 ; i < names.length ; i++)
+                    namestrs[i] = ByteBufferUtil.string(names[i]);
+            }
+            catch (CharacterCodingException e)
+            {
+                throw new RuntimeException(e);
+            }
+            this.names = Arrays.asList(names);
+            this.namestrs = Arrays.asList(namestrs);
+        }
+        maxColumnsPerKey = (int) countDistribution.get().maxValue();
+        variableColumnCount = countDistribution.get().minValue() < maxColumnsPerKey;
+        slice = options.slice.setByUser();
+    }
+
+    // Option Declarations
+
+    private static abstract class Options extends GroupedOptions
+    {
+        final OptionSimple superColumns = new OptionSimple("super=", "[0-9]+", "0", "Number of super columns to use (no super columns used if not specified)", false);
+        final OptionSimple comparator = new OptionSimple("comparator=", "TimeUUIDType|AsciiType|UTF8Type", "AsciiType", "Column Comparator to use", false);
+        final OptionSimple slice = new OptionSimple("slice", "", null, "If set, range slices will be used for reads; otherwise a names query will be used", false);
+        final OptionSimple timestamp = new OptionSimple("timestamp=", "[0-9]+", null, "If set, all columns will be written with the given timestamp", false);
+        final OptionDistribution size = new OptionDistribution("size=", "FIXED(34)", "Cell size distribution");
+    }
+
+    private static final class NameOptions extends Options
+    {
+        final OptionSimple name = new OptionSimple("names=", ".*", null, "Column names", true);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(name, slice, superColumns, comparator, timestamp, size);
+        }
+    }
+
+    private static final class CountOptions extends Options
+    {
+        final OptionDistribution count = new OptionDistribution("n=", "FIXED(5)", "Cell count distribution, per operation");
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(count, slice, superColumns, comparator, timestamp, size);
+        }
+    }
+
+    // CLI Utility Methods
+
+    static SettingsColumn get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-col");
+        if (params == null)
+            return new SettingsColumn(new CountOptions());
+
+        GroupedOptions options = GroupedOptions.select(params, new NameOptions(), new CountOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -col options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsColumn(options);
+    }
+
+    static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-col", new NameOptions(), new CountOptions());
+    }
+
+    static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+
+    /* Custom serialization is used here to make legacy Thrift-based table creation work with StressD. This code requires
+     * the names attribute to be populated. Since the names attribute is held as a List<ByteBuffer>, we convert it
+     * to an array of byte[] on the way out and back to ByteBuffers when it is read in.
+     */
+
+    private void writeObject(ObjectOutputStream oos) throws IOException
+    {
+        oos.defaultWriteObject();
+        ArrayList<byte[]> namesBytes = new ArrayList<>();
+        for (ByteBuffer buffer : this.names)
+            namesBytes.add(ByteBufferUtil.getArray(buffer));
+        oos.writeObject(namesBytes);
+    }
+
+    private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException
+    {
+        ois.defaultReadObject();
+        List<ByteBuffer> namesBuffer = new ArrayList<>();
+        List<byte[]> namesBytes = (List<byte[]>) ois.readObject();
+        for (byte[] bytes : namesBytes)
+            namesBuffer.add(ByteBuffer.wrap(bytes));
+        this.names = new ArrayList<>(namesBuffer);
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommand.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommand.java
new file mode 100644
index 0000000..a1c89e1
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommand.java
@@ -0,0 +1,208 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.operations.OpDistributionFactory;
+import org.apache.cassandra.thrift.ConsistencyLevel;
+
+// Generic command settings - common to read/write/etc
+public abstract class SettingsCommand implements Serializable
+{
+
+    public final Command type;
+    public final long count;
+    public final long duration;
+    public final TimeUnit durationUnits;
+    public final boolean noWarmup;
+    public final ConsistencyLevel consistencyLevel;
+    public final double targetUncertainty;
+    public final int minimumUncertaintyMeasurements;
+    public final int maximumUncertaintyMeasurements;
+
+    public abstract OpDistributionFactory getFactory(StressSettings settings);
+
+    public SettingsCommand(Command type, GroupedOptions options)
+    {
+        this(type, (Options) options,
+                options instanceof Count ? (Count) options : null,
+                options instanceof Duration ? (Duration) options : null,
+                options instanceof Uncertainty ? (Uncertainty) options : null
+        );
+    }
+
+    public SettingsCommand(Command type, Options options, Count count, Duration duration, Uncertainty uncertainty)
+    {
+        this.type = type;
+        this.consistencyLevel = ConsistencyLevel.valueOf(options.consistencyLevel.value().toUpperCase());
+        this.noWarmup = options.noWarmup.setByUser();
+        if (count != null)
+        {
+            this.count = Long.parseLong(count.count.value());
+            this.duration = 0;
+            this.durationUnits = null;
+            this.targetUncertainty = -1;
+            this.minimumUncertaintyMeasurements = -1;
+            this.maximumUncertaintyMeasurements = -1;
+        }
+        else if (duration != null)
+        {
+            this.count = -1;
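+            // durations are written as "<number><unit>", where the trailing character selects seconds, minutes or hours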
+            this.duration = Long.parseLong(duration.duration.value().substring(0, duration.duration.value().length() - 1));
+            switch (duration.duration.value().toLowerCase().charAt(duration.duration.value().length() - 1))
+            {
+                case 's':
+                    this.durationUnits = TimeUnit.SECONDS;
+                    break;
+                case 'm':
+                    this.durationUnits = TimeUnit.MINUTES;
+                    break;
+                case 'h':
+                    this.durationUnits = TimeUnit.HOURS;
+                    break;
+                default:
+                    throw new IllegalStateException();
+            }
+            this.targetUncertainty = -1;
+            this.minimumUncertaintyMeasurements = -1;
+            this.maximumUncertaintyMeasurements = -1;
+        }
+        else
+        {
+            this.count = -1;
+            this.duration = 0;
+            this.durationUnits = null;
+            this.targetUncertainty = Double.parseDouble(uncertainty.uncertainty.value());
+            this.minimumUncertaintyMeasurements = Integer.parseInt(uncertainty.minMeasurements.value());
+            this.maximumUncertaintyMeasurements = Integer.parseInt(uncertainty.maxMeasurements.value());
+        }
+    }
+
+    // Option Declarations
+
+    static abstract class Options extends GroupedOptions
+    {
+        final OptionSimple noWarmup = new OptionSimple("no-warmup", "", null, "Do not warmup the process", false);
+        final OptionSimple consistencyLevel = new OptionSimple("cl=", "ONE|QUORUM|LOCAL_QUORUM|EACH_QUORUM|ALL|ANY", "ONE", "Consistency level to use", false);
+    }
+
+    static class Count extends Options
+    {
+        final OptionSimple count = new OptionSimple("n=", "[0-9]+", null, "Number of operations to perform", true);
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(count, noWarmup, consistencyLevel);
+        }
+    }
+
+    static class Duration extends Options
+    {
+        final OptionSimple duration = new OptionSimple("duration=", "[0-9]+[smh]", null, "Time to run (in seconds, minutes or hours)", true);
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(duration, noWarmup, consistencyLevel);
+        }
+    }
+
+    static class Uncertainty extends Options
+    {
+        final OptionSimple uncertainty = new OptionSimple("err<", "0\\.[0-9]+", "0.02", "Run until the standard error of the mean is below this fraction", false);
+        final OptionSimple minMeasurements = new OptionSimple("n>", "[0-9]+", "30", "Run at least this many iterations before accepting uncertainty convergence", false);
+        final OptionSimple maxMeasurements = new OptionSimple("n<", "[0-9]+", "200", "Run at most this many iterations before accepting uncertainty convergence", false);
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(uncertainty, minMeasurements, maxMeasurements, noWarmup, consistencyLevel);
+        }
+    }
+
+    // CLI Utility Methods
+
+    static SettingsCommand get(Map<String, String[]> clArgs)
+    {
+        for (Command cmd : Command.values())
+        {
+            if (cmd.category == null)
+                continue;
+
+            for (String name : cmd.names)
+            {
+                final String[] params = clArgs.remove(name);
+                if (params == null)
+                    continue;
+
+                switch (cmd.category)
+                {
+                    case BASIC:
+                        return SettingsCommandPreDefined.build(cmd, params);
+                    case MIXED:
+                        return SettingsCommandPreDefinedMixed.build(params);
+                    case USER:
+                        return SettingsCommandUser.build(params);
+                }
+            }
+        }
+        return null;
+    }
+
+/*    static SettingsCommand build(Command type, String[] params)
+    {
+        GroupedOptions options = GroupedOptions.select(params, new Count(), new Duration(), new Uncertainty());
+        if (options == null)
+        {
+            printHelp(type);
+            System.out.println("Invalid " + type + " options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsCommand(type, options);
+    }*/
+
+    static void printHelp(Command type)
+    {
+        printHelp(type.toString().toLowerCase());
+    }
+
+    static void printHelp(String type)
+    {
+        GroupedOptions.printOptions(System.out, type.toLowerCase(), new Uncertainty(), new Count(), new Duration());
+    }
+
+    static Runnable helpPrinter(final Command type)
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp(type);
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java
new file mode 100644
index 0000000..8583e90
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java
@@ -0,0 +1,150 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.generate.values.Bytes;
+import org.apache.cassandra.stress.generate.values.Generator;
+import org.apache.cassandra.stress.generate.values.GeneratorConfig;
+import org.apache.cassandra.stress.generate.values.HexBytes;
+import org.apache.cassandra.stress.operations.FixedOpDistribution;
+import org.apache.cassandra.stress.operations.OpDistribution;
+import org.apache.cassandra.stress.operations.OpDistributionFactory;
+import org.apache.cassandra.stress.operations.predefined.PredefinedOperation;
+import org.apache.cassandra.stress.settings.SettingsCommandPreDefinedMixed.Options;
+import org.apache.cassandra.stress.util.Timer;
+
+// Settings common to the pre-defined (read/write/etc) command types
+public class SettingsCommandPreDefined extends SettingsCommand
+{
+
+    public final DistributionFactory add;
+    public final int keySize;
+
+    public OpDistributionFactory getFactory(final StressSettings settings)
+    {
+        final SeedManager seeds = new SeedManager(settings);
+        return new OpDistributionFactory()
+        {
+            public OpDistribution get(Timer timer)
+            {
+                return new FixedOpDistribution(PredefinedOperation.operation(type, timer, newGenerator(settings, seeds), settings, add));
+            }
+
+            public String desc()
+            {
+                return type.toString();
+            }
+
+            public Iterable<OpDistributionFactory> each()
+            {
+                return Collections.<OpDistributionFactory>singleton(this);
+            }
+        };
+    }
+
+    PartitionGenerator newGenerator(StressSettings settings, SeedManager seeds)
+    {
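+        // pre-defined workloads use a single fixed-size hex partition key, no clustering columns,
+        // and one random-bytes generator per configured column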
+        List<String> names = settings.columns.namestrs;
+        List<Generator> partitionKey = Collections.<Generator>singletonList(new HexBytes("key",
+                                       new GeneratorConfig("randomstrkey", null,
+                                                           OptionDistribution.get("fixed(" + keySize + ")"), null)));
+
+        List<Generator> columns = new ArrayList<>();
+        for (int i = 0 ; i < settings.columns.maxColumnsPerKey ; i++)
+            columns.add(new Bytes(names.get(i), new GeneratorConfig("randomstr" + names.get(i), null, settings.columns.sizeDistribution, null)));
+        return new PartitionGenerator(partitionKey, Collections.<Generator>emptyList(), columns, PartitionGenerator.Order.ARBITRARY, seeds);
+    }
+
+    public SettingsCommandPreDefined(Command type, Options options)
+    {
+        super(type, options.parent);
+        add = options.add.get();
+        keySize = Integer.parseInt(options.keysize.value());
+    }
+
+    // Option Declarations
+
+    static class Options extends GroupedOptions
+    {
+        final SettingsCommand.Options parent;
+        protected Options(SettingsCommand.Options parent)
+        {
+            this.parent = parent;
+        }
+        final OptionDistribution add = new OptionDistribution("add=", "fixed(1)", "Distribution of value of counter increments");
+        final OptionSimple keysize = new OptionSimple("keysize=", "[0-9]+", "10", "Key size in bytes", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return merge(parent.options(), Arrays.asList(add, keysize));
+        }
+
+    }
+
+    // CLI utility methods
+
+    public static SettingsCommandPreDefined build(Command type, String[] params)
+    {
+        GroupedOptions options = GroupedOptions.select(params,
+                new Options(new Uncertainty()),
+                new Options(new Count()),
+                new Options(new Duration()));
+        if (options == null)
+        {
+            printHelp(type);
+            System.out.println("Invalid " + type + " options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsCommandPreDefined(type, (Options) options);
+    }
+
+    static void printHelp(Command type)
+    {
+        printHelp(type.toString().toLowerCase());
+    }
+
+    static void printHelp(String type)
+    {
+        GroupedOptions.printOptions(System.out, type.toLowerCase(), new Uncertainty(), new Count(), new Duration());
+    }
+
+    static Runnable helpPrinter(final Command type)
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp(type);
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java
new file mode 100644
index 0000000..6beb55f
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java
@@ -0,0 +1,153 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.operations.OpDistributionFactory;
+import org.apache.cassandra.stress.operations.SampledOpDistributionFactory;
+import org.apache.cassandra.stress.operations.predefined.PredefinedOperation;
+import org.apache.cassandra.stress.util.Timer;
+
+import org.apache.commons.math3.util.Pair;
+
+// Settings unique to the mixed command type
+public class SettingsCommandPreDefinedMixed extends SettingsCommandPreDefined
+{
+
+    // Ratios for selecting commands - a map from each requested Command to its relative weight
+    private final Map<Command, Double> ratios;
+    private final DistributionFactory clustering;
+
+    public SettingsCommandPreDefinedMixed(Options options)
+    {
+        super(Command.MIXED, options);
+
+        clustering = options.clustering.get();
+        ratios = options.probabilities.ratios();
+        if (ratios.size() == 0)
+            throw new IllegalArgumentException("Must specify at least one command with a non-zero ratio");
+    }
+
+    public OpDistributionFactory getFactory(final StressSettings settings)
+    {
+        final SeedManager seeds = new SeedManager(settings);
+        return new SampledOpDistributionFactory<Command>(ratios, clustering)
+        {
+            protected Operation get(Timer timer, PartitionGenerator generator, Command key)
+            {
+                return PredefinedOperation.operation(key, timer, generator, settings, add);
+            }
+
+            protected PartitionGenerator newGenerator()
+            {
+                return SettingsCommandPreDefinedMixed.this.newGenerator(settings, seeds);
+            }
+        };
+    }
+
+    // Option Declarations
+
+    static class Options extends SettingsCommandPreDefined.Options
+    {
+        static List<OptionEnumProbabilities.Opt<Command>> probabilityOptions = new ArrayList<>();
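+        // the selectable commands and their default weights: read and write default to 1 (an even mix);
+        // every other runnable command is off unless a ratio is supplied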
+        static
+        {
+            for (Command command : Command.values())
+            {
+                if (command.category == null)
+                    continue;
+                String defaultValue;
+                switch (command)
+                {
+                    case MIXED:
+                        continue;
+                    case READ:
+                    case WRITE:
+                        defaultValue = "1";
+                        break;
+                    default:
+                        defaultValue = null;
+                }
+                probabilityOptions.add(new OptionEnumProbabilities.Opt<>(command, defaultValue));
+            }
+        }
+
+        protected Options(SettingsCommand.Options parent)
+        {
+            super(parent);
+        }
+        final OptionDistribution clustering = new OptionDistribution("clustering=", "GAUSSIAN(1..10)", "Distribution of the length of runs of operations of the same kind");
+        final OptionEnumProbabilities probabilities = new OptionEnumProbabilities<>(probabilityOptions, "ratio", "Specify the ratios for operations to perform; e.g. (read=2,write=1) will perform 2 reads for each write");
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return merge(Arrays.asList(clustering, probabilities), super.options());
+        }
+
+    }
+
+    // CLI utility methods
+
+    public static SettingsCommandPreDefinedMixed build(String[] params)
+    {
+        GroupedOptions options = GroupedOptions.select(params,
+                new Options(new SettingsCommand.Uncertainty()),
+                new Options(new SettingsCommand.Count()),
+                new Options(new SettingsCommand.Duration()));
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid MIXED options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsCommandPreDefinedMixed((Options) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "mixed",
+                                    new Options(new SettingsCommand.Uncertainty()),
+                                    new Options(new SettingsCommand.Count()),
+                                    new Options(new SettingsCommand.Duration()));
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java
new file mode 100644
index 0000000..4e2997f
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java
@@ -0,0 +1,149 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableList;
+
+import com.datastax.driver.core.BatchStatement;
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.StressProfile;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.operations.OpDistributionFactory;
+import org.apache.cassandra.stress.operations.SampledOpDistributionFactory;
+import org.apache.cassandra.stress.util.Timer;
+
+import org.apache.commons.math3.util.Pair;
+
+// Settings unique to the user command type
+public class SettingsCommandUser extends SettingsCommand
+{
+
+    // Ratios for selecting operations - a map from each requested operation name to its relative weight
+    private final Map<String, Double> ratios;
+    private final DistributionFactory clustering;
+    public final StressProfile profile;
+
+    public SettingsCommandUser(Options options)
+    {
+        super(Command.USER, options.parent);
+
+        clustering = options.clustering.get();
+        ratios = options.ops.ratios();
+
+        String yamlPath = options.profile.value();
+        File yamlFile = new File(yamlPath);
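+        // if the argument names an existing local file, convert it to a file: URI; otherwise it is assumed to already be a URI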
+        if (yamlFile.exists())
+        {
+            yamlPath = "file:///" + yamlFile.getAbsolutePath();
+        }
+
+        profile = StressProfile.load(URI.create(yamlPath));
+
+        if (ratios.size() == 0)
+            throw new IllegalArgumentException("Must specify at least one command with a non-zero ratio");
+    }
+
+    public OpDistributionFactory getFactory(final StressSettings settings)
+    {
+        final SeedManager seeds = new SeedManager(settings);
+        return new SampledOpDistributionFactory<String>(ratios, clustering)
+        {
+            protected Operation get(Timer timer, PartitionGenerator generator, String key)
+            {
+                if (key.equalsIgnoreCase("insert"))
+                    return profile.getInsert(timer, generator, settings);
+                return profile.getQuery(key, timer, generator, settings);
+            }
+
+            protected PartitionGenerator newGenerator()
+            {
+                return profile.newGenerator(settings, seeds);
+            }
+        };
+    }
+
+    static final class Options extends GroupedOptions
+    {
+        final SettingsCommand.Options parent;
+        protected Options(SettingsCommand.Options parent)
+        {
+            this.parent = parent;
+        }
+        final OptionDistribution clustering = new OptionDistribution("clustering=", "gaussian(1..10)", "Distribution of the length of runs of operations of the same kind");
+        final OptionSimple profile = new OptionSimple("profile=", ".*", null, "Specify the path to a yaml cql3 profile", true);
+        final OptionAnyProbabilities ops = new OptionAnyProbabilities("ops", "Specify the ratios for inserts/queries to perform; e.g. ops(insert=2,<query1>=1) will perform 2 inserts for each query1");
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return merge(Arrays.asList(ops, profile, clustering), parent.options());
+        }
+    }
+
+    // CLI utility methods
+
+    public static SettingsCommandUser build(String[] params)
+    {
+        GroupedOptions options = GroupedOptions.select(params,
+                new Options(new Uncertainty()),
+                new Options(new Duration()),
+                new Options(new Count()));
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid USER options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsCommandUser((Options) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "user",
+                                    new Options(new Uncertainty()),
+                                    new Options(new Count()),
+                                    new Options(new Duration()));
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsErrors.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsErrors.java
new file mode 100644
index 0000000..625f803
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsErrors.java
@@ -0,0 +1,92 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+public class SettingsErrors implements Serializable
+{
+
+    public final boolean ignore;
+    public final int tries;
+
+    public SettingsErrors(Options options)
+    {
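+        // "retries" counts additional attempts, so the total number of tries is retries + 1 (never less than 1)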
+        ignore = options.ignore.setByUser();
+        this.tries = Math.max(1, Integer.parseInt(options.retries.value()) + 1);
+    }
+
+    // Option Declarations
+
+    public static final class Options extends GroupedOptions
+    {
+        final OptionSimple retries = new OptionSimple("retries=", "[0-9]+", "9", "Number of tries to perform for each operation before failing", false);
+        final OptionSimple ignore = new OptionSimple("ignore", "", null, "Do not fail on errors", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(retries, ignore);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsErrors get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-errors");
+        if (params == null)
+            return new SettingsErrors(new Options());
+
+        GroupedOptions options = GroupedOptions.select(params, new Options());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -errors options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsErrors((Options) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-errors", new Options());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java
new file mode 100644
index 0000000..a6c298b
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java
@@ -0,0 +1,103 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import com.datastax.driver.core.BatchStatement;
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.RatioDistributionFactory;
+
+public class SettingsInsert implements Serializable
+{
+
+    public final DistributionFactory revisit;
+    public final DistributionFactory visits;
+    public final DistributionFactory batchsize;
+    public final RatioDistributionFactory selectRatio;
+    public final BatchStatement.Type batchType;
+
+    private SettingsInsert(InsertOptions options)
+    {
+        this.visits = options.visits.get();
+        this.revisit = options.revisit.get();
+        this.batchsize = options.partitions.get();
+        this.selectRatio = options.selectRatio.get();
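+        // batchType stays null when the user does not set it explicitly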
+        this.batchType = !options.batchType.setByUser() ? null : BatchStatement.Type.valueOf(options.batchType.value());
+    }
+
+    // Option Declarations
+
+    private static class InsertOptions extends GroupedOptions
+    {
+        final OptionDistribution visits = new OptionDistribution("visits=", "fixed(1)", "The target number of inserts to split a partition into; if more than one, the partition will be placed in the revisit set");
+        final OptionDistribution revisit = new OptionDistribution("revisit=", "uniform(1..1M)", "The distribution with which we revisit partial writes (see visits); implicitly defines the size of the revisit collection");
+        final OptionDistribution partitions = new OptionDistribution("partitions=", null, "The number of partitions to update in a single batch", false);
+        final OptionSimple batchType = new OptionSimple("batchtype=", "unlogged|logged|counter", null, "Specify the type of batch statement (LOGGED, UNLOGGED or COUNTER)", false);
+        final OptionRatioDistribution selectRatio = new OptionRatioDistribution("select-ratio=", null, "The uniform probability of visiting any CQL row in the generated partition", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(revisit, visits, partitions, batchType, selectRatio);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsInsert get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-insert");
+        if (params == null)
+            return new SettingsInsert(new InsertOptions());
+
+        InsertOptions options = GroupedOptions.select(params, new InsertOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -insert options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsInsert(options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-insert", new InsertOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsLog.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsLog.java
new file mode 100644
index 0000000..5657fb2
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsLog.java
@@ -0,0 +1,120 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+public class SettingsLog implements Serializable
+{
+    public static enum Level
+    {
+        MINIMAL, NORMAL, VERBOSE
+    }
+
+    public final boolean noSummary;
+    public final File file;
+    public final int intervalMillis;
+    public final Level level;
+
+    public SettingsLog(Options options)
+    {
+        noSummary = options.noSummary.setByUser();
+
+        if (options.outputFile.setByUser())
+            file = new File(options.outputFile.value());
+        else
+            file = null;
+
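+        // interval accepts a plain number of seconds, or a value suffixed with "s" or "ms"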
+        String interval = options.interval.value();
+        if (interval.endsWith("ms"))
+            intervalMillis = Integer.parseInt(interval.substring(0, interval.length() - 2));
+        else if (interval.endsWith("s"))
+            intervalMillis = 1000 * Integer.parseInt(interval.substring(0, interval.length() - 1));
+        else
+            intervalMillis = 1000 * Integer.parseInt(interval);
+        if (intervalMillis <= 0)
+            throw new IllegalArgumentException("Log interval must be greater than zero");
+        level = Level.valueOf(options.level.value().toUpperCase());
+    }
+
+    public PrintStream getOutput() throws FileNotFoundException
+    {
+        return file == null ? new PrintStream(System.out) : new PrintStream(file);
+    }
+
+    // Option Declarations
+
+    public static final class Options extends GroupedOptions
+    {
+        final OptionSimple noSummary = new OptionSimple("no-summary", "", null, "Disable printing of aggregate statistics at the end of a test", false);
+        final OptionSimple outputFile = new OptionSimple("file=", ".*", null, "Log to a file", false);
+        final OptionSimple interval = new OptionSimple("interval=", "[0-9]+(ms|s|)", "1s", "Log progress every <value> seconds or milliseconds", false);
+        final OptionSimple level = new OptionSimple("level=", "(minimal|normal|verbose)", "normal", "Logging level (minimal, normal or verbose)", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(level, noSummary, outputFile, interval);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsLog get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-log");
+        if (params == null)
+            return new SettingsLog(new Options());
+
+        GroupedOptions options = GroupedOptions.select(params, new Options());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -log options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsLog((Options) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-log", new Options());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java
new file mode 100644
index 0000000..5735f9d
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java
@@ -0,0 +1,222 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.PrintStream;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.stress.generate.Distribution;
+
+public class SettingsMisc implements Serializable
+{
+
+    static boolean maybeDoSpecial(Map<String, String[]> clArgs)
+    {
+        if (maybePrintHelp(clArgs))
+            return true;
+        if (maybePrintDistribution(clArgs))
+            return true;
+        return false;
+    }
+
+    static final class PrintDistribution extends GroupedOptions
+    {
+        final OptionDistribution dist = new OptionDistribution("dist=", null, "A mathematical distribution");
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(dist);
+        }
+    }
+
+    static boolean maybePrintDistribution(Map<String, String[]> clArgs)
+    {
+        final String[] args = clArgs.get("print");
+        if (args == null)
+            return false;
+        final PrintDistribution dist = new PrintDistribution();
+        if (null == GroupedOptions.select(args, dist))
+        {
+            printHelpPrinter().run();
+            System.out.println("Invalid print options provided, see output for valid options");
+            System.exit(1);
+        }
+        printDistribution(dist.dist.get().get());
+        return true;
+    }
+
+    static void printDistribution(Distribution dist)
+    {
+        PrintStream out = System.out;
+        out.println("% of samples    Range       % of total");
+        String format = "%-16.1f%-12d%12.1f";
+        double rangemax = dist.inverseCumProb(1d) / 100d;
+        for (double d : new double[] { 0.1d, 0.2d, 0.3d, 0.4d, 0.5d, 0.6d, 0.7d, 0.8d, 0.9d, 0.95d, 0.99d, 1d })
+        {
+            double sampleperc = d * 100;
+            long max = dist.inverseCumProb(d);
+            double rangeperc = max / rangemax;
+            out.println(String.format(format, sampleperc, max, rangeperc));
+        }
+    }
+
+    private static boolean maybePrintHelp(Map<String, String[]> clArgs)
+    {
+        if (!clArgs.containsKey("-?") && !clArgs.containsKey("help"))
+            return false;
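+        // Accept both "help <topic>" and "<topic> -?"; a bare "help" prints the general usage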
+        String[] params = clArgs.remove("-?");
+        if (params == null)
+            params = clArgs.remove("help");
+        if (params.length == 0)
+        {
+            if (!clArgs.isEmpty())
+            {
+                if (clArgs.size() == 1)
+                {
+                    String p = clArgs.keySet().iterator().next();
+                    if (clArgs.get(p).length == 0)
+                        params = new String[] {p};
+                }
+            }
+            else
+            {
+                printHelp();
+                return true;
+            }
+        }
+        if (params.length == 1)
+        {
+            printHelp(params[0]);
+            return true;
+        }
+        throw new IllegalArgumentException("Invalid command/option provided to help");
+    }
+
+    public static void printHelp()
+    {
+        System.out.println("Usage:      cassandra-stress <command> [options]");
+        System.out.println("Help usage: cassandra-stress help <command>");
+        System.out.println();
+        System.out.println("---Commands---");
+        for (Command cmd : Command.values())
+        {
+            System.out.println(String.format("%-20s : %s", cmd.toString().toLowerCase(), cmd.description));
+        }
+        System.out.println();
+        System.out.println("---Options---");
+        for (CliOption cmd : CliOption.values())
+        {
+            System.out.println(String.format("-%-20s : %s", cmd.toString().toLowerCase(), cmd.description));
+        }
+    }
+
+    public static void printHelp(String command)
+    {
+        Command cmd = Command.get(command);
+        if (cmd != null)
+        {
+            cmd.printHelp();
+            return;
+        }
+        CliOption opt = CliOption.get(command);
+        if (opt != null)
+        {
+            opt.printHelp();
+            return;
+        }
+        printHelp();
+        throw new IllegalArgumentException("Invalid command or option provided to command help");
+    }
+
+    public static Runnable helpHelpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                System.out.println("Usage: ./bin/cassandra-stress help <command|option>");
+                System.out.println("Commands:");
+                for (Command cmd : Command.values())
+                    System.out.println("    " + cmd.names.toString().replaceAll("\\[|\\]", ""));
+                System.out.println("Options:");
+                for (CliOption op : CliOption.values())
+                    System.out.println("    -" + op.toString().toLowerCase() + (op.extraName != null ? ", " + op.extraName : ""));
+            }
+        };
+    }
+
+    public static Runnable printHelpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                GroupedOptions.printOptions(System.out, "print", new GroupedOptions()
+                {
+                    @Override
+                    public List<? extends Option> options()
+                    {
+                        return Arrays.asList(new OptionDistribution("dist=", null, "A mathematical distribution"));
+                    }
+                });
+            }
+        };
+    }
+
+    public static Runnable sendToDaemonHelpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                System.out.println("Usage: -sendToDaemon <host>");
+                System.out.println();
+                System.out.println("Specify a host running the stress server to send this stress command to");
+            }
+        };
+    }
+
+    public static String getSendToDaemon(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-send-to");
+        if (params == null)
+            params = clArgs.remove("-sendto");
+        if (params == null)
+            return null;
+        if (params.length != 1)
+        {
+            sendToDaemonHelpPrinter().run();
+            System.out.println("Invalid -send-to specifier: " + Arrays.toString(params));
+            System.exit(1);
+        }
+        return params[0];
+
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java
new file mode 100644
index 0000000..1aa745c
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java
@@ -0,0 +1,197 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import com.datastax.driver.core.ProtocolOptions;
+
+public class SettingsMode implements Serializable
+{
+
+    public final ConnectionAPI api;
+    public final ConnectionStyle style;
+    public final CqlVersion cqlVersion;
+    private final String compression;
+
+    public SettingsMode(GroupedOptions options)
+    {
+        if (options instanceof Cql3Options)
+        {
+            cqlVersion = CqlVersion.CQL3;
+            Cql3Options opts = (Cql3Options) options;
+            api = opts.mode().displayPrefix.equals("native") ? ConnectionAPI.JAVA_DRIVER_NATIVE : ConnectionAPI.THRIFT;
+            style = opts.usePrepared.setByUser() ? ConnectionStyle.CQL_PREPARED : ConnectionStyle.CQL;
+            compression = ProtocolOptions.Compression.valueOf(opts.useCompression.value().toUpperCase()).name();
+        }
+        else if (options instanceof Cql3SimpleNativeOptions)
+        {
+            cqlVersion = CqlVersion.CQL3;
+            Cql3SimpleNativeOptions opts = (Cql3SimpleNativeOptions) options;
+            api = ConnectionAPI.SIMPLE_NATIVE;
+            style = opts.usePrepared.setByUser() ? ConnectionStyle.CQL_PREPARED : ConnectionStyle.CQL;
+            compression = ProtocolOptions.Compression.NONE.name();
+        }
+        else if (options instanceof Cql2ThriftOptions)
+        {
+            cqlVersion = CqlVersion.CQL2;
+            api = ConnectionAPI.THRIFT;
+            Cql2ThriftOptions opts = (Cql2ThriftOptions) options;
+            style = opts.usePrepared.setByUser() ? ConnectionStyle.CQL_PREPARED : ConnectionStyle.CQL;
+            compression = ProtocolOptions.Compression.NONE.name();
+        }
+        else if (options instanceof ThriftOptions)
+        {
+            ThriftOptions opts = (ThriftOptions) options;
+            cqlVersion = CqlVersion.NOCQL;
+            api = opts.smart.setByUser() ? ConnectionAPI.THRIFT_SMART : ConnectionAPI.THRIFT;
+            style = ConnectionStyle.THRIFT;
+            compression = ProtocolOptions.Compression.NONE.name();
+        }
+        else
+            throw new IllegalStateException();
+    }
+
+    public ProtocolOptions.Compression compression()
+    {
+        return ProtocolOptions.Compression.valueOf(compression);
+    }
+
+    // Option Declarations
+
+    private static final class Cql3NativeOptions extends Cql3Options
+    {
+        final OptionSimple mode = new OptionSimple("native", "", null, "", true);
+        OptionSimple mode()
+        {
+            return mode;
+        }
+    }
+
+    private static final class Cql3ThriftOptions extends Cql3Options
+    {
+        final OptionSimple mode = new OptionSimple("thrift", "", null, "", true);
+        OptionSimple mode()
+        {
+            return mode;
+        }
+    }
+
+    private static abstract class Cql3Options extends GroupedOptions
+    {
+        final OptionSimple api = new OptionSimple("cql3", "", null, "", true);
+        final OptionSimple usePrepared = new OptionSimple("prepared", "", null, "", false);
+        final OptionSimple useCompression = new OptionSimple("compression=", "none|lz4|snappy", "none", "", false);
+        final OptionSimple port = new OptionSimple("port=", "[0-9]+", "9046", "", false);
+
+        abstract OptionSimple mode();
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(mode(), usePrepared, api, useCompression, port);
+        }
+    }
+
+
+    private static final class Cql3SimpleNativeOptions extends GroupedOptions
+    {
+        final OptionSimple api = new OptionSimple("cql3", "", null, "", true);
+        final OptionSimple useSimpleNative = new OptionSimple("simplenative", "", null, "", true);
+        final OptionSimple usePrepared = new OptionSimple("prepared", "", null, "", false);
+        final OptionSimple port = new OptionSimple("port=", "[0-9]+", "9046", "", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(useSimpleNative, usePrepared, api, port);
+        }
+    }
+
+    private static final class Cql2ThriftOptions extends GroupedOptions
+    {
+        final OptionSimple api = new OptionSimple("cql2", "", null, "", true);
+        final OptionSimple mode = new OptionSimple("thrift", "", null, "", true);
+        final OptionSimple usePrepared = new OptionSimple("prepared", "", null, "", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(mode, api, usePrepared);
+        }
+    }
+
+    private static final class ThriftOptions extends GroupedOptions
+    {
+        final OptionSimple api = new OptionSimple("thrift", "", null, "", true);
+        final OptionSimple smart = new OptionSimple("smart", "", null, "", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(api, smart);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsMode get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-mode");
+        if (params == null)
+        {
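+            // No -mode supplied: default to the CQL3 native protocol with prepared statements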
+            Cql3NativeOptions opts = new Cql3NativeOptions();
+            opts.accept("cql3");
+            opts.accept("native");
+            opts.accept("prepared");
+            return new SettingsMode(opts);
+        }
+
+        GroupedOptions options = GroupedOptions.select(params, new ThriftOptions(), new Cql3NativeOptions(), new Cql3ThriftOptions(), new Cql3SimpleNativeOptions(), new Cql2ThriftOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -mode options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsMode(options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-mode", new ThriftOptions(), new Cql3NativeOptions(), new Cql3ThriftOptions(), new Cql3SimpleNativeOptions(), new Cql2ThriftOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsNode.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsNode.java
new file mode 100644
index 0000000..5297da8
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsNode.java
@@ -0,0 +1,188 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.*;
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.datastax.driver.core.Host;
+
+public class SettingsNode implements Serializable
+{
+    public final List<String> nodes;
+    public final boolean isWhiteList;
+
+    public SettingsNode(Options options)
+    {
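+        // Nodes may be supplied either via a file (one host per line) or as a comma-delimited list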
+        if (options.file.setByUser())
+        {
+            try
+            {
+                String node;
+                List<String> tmpNodes = new ArrayList<String>();
+                BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(options.file.value())));
+                try
+                {
+                    while ((node = in.readLine()) != null)
+                    {
+                        if (node.length() > 0)
+                            tmpNodes.add(node);
+                    }
+                    nodes = Arrays.asList(tmpNodes.toArray(new String[tmpNodes.size()]));
+                }
+                finally
+                {
+                    in.close();
+                }
+            }
+            catch(IOException ioe)
+            {
+                throw new RuntimeException(ioe);
+            }
+
+        }
+        else
+            nodes = Arrays.asList(options.list.value().split(","));
+        isWhiteList = options.whitelist.setByUser();
+    }
+
+    public Set<String> resolveAllPermitted(StressSettings settings)
+    {
+        Set<String> r = new HashSet<>();
+        switch (settings.mode.api)
+        {
+            case THRIFT_SMART:
+            case JAVA_DRIVER_NATIVE:
+                if (!isWhiteList)
+                {
+                    for (Host host : settings.getJavaDriverClient().getCluster().getMetadata().getAllHosts())
+                        r.add(host.getAddress().getHostName());
+                    break;
+                }
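+                // fall through when whitelisted: only the explicitly supplied nodes are permitted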
+            case THRIFT:
+            case SIMPLE_NATIVE:
+                for (InetAddress address : resolveAllSpecified())
+                    r.add(address.getHostName());
+        }
+        return r;
+    }
+
+    public Set<InetAddress> resolveAllSpecified()
+    {
+        Set<InetAddress> r = new HashSet<>();
+        for (String node : nodes)
+        {
+            try
+            {
+                r.add(InetAddress.getByName(node));
+            }
+            catch (UnknownHostException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return r;
+    }
+
+    public Set<InetSocketAddress> resolveAll(int port)
+    {
+        Set<InetSocketAddress> r = new HashSet<>();
+        for (String node : nodes)
+        {
+            try
+            {
+                r.add(new InetSocketAddress(InetAddress.getByName(node), port));
+            }
+            catch (UnknownHostException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return r;
+    }
+
+    public String randomNode()
+    {
+        int index = (int) (Math.random() * nodes.size());
+        if (index >= nodes.size())
+            index = nodes.size() - 1;
+        return nodes.get(index);
+    }
+
+    // Option Declarations
+
+    public static final class Options extends GroupedOptions
+    {
+        final OptionSimple whitelist = new OptionSimple("whitelist", "", null, "Limit communications to the provided nodes", false);
+        final OptionSimple file = new OptionSimple("file=", ".*", null, "Node file (one per line)", false);
+        final OptionSimple list = new OptionSimple("", "[^=,]+(,[^=,]+)*", "localhost", "Comma-delimited list of nodes", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(whitelist, file, list);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsNode get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-node");
+        if (params == null)
+            return new SettingsNode(new Options());
+
+        GroupedOptions options = GroupedOptions.select(params, new Options());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -node options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsNode((Options) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-node", new Options());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPopulation.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPopulation.java
new file mode 100644
index 0000000..da4c282
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPopulation.java
@@ -0,0 +1,176 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.stress.generate.DistributionFactory;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+
+public class SettingsPopulation implements Serializable
+{
+
+    public final DistributionFactory distribution;
+    public final DistributionFactory readlookback;
+    public final PartitionGenerator.Order order;
+    public final boolean wrap;
+    public final long[] sequence;
+
+    public static enum GenerateOrder
+    {
+        ARBITRARY, SHUFFLED, SORTED
+    }
+
+    private SettingsPopulation(GenerateOptions options, DistributionOptions dist, SequentialOptions pop)
+    {
+        this.order = !options.contents.setByUser() ? PartitionGenerator.Order.ARBITRARY : PartitionGenerator.Order.valueOf(options.contents.value().toUpperCase());
+        if (dist != null)
+        {
+            this.distribution = dist.seed.get();
+            this.sequence = null;
+            this.readlookback = null;
+            this.wrap = false;
+        }
+        else
+        {
+            this.distribution = null;
+            String[] bounds = pop.populate.value().split("\\.\\.+");
+            this.sequence = new long[] { OptionDistribution.parseLong(bounds[0]), OptionDistribution.parseLong(bounds[1]) };
+            this.readlookback = pop.lookback.get();
+            this.wrap = !pop.nowrap.setByUser();
+        }
+    }
+
+    public SettingsPopulation(DistributionOptions options)
+    {
+        this(options, options, null);
+    }
+
+    public SettingsPopulation(SequentialOptions options)
+    {
+        this(options, null, options);
+    }
+
+    // Option Declarations
+
+    private static class GenerateOptions extends GroupedOptions
+    {
+        final OptionSimple contents = new OptionSimple("contents=", "(sorted|shuffled)", null, "SORTED or SHUFFLED (intra-)partition order; if not specified, the order will be consistent but arbitrary", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(contents);
+        }
+    }
+
+    private static final class DistributionOptions extends GenerateOptions
+    {
+        final OptionDistribution seed;
+
+        public DistributionOptions(String defaultLimit)
+        {
+            seed = new OptionDistribution("dist=", "gaussian(1.." + defaultLimit + ")", "Seeds are selected from this distribution");
+        }
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return ImmutableList.<Option>builder().add(seed).addAll(super.options()).build();
+        }
+    }
+
+    private static final class SequentialOptions extends GenerateOptions
+    {
+        final OptionSimple populate;
+        final OptionDistribution lookback = new OptionDistribution("read-lookback=", "fixed(1)", "Select read seeds from the recently visited write seeds");
+        final OptionSimple nowrap = new OptionSimple("no-wrap", "", null, "Terminate the stress test once all seeds in the range have been visited", false);
+
+        public SequentialOptions(String defaultLimit)
+        {
+            populate = new OptionSimple("seq=", "[0-9]+\\.\\.+[0-9]+[MBK]?",
+                    "1.." + defaultLimit,
+                    "Generate all seeds in sequence", true);
+        }
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return ImmutableList.<Option>builder().add(populate, nowrap, lookback).addAll(super.options()).build();
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsPopulation get(Map<String, String[]> clArgs, SettingsCommand command)
+    {
+        // default the seed range to the number of operations requested; if running to error convergence (count <= 0), default to 1M
+        String defaultLimit = command.count <= 0 ? "1000000" : Long.toString(command.count);
+
+        String[] params = clArgs.remove("-pop");
+        if (params == null)
+        {
+            // return defaults:
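+            // writes visit seeds sequentially by default; other commands sample seeds from a gaussian distribution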
+            switch(command.type)
+            {
+                case WRITE:
+                case COUNTER_WRITE:
+                    return new SettingsPopulation(new SequentialOptions(defaultLimit));
+                default:
+                    return new SettingsPopulation(new DistributionOptions(defaultLimit));
+            }
+        }
+        GroupedOptions options = GroupedOptions.select(params, new SequentialOptions(defaultLimit), new DistributionOptions(defaultLimit));
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -pop options provided, see output for valid options");
+            System.exit(1);
+        }
+        return options instanceof SequentialOptions ?
+                new SettingsPopulation((SequentialOptions) options) :
+                new SettingsPopulation((DistributionOptions) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-pop", new SequentialOptions("N"), new DistributionOptions("N"));
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPort.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPort.java
new file mode 100644
index 0000000..1e10e37
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsPort.java
@@ -0,0 +1,94 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+public class SettingsPort implements Serializable
+{
+
+    public final int nativePort;
+    public final int thriftPort;
+    public final int jmxPort;
+
+    public SettingsPort(PortOptions options)
+    {
+        nativePort = Integer.parseInt(options.nativePort.value());
+        thriftPort = Integer.parseInt(options.thriftPort.value());
+        jmxPort = Integer.parseInt(options.jmxPort.value());
+    }
+
+    // Option Declarations
+
+    private static final class PortOptions extends GroupedOptions
+    {
+        final OptionSimple nativePort = new OptionSimple("native=", "[0-9]+", "9042", "Use this port for the Cassandra native protocol", false);
+        final OptionSimple thriftPort = new OptionSimple("thrift=", "[0-9]+", "9160", "Use this port for the thrift protocol", false);
+        final OptionSimple jmxPort = new OptionSimple("jmx=", "[0-9]+", "7199", "Use this port for retrieving statistics over jmx", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(nativePort, thriftPort, jmxPort);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsPort get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-port");
+        if (params == null)
+        {
+            return new SettingsPort(new PortOptions());
+        }
+        PortOptions options = GroupedOptions.select(params, new PortOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -port options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsPort(options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-port", new PortOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsRate.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsRate.java
new file mode 100644
index 0000000..0486678
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsRate.java
@@ -0,0 +1,139 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+public class SettingsRate implements Serializable
+{
+
+    public final boolean auto;
+    public final int minThreads;
+    public final int maxThreads;
+    public final int threadCount;
+    public final int opRateTargetPerSecond;
+
+    public SettingsRate(ThreadOptions options)
+    {
+        auto = false;
+        threadCount = Integer.parseInt(options.threads.value());
+        String rateOpt = options.rate.value();
+        opRateTargetPerSecond = Integer.parseInt(rateOpt.substring(0, rateOpt.length() - 2));
+        minThreads = -1;
+        maxThreads = -1;
+    }
+
+    public SettingsRate(AutoOptions auto)
+    {
+        this.auto = auto.auto.setByUser();
+        this.minThreads = Integer.parseInt(auto.minThreads.value());
+        this.maxThreads = Integer.parseInt(auto.maxThreads.value());
+        this.threadCount = -1;
+        this.opRateTargetPerSecond = 0;
+    }
+
+
+    // Option Declarations
+
+    private static final class AutoOptions extends GroupedOptions
+    {
+        final OptionSimple auto = new OptionSimple("auto", "", null, "stop increasing threads once throughput saturates", false);
+        final OptionSimple minThreads = new OptionSimple("threads>=", "[0-9]+", "4", "run at least this many clients concurrently", false);
+        final OptionSimple maxThreads = new OptionSimple("threads<=", "[0-9]+", "1000", "run at most this many clients concurrently", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(minThreads, maxThreads, auto);
+        }
+    }
+
+    private static final class ThreadOptions extends GroupedOptions
+    {
+        final OptionSimple threads = new OptionSimple("threads=", "[0-9]+", null, "run this many clients concurrently", true);
+        final OptionSimple rate = new OptionSimple("limit=", "[0-9]+/s", "0/s", "limit operations per second across all clients", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(threads, rate);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsRate get(Map<String, String[]> clArgs, SettingsCommand command)
+    {
+        String[] params = clArgs.remove("-rate");
+        if (params == null)
+        {
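+            // No -rate supplied: fixed-count writes default to 200 threads, everything else to auto mode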
+            switch (command.type)
+            {
+                case WRITE:
+                case COUNTER_WRITE:
+                    if (command.count > 0)
+                    {
+                        ThreadOptions options = new ThreadOptions();
+                        options.accept("threads=200");
+                        return new SettingsRate(options);
+                    }
+            }
+            AutoOptions options = new AutoOptions();
+            options.accept("auto");
+            return new SettingsRate(options);
+        }
+        GroupedOptions options = GroupedOptions.select(params, new AutoOptions(), new ThreadOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -rate options provided, see output for valid options");
+            System.exit(1);
+        }
+        if (options instanceof AutoOptions)
+            return new SettingsRate((AutoOptions) options);
+        else if (options instanceof ThreadOptions)
+            return new SettingsRate((ThreadOptions) options);
+        else
+            throw new IllegalStateException();
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-rate", new ThreadOptions(), new AutoOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsSchema.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsSchema.java
new file mode 100644
index 0000000..6e3a02e
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsSchema.java
@@ -0,0 +1,210 @@
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+package org.apache.cassandra.stress.settings;
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.thrift.ConsistencyLevel;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class SettingsSchema implements Serializable
+{
+
+    public static final String DEFAULT_VALIDATOR  = "BytesType";
+
+    private final String replicationStrategy;
+    private final Map<String, String> replicationStrategyOptions;
+
+    private final String compression;
+    private final String compactionStrategy;
+    private final Map<String, String> compactionStrategyOptions;
+    public final String keyspace;
+
+    public SettingsSchema(Options options, SettingsCommand command)
+    {
+        if (command instanceof SettingsCommandUser)
+            keyspace = ((SettingsCommandUser) command).profile.keyspaceName;
+        else
+            keyspace = options.keyspace.value();
+
+        replicationStrategy = options.replication.getStrategy();
+        replicationStrategyOptions = options.replication.getOptions();
+        compression = options.compression.value();
+        compactionStrategy = options.compaction.getStrategy();
+        compactionStrategyOptions = options.compaction.getOptions();
+    }
+
+    public void createKeySpaces(StressSettings settings)
+    {
+        createKeySpacesThrift(settings);
+    }
+
+
+    /**
+     * Create Keyspace with Standard and Super/Counter column families
+     */
+    public void createKeySpacesThrift(StressSettings settings)
+    {
+        KsDef ksdef = new KsDef();
+
+        // column family for standard columns
+        CfDef standardCfDef = new CfDef(keyspace, "Standard1");
+        Map<String, String> compressionOptions = new HashMap<>();
+        if (compression != null)
+            compressionOptions.put("sstable_compression", compression);
+
+        String comparator = settings.columns.comparator;
+        standardCfDef.setComparator_type(comparator)
+                .setDefault_validation_class(DEFAULT_VALIDATOR)
+                .setCompression_options(compressionOptions);
+
+        for (int i = 0; i < settings.columns.names.size(); i++)
+            standardCfDef.addToColumn_metadata(new ColumnDef(settings.columns.names.get(i), "BytesType"));
+
+        // column family for standard counters
+        CfDef counterCfDef = new CfDef(keyspace, "Counter1")
+                .setComparator_type(comparator)
+                .setDefault_validation_class("CounterColumnType")
+                .setCompression_options(compressionOptions);
+
+        ksdef.setName(keyspace);
+        ksdef.setStrategy_class(replicationStrategy);
+
+        if (!replicationStrategyOptions.isEmpty())
+        {
+            ksdef.setStrategy_options(replicationStrategyOptions);
+        }
+
+        if (compactionStrategy != null)
+        {
+            standardCfDef.setCompaction_strategy(compactionStrategy);
+            counterCfDef.setCompaction_strategy(compactionStrategy);
+            if (!compactionStrategyOptions.isEmpty())
+            {
+                standardCfDef.setCompaction_strategy_options(compactionStrategyOptions);
+                counterCfDef.setCompaction_strategy_options(compactionStrategyOptions);
+            }
+        }
+
+        ksdef.setCf_defs(new ArrayList<>(Arrays.asList(standardCfDef, counterCfDef)));
+
+        Cassandra.Client client = settings.getRawThriftClient(false);
+
+        try
+        {
+            client.system_add_keyspace(ksdef);
+
+            /* CQL3 counter cf */
+            client.set_cql_version("3.0.0"); // just to create counter cf for cql3
+
+            client.set_keyspace(keyspace);
+            client.execute_cql3_query(createCounterCFStatementForCQL3(settings), Compression.NONE, ConsistencyLevel.ONE);
+
+            if (settings.mode.cqlVersion.isCql())
+                client.set_cql_version(settings.mode.cqlVersion.connectVersion);
+            /* end */
+
+            System.out.println(String.format("Created keyspaces. Sleeping %ss for propagation.", settings.node.nodes.size()));
+            Thread.sleep(settings.node.nodes.size() * 1000); // seconds
+        }
+        catch (InvalidRequestException e)
+        {
+            System.err.println("Unable to create stress keyspace: " + e.getWhy());
+        }
+        catch (Exception e)
+        {
+            System.err.println("!!!! " + e.getMessage());
+        }
+    }
+
+    private ByteBuffer createCounterCFStatementForCQL3(StressSettings options)
+    {
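+        // Build the CQL3 "Counter3" table: a blob key plus maxColumnsPerKey counter columns (c0, c1, ...)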
+        StringBuilder counter3 = new StringBuilder("CREATE TABLE \"Counter3\" (KEY blob PRIMARY KEY, ");
+
+        for (int i = 0; i < options.columns.maxColumnsPerKey; i++)
+        {
+            counter3.append("c").append(i).append(" counter");
+            if (i != options.columns.maxColumnsPerKey - 1)
+                counter3.append(", ");
+        }
+        counter3.append(");");
+
+        return ByteBufferUtil.bytes(counter3.toString());
+    }
+
+    // Option Declarations
+
+    private static final class Options extends GroupedOptions
+    {
+        final OptionReplication replication = new OptionReplication();
+        final OptionCompaction compaction = new OptionCompaction();
+        final OptionSimple keyspace = new OptionSimple("keyspace=", ".*", "Keyspace1", "The keyspace name to use", false);
+        final OptionSimple compression = new OptionSimple("compression=", ".*", null, "Specify the compression to use for sstables (default: no compression)", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(replication, keyspace, compaction, compression);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsSchema get(Map<String, String[]> clArgs, SettingsCommand command)
+    {
+        String[] params = clArgs.remove("-schema");
+        if (params == null)
+            return new SettingsSchema(new Options(), command);
+
+        if (command instanceof SettingsCommandUser)
+            throw new IllegalArgumentException("-schema can only be provided with predefined operations insert, read, etc.; the 'user' command requires a schema yaml instead");
+
+        GroupedOptions options = GroupedOptions.select(params, new Options());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -schema options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsSchema((Options) options, command);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-schema", new Options());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java
new file mode 100644
index 0000000..8b0ef6a
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java
@@ -0,0 +1,166 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.config.EncryptionOptions;
+import org.apache.cassandra.thrift.ITransportFactory;
+import org.apache.cassandra.thrift.SSLTransportFactory;
+import org.apache.cassandra.thrift.TFramedTransportFactory;
+
+public class SettingsTransport implements Serializable
+{
+
+    private final String fqFactoryClass;
+    private final TOptions options;
+    private ITransportFactory factory;
+
+    public SettingsTransport(TOptions options)
+    {
+        this.options = options;
+        this.fqFactoryClass = options.factory.value();
+        try
+        {
+            Class<?> clazz = Class.forName(fqFactoryClass);
+            if (!ITransportFactory.class.isAssignableFrom(clazz))
+                throw new IllegalArgumentException(clazz + " is not a valid transport factory");
+            // check we can instantiate it
+            clazz.newInstance();
+        }
+        catch (Exception e)
+        {
+            throw new IllegalArgumentException("Invalid transport factory class: " + options.factory.value(), e);
+        }
+    }
+
+    private void configureTransportFactory(ITransportFactory transportFactory, TOptions options)
+    {
+        Map<String, String> factoryOptions = new HashMap<>();
+        // If the supplied factory supports the same set of options as our SSL impl, set those
+        if (transportFactory.supportedOptions().contains(SSLTransportFactory.TRUSTSTORE))
+            factoryOptions.put(SSLTransportFactory.TRUSTSTORE, options.trustStore.value());
+        if (transportFactory.supportedOptions().contains(SSLTransportFactory.TRUSTSTORE_PASSWORD))
+            factoryOptions.put(SSLTransportFactory.TRUSTSTORE_PASSWORD, options.trustStorePw.value());
+        if (transportFactory.supportedOptions().contains(SSLTransportFactory.PROTOCOL))
+            factoryOptions.put(SSLTransportFactory.PROTOCOL, options.protocol.value());
+        if (transportFactory.supportedOptions().contains(SSLTransportFactory.CIPHER_SUITES))
+            factoryOptions.put(SSLTransportFactory.CIPHER_SUITES, options.ciphers.value());
+        // Now check if any of the factory's supported options are set as system properties
+        for (String optionKey : transportFactory.supportedOptions())
+            if (System.getProperty(optionKey) != null)
+                factoryOptions.put(optionKey, System.getProperty(optionKey));
+
+        transportFactory.setOptions(factoryOptions);
+    }
+
+    public synchronized ITransportFactory getFactory()
+    {
+        if (factory == null)
+        {
+            try
+            {
+                this.factory = (ITransportFactory) Class.forName(fqFactoryClass).newInstance();
+                configureTransportFactory(this.factory, this.options);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return factory;
+    }
+
+    public EncryptionOptions.ClientEncryptionOptions getEncryptionOptions()
+    {
+        EncryptionOptions.ClientEncryptionOptions encOptions = new EncryptionOptions.ClientEncryptionOptions();
+        if (options.trustStore.present())
+        {
+            encOptions.enabled = true;
+            encOptions.truststore = options.trustStore.value();
+            encOptions.truststore_password = options.trustStorePw.value();
+            encOptions.algorithm = options.alg.value();
+            encOptions.protocol = options.protocol.value();
+            encOptions.cipher_suites = options.ciphers.value().split(",");
+        }
+        return encOptions;
+    }
+
+    // Option Declarations
+
+    static class TOptions extends GroupedOptions implements Serializable
+    {
+        final OptionSimple factory = new OptionSimple("factory=", ".*", TFramedTransportFactory.class.getName(), "Fully-qualified ITransportFactory class name for creating a connection. Note: For Thrift over SSL, use org.apache.cassandra.thrift.SSLTransportFactory.", false);
+        final OptionSimple trustStore = new OptionSimple("truststore=", ".*", null, "SSL: full path to truststore", false);
+        final OptionSimple trustStorePw = new OptionSimple("truststore-password=", ".*", null, "SSL: truststore password", false);
+        final OptionSimple protocol = new OptionSimple("ssl-protocol=", ".*", "TLS", "SSL: connection protocol to use", false);
+        final OptionSimple alg = new OptionSimple("ssl-alg=", ".*", "SunX509", "SSL: algorithm", false);
+        final OptionSimple storeType = new OptionSimple("store-type=", ".*", "JKS", "SSL: keystore format", false);
+        final OptionSimple ciphers = new OptionSimple("ssl-ciphers=", ".*", "TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA", "SSL: comma-delimited list of cipher suites to use", false);
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return Arrays.asList(factory, trustStore, trustStorePw, protocol, alg, storeType, ciphers);
+        }
+    }
+
+    // CLI Utility Methods
+
+    public static SettingsTransport get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-transport");
+        if (params == null)
+            return new SettingsTransport(new TOptions());
+
+        GroupedOptions options = GroupedOptions.select(params, new TOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -transport options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsTransport((TOptions) options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-transport", new TOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                printHelp();
+            }
+        };
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java
new file mode 100644
index 0000000..ba72821
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java
@@ -0,0 +1,308 @@
+package org.apache.cassandra.stress.settings;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.io.Serializable;
+import java.util.*;
+
+import com.datastax.driver.core.Metadata;
+import com.datastax.driver.core.policies.DCAwareRoundRobinPolicy;
+import com.datastax.driver.core.policies.RoundRobinPolicy;
+import com.datastax.driver.core.policies.WhiteListPolicy;
+import org.apache.cassandra.config.EncryptionOptions;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.SimpleThriftClient;
+import org.apache.cassandra.stress.util.SmartThriftClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.thrift.Cassandra;
+import org.apache.cassandra.thrift.InvalidRequestException;
+import org.apache.cassandra.transport.SimpleClient;
+import org.apache.thrift.protocol.TBinaryProtocol;
+import org.apache.thrift.transport.TTransport;
+
+public class StressSettings implements Serializable
+{
+    public final SettingsCommand command;
+    public final SettingsRate rate;
+    public final SettingsPopulation generate;
+    public final SettingsInsert insert;
+    public final SettingsColumn columns;
+    public final SettingsErrors errors;
+    public final SettingsLog log;
+    public final SettingsMode mode;
+    public final SettingsNode node;
+    public final SettingsSchema schema;
+    public final SettingsTransport transport;
+    public final SettingsPort port;
+    public final String sendToDaemon;
+
+    public StressSettings(SettingsCommand command, SettingsRate rate, SettingsPopulation generate, SettingsInsert insert, SettingsColumn columns, SettingsErrors errors, SettingsLog log, SettingsMode mode, SettingsNode node, SettingsSchema schema, SettingsTransport transport, SettingsPort port, String sendToDaemon)
+    {
+        this.command = command;
+        this.rate = rate;
+        this.insert = insert;
+        this.generate = generate;
+        this.columns = columns;
+        this.errors = errors;
+        this.log = log;
+        this.mode = mode;
+        this.node = node;
+        this.schema = schema;
+        this.transport = transport;
+        this.port = port;
+        this.sendToDaemon = sendToDaemon;
+    }
+
+    private SmartThriftClient tclient;
+
+    /**
+     * Thrift client connection
+     * @return cassandra client connection
+     */
+    public synchronized ThriftClient getThriftClient()
+    {
+        if (mode.api != ConnectionAPI.THRIFT_SMART)
+            return getSimpleThriftClient();
+
+        if (tclient == null)
+            tclient = getSmartThriftClient();
+
+        return tclient;
+    }
+
+    private SmartThriftClient getSmartThriftClient()
+    {
+        Metadata metadata = getJavaDriverClient().getCluster().getMetadata();
+        return new SmartThriftClient(this, schema.keyspace, metadata);
+    }
+
+    /**
+     * Simple (non-smart) Thrift client connection
+     * @return cassandra client connection to a random node
+     */
+    private SimpleThriftClient getSimpleThriftClient()
+    {
+        return new SimpleThriftClient(getRawThriftClient(node.randomNode(), true));
+    }
+
+    public Cassandra.Client getRawThriftClient(boolean setKeyspace)
+    {
+        return getRawThriftClient(node.randomNode(), setKeyspace);
+    }
+
+    public Cassandra.Client getRawThriftClient(String host)
+    {
+        return getRawThriftClient(host, true);
+    }
+
+    public Cassandra.Client getRawThriftClient(String host, boolean setKeyspace)
+    {
+        Cassandra.Client client;
+
+        try
+        {
+            TTransport transport = this.transport.getFactory().openTransport(host, port.thriftPort);
+
+            client = new Cassandra.Client(new TBinaryProtocol(transport));
+
+            if (mode.cqlVersion.isCql())
+                client.set_cql_version(mode.cqlVersion.connectVersion);
+
+            if (setKeyspace)
+                client.set_keyspace(schema.keyspace);
+        }
+        catch (InvalidRequestException e)
+        {
+            throw new RuntimeException(e.getWhy());
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        return client;
+    }
+
+
+    public SimpleClient getSimpleNativeClient()
+    {
+        try
+        {
+            String currentNode = node.randomNode();
+            SimpleClient client = new SimpleClient(currentNode, port.nativePort);
+            client.connect(false);
+            client.execute("USE \"" + schema.keyspace + "\";", org.apache.cassandra.db.ConsistencyLevel.ONE);
+            return client;
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e.getMessage());
+        }
+    }
+
+    private static volatile JavaDriverClient client;
+
+    public JavaDriverClient getJavaDriverClient()
+    {
+        return getJavaDriverClient(true);
+    }
+
+    public JavaDriverClient getJavaDriverClient(boolean setKeyspace)
+    {
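+        // double-checked locking: 'client' is declared volatile, so the unsynchronized fast path
+        // below is safe and only the first caller pays for connection setup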
+        if (client != null)
+            return client;
+
+        try
+        {
+            synchronized (this)
+            {
+                String currentNode = node.randomNode();
+                if (client != null)
+                    return client;
+
+                EncryptionOptions.ClientEncryptionOptions encOptions = transport.getEncryptionOptions();
+                JavaDriverClient c = new JavaDriverClient(this, currentNode, port.nativePort, encOptions);
+                c.connect(mode.compression());
+                if (setKeyspace)
+                    c.execute("USE \"" + schema.keyspace + "\";", org.apache.cassandra.db.ConsistencyLevel.ONE);
+
+                return client = c;
+            }
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public void maybeCreateKeyspaces()
+    {
+        if (command.type == Command.WRITE || command.type == Command.COUNTER_WRITE)
+            schema.createKeySpaces(this);
+        else if (command.type == Command.USER)
+            ((SettingsCommandUser) command).profile.maybeCreateSchema(this);
+    }
+
+    public static StressSettings parse(String[] args)
+    {
+        try
+        {
+            final Map<String, String[]> clArgs = parseMap(args);
+            if (clArgs.containsKey("legacy"))
+                return Legacy.build(Arrays.copyOfRange(args, 1, args.length));
+            if (SettingsMisc.maybeDoSpecial(clArgs))
+                System.exit(1);
+            return get(clArgs);
+        }
+        catch (IllegalArgumentException e)
+        {
+            System.out.println(e.getMessage());
+            System.exit(1);
+            throw new AssertionError();
+        }
+    }
+
+    public static StressSettings get(Map<String, String[]> clArgs)
+    {
+        SettingsCommand command = SettingsCommand.get(clArgs);
+        if (command == null)
+            throw new IllegalArgumentException("No command specified");
+        String sendToDaemon = SettingsMisc.getSendToDaemon(clArgs);
+        SettingsPort port = SettingsPort.get(clArgs);
+        SettingsRate rate = SettingsRate.get(clArgs, command);
+        SettingsPopulation generate = SettingsPopulation.get(clArgs, command);
+        SettingsInsert insert = SettingsInsert.get(clArgs);
+        SettingsColumn columns = SettingsColumn.get(clArgs);
+        SettingsErrors errors = SettingsErrors.get(clArgs);
+        SettingsLog log = SettingsLog.get(clArgs);
+        SettingsMode mode = SettingsMode.get(clArgs);
+        SettingsNode node = SettingsNode.get(clArgs);
+        SettingsSchema schema = SettingsSchema.get(clArgs, command);
+        SettingsTransport transport = SettingsTransport.get(clArgs);
+        if (!clArgs.isEmpty())
+        {
+            printHelp();
+            System.out.println("Error processing command line arguments. The following were ignored:");
+            for (Map.Entry<String, String[]> e : clArgs.entrySet())
+            {
+                System.out.print(e.getKey());
+                for (String v : e.getValue())
+                {
+                    System.out.print(" ");
+                    System.out.print(v);
+                }
+                System.out.println();
+            }
+            System.exit(1);
+        }
+        return new StressSettings(command, rate, generate, insert, columns, errors, log, mode, node, schema, transport, port, sendToDaemon);
+    }
+
+    private static Map<String, String[]> parseMap(String[] args)
+    {
+        // first is the main command/operation, so specified without a -
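+        // e.g. (illustrative) "write n=1000 -rate threads=50" parses to
+        //   { "write" -> ["n=1000"], "-rate" -> ["threads=50"] }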
+        if (args.length == 0)
+        {
+            System.out.println("No command provided");
+            printHelp();
+            System.exit(1);
+        }
+        final LinkedHashMap<String, String[]> r = new LinkedHashMap<>();
+        String key = null;
+        List<String> params = new ArrayList<>();
+        for (int i = 0 ; i < args.length ; i++)
+        {
+            if (i == 0 || args[i].startsWith("-"))
+            {
+                if (i > 0)
+                    putParam(key, params.toArray(new String[0]), r);
+                key = args[i].toLowerCase();
+                params.clear();
+            }
+            else
+                params.add(args[i]);
+        }
+        putParam(key, params.toArray(new String[0]), r);
+        return r;
+    }
+
+    private static void putParam(String key, String[] args, Map<String, String[]> clArgs)
+    {
+        String[] prev = clArgs.put(key, args);
+        if (prev != null)
+            throw new IllegalArgumentException(key + " is defined multiple times. Each option/command can be specified at most once.");
+    }
+
+    public static void printHelp()
+    {
+        SettingsMisc.printHelp();
+    }
+
+    public synchronized void disconnect()
+    {
+        if (client == null)
+            return;
+
+        client.disconnect();
+        client = null;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java b/tools/stress/src/org/apache/cassandra/stress/settings/ValidationType.java
similarity index 63%
copy from src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
copy to tools/stress/src/org/apache/cassandra/stress/settings/ValidationType.java
index e42574b..710b717 100644
--- a/src/java/org/apache/cassandra/io/util/ByteBufferOutputStream.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/ValidationType.java
@@ -1,4 +1,4 @@
-package org.apache.cassandra.io.util;
+package org.apache.cassandra.stress.settings;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,26 +21,9 @@
  */
 
 
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-public class ByteBufferOutputStream extends OutputStream
+public enum ValidationType
 {
-    private final ByteBuffer buffer;
 
-    public ByteBufferOutputStream(ByteBuffer buffer)
-    {
-        this.buffer = buffer;
-    }
+    NOT_FAIL, NON_ZERO, SUBSET, EQUAL
 
-    public void write(int b)
-    {
-        buffer.put((byte) b);
-    }
-
-    @Override
-    public void write(byte[] b, int off, int len)
-    {
-        buffer.put(b, off, len);
-    }
 }
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/CassandraClient.java b/tools/stress/src/org/apache/cassandra/stress/util/CassandraClient.java
deleted file mode 100644
index 5136a55..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/util/CassandraClient.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.util;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.cassandra.thrift.Cassandra.Client;
-import org.apache.thrift.protocol.TProtocol;
-
-public class CassandraClient extends Client
-{
-    public Map<Integer, Integer> preparedStatements = new HashMap<Integer, Integer>();
-
-    public CassandraClient(TProtocol protocol)
-    {
-        super(protocol);
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java b/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java
new file mode 100644
index 0000000..2a38e7d
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java
@@ -0,0 +1,259 @@
+package org.apache.cassandra.stress.util;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.TreeSet;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.cassandra.stress.generate.FasterRandom;
+
+// simple skiplist, guarded by a read/write lock, that permits indexing/removal by position and
+// insertion at the end (though easily extended to insertion at any position, not necessary here)
+// we use it for sampling items by position when visiting writes in the pool of pending writes
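+// illustrative usage: Node<T> handle = list.append(value); ... list.get(index); ... list.remove(handle);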
+public class DynamicList<E>
+{
+
+    // represents a value and an index simultaneously; each node maintains a list
+    // of next pointers for each height in the skip-list this node participates in
+    // (a contiguous range from [0..height))
+    public static class Node<E>
+    {
+        // stores the size of each descendant
+        private final int[] size;
+        // TODO: alternate links to save space
+        private final Node<E>[] links;
+        private final E value;
+
+        private Node(int height, E value)
+        {
+            this.value = value;
+            links = new Node[height * 2];
+            size = new int[height];
+            Arrays.fill(size, 1);
+        }
+
+        private int height()
+        {
+            return size.length;
+        }
+
+        private Node<E> next(int i)
+        {
+            return links[i * 2];
+        }
+
+        private Node<E> prev(int i)
+        {
+            return links[1 + i * 2];
+        }
+
+        private void setNext(int i, Node<E> next)
+        {
+            links[i * 2] = next;
+        }
+
+        private void setPrev(int i, Node<E> prev)
+        {
+            links[1 + i * 2] = prev;
+        }
+
+        private Node parent(int parentHeight)
+        {
+            Node prev = this;
+            while (true)
+            {
+                int height = prev.height();
+                if (parentHeight < height)
+                    return prev;
+                prev = prev.prev(height - 1);
+            }
+        }
+    }
+
+    private final ReadWriteLock lock = new ReentrantReadWriteLock();
+    private final int maxHeight;
+    private final Node<E> head;
+    private int size;
+
+    public DynamicList(int maxExpectedSize)
+    {
+        this.maxHeight = 3 + (int) Math.ceil(Math.log(maxExpectedSize) / Math.log(2));
+        head = new Node<>(maxHeight, null);
+    }
+
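+    // node height is 1 + Binomial(maxHeight - 1, 0.5) distributed (a bitCount over random bits),
+    // so most nodes participate in roughly half of the levels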
+    private int randomLevel()
+    {
+        return 1 + Integer.bitCount(ThreadLocalRandom.current().nextInt() & ((1 << (maxHeight - 1)) - 1));
+    }
+
+    // add the value to the end of the list, and return the associated Node that permits efficient removal
+    // regardless of its future position in the list from other modifications
+    public Node<E> append(E value)
+    {
+        Node<E> newTail = new Node<>(randomLevel(), value);
+
+        lock.writeLock().lock();
+        try
+        {
+            size++;
+
+            Node<E> tail = head;
+            for (int i = maxHeight - 1 ; i >= newTail.height() ; i--)
+            {
+                Node<E> next;
+                while ((next = tail.next(i)) != null)
+                    tail = next;
+                tail.size[i]++;
+            }
+
+            for (int i = newTail.height() - 1 ; i >= 0 ; i--)
+            {
+                Node<E> next;
+                while ((next = tail.next(i)) != null)
+                    tail = next;
+                tail.setNext(i, newTail);
+                newTail.setPrev(i, tail);
+            }
+
+            return newTail;
+        }
+        finally
+        {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // remove the provided node and its associated value from the list
+    public void remove(Node<E> node)
+    {
+        lock.writeLock().lock();
+        try
+        {
+            size--;
+
+            // go up through each level in the skip list, unlinking this node; this entails
+            // simply linking each neighbour to each other, and appending the size of the
+            // current level owned by this node's index to the preceding neighbour (since
+            // ownership is defined as any node that you must visit through the index,
+            // removal of ourselves from a level means the preceding index entry is the
+            // entry point to all of the removed node's descendants)
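+            // e.g. if prev owned 3 descendants at level i and this node owned 2, prev now owns
+            // 3 + 2 - 1 = 4: it absorbs this node's descendants minus the removed node itself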
+            for (int i = 0 ; i < node.height() ; i++)
+            {
+                Node<E> prev = node.prev(i);
+                Node<E> next = node.next(i);
+                assert prev != null;
+                prev.setNext(i, next);
+                if (next != null)
+                    next.setPrev(i, prev);
+                prev.size[i] += node.size[i] - 1;
+            }
+
+            // then go up the levels, removing 1 from the size at each height above ours
+            for (int i = node.height() ; i < maxHeight ; i++)
+            {
+                // if we're at our height limit, we backtrack at our top level until we
+                // hit a neighbour with a greater height
+                while (i == node.height())
+                    node = node.prev(i - 1);
+                node.size[i]--;
+            }
+        }
+        finally
+        {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // retrieve the item at the provided index, or return null if the index is past the end of the list
+    public E get(int index)
+    {
+        lock.readLock().lock();
+        try
+        {
+            if (index >= size)
+                return null;
+
+            index++;
+            int c = 0;
+            Node<E> finger = head;
+            for (int i = maxHeight - 1 ; i >= 0 ; i--)
+            {
+                while (c + finger.size[i] <= index)
+                {
+                    c += finger.size[i];
+                    finger = finger.next(i);
+                }
+            }
+
+            assert c == index;
+            return finger.value;
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
+    }
+
+    // some quick and dirty tests to confirm the skiplist works as intended
+    // don't create a separate unit test - tools tree doesn't currently warrant them
+
+    private boolean isWellFormed()
+    {
+        for (int i = 0 ; i < maxHeight ; i++)
+        {
+            int c = 0;
+            for (Node node = head ; node != null ; node = node.next(i))
+            {
+                if (node.prev(i) != null && node.prev(i).next(i) != node)
+                    return false;
+                if (node.next(i) != null && node.next(i).prev(i) != node)
+                    return false;
+                c += node.size[i];
+                if (i + 1 < maxHeight && node.parent(i + 1).next(i + 1) == node.next(i))
+                {
+                    if (node.parent(i + 1).size[i + 1] != c)
+                        return false;
+                    c = 0;
+                }
+            }
+            if (i == maxHeight - 1 && c != size + 1)
+                return false;
+        }
+        return true;
+    }
+
+    public static void main(String[] args)
+    {
+        DynamicList<Integer> list = new DynamicList<>(20);
+        TreeSet<Integer> canon = new TreeSet<>();
+        HashMap<Integer, Node> nodes = new HashMap<>();
+        int c = 0;
+        for (int i = 0 ; i < 100000 ; i++)
+        {
+            nodes.put(c, list.append(c));
+            canon.add(c);
+            c++;
+        }
+        FasterRandom rand = new FasterRandom();
+        assert list.isWellFormed();
+        for (int loop = 0 ; loop < 100 ; loop++)
+        {
+            System.out.println(loop);
+            for (int i = 0 ; i < 100000 ; i++)
+            {
+                int index = rand.nextInt(100000);
+                Integer seed = list.get(index);
+//                assert canon.headSet(seed, false).size() == index;
+                list.remove(nodes.remove(seed));
+                canon.remove(seed);
+                nodes.put(c, list.append(c));
+                canon.add(c);
+                c++;
+            }
+            assert list.isWellFormed();
+        }
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java
new file mode 100644
index 0000000..2105179
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java
@@ -0,0 +1,172 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.stress.util;
+
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import javax.net.ssl.SSLContext;
+
+import com.datastax.driver.core.*;
+import com.datastax.driver.core.policies.DCAwareRoundRobinPolicy;
+import com.datastax.driver.core.policies.WhiteListPolicy;
+import org.apache.cassandra.config.EncryptionOptions;
+import org.apache.cassandra.security.SSLFactory;
+import io.netty.util.internal.logging.InternalLoggerFactory;
+import io.netty.util.internal.logging.Slf4JLoggerFactory;
+import org.apache.cassandra.stress.settings.StressSettings;
+
+public class JavaDriverClient
+{
+
+    static
+    {
+        InternalLoggerFactory.setDefaultFactory(new Slf4JLoggerFactory());
+    }
+
+    public final String host;
+    public final int port;
+    private final EncryptionOptions.ClientEncryptionOptions encryptionOptions;
+    private Cluster cluster;
+    private Session session;
+    private final WhiteListPolicy whitelist;
+
+    private static final ConcurrentMap<String, PreparedStatement> stmts = new ConcurrentHashMap<>();
+
+    public JavaDriverClient(StressSettings settings, String host, int port)
+    {
+        this(settings, host, port, new EncryptionOptions.ClientEncryptionOptions());
+    }
+
+    public JavaDriverClient(StressSettings settings, String host, int port, EncryptionOptions.ClientEncryptionOptions encryptionOptions)
+    {
+        this.host = host;
+        this.port = port;
+        this.encryptionOptions = encryptionOptions;
+        if (settings.node.isWhiteList)
+            whitelist = new WhiteListPolicy(new DCAwareRoundRobinPolicy(), settings.node.resolveAll(settings.port.nativePort));
+        else
+            whitelist = null;
+    }
+
+    public PreparedStatement prepare(String query)
+    {
+        PreparedStatement stmt = stmts.get(query);
+        if (stmt != null)
+            return stmt;
+        synchronized (stmts)
+        {
+            stmt = stmts.get(query);
+            if (stmt != null)
+                return stmt;
+            stmt = getSession().prepare(query);
+            stmts.put(query, stmt);
+        }
+        return stmt;
+    }
+
+    public void connect(ProtocolOptions.Compression compression) throws Exception
+    {
+        Cluster.Builder clusterBuilder = Cluster.builder()
+                                                .addContactPoint(host)
+                                                .withPort(port)
+                                                .withoutMetrics(); // the driver's Metrics 3 dependency conflicts with the version we bundle
+        if (whitelist != null)
+            clusterBuilder.withLoadBalancingPolicy(whitelist);
+        clusterBuilder.withCompression(compression);
+        if (encryptionOptions.enabled)
+        {
+            SSLContext sslContext;
+            sslContext = SSLFactory.createSSLContext(encryptionOptions, true);
+            SSLOptions sslOptions = new SSLOptions(sslContext, encryptionOptions.cipher_suites);
+            clusterBuilder.withSSL(sslOptions);
+        }
+        cluster = clusterBuilder.build();
+        Metadata metadata = cluster.getMetadata();
+        System.out.printf("Connected to cluster: %s%n",
+                metadata.getClusterName());
+        for (Host host : metadata.getAllHosts())
+        {
+            System.out.printf("Datatacenter: %s; Host: %s; Rack: %s%n",
+                    host.getDatacenter(), host.getAddress(), host.getRack());
+        }
+
+        session = cluster.connect();
+    }
+
+    public Cluster getCluster()
+    {
+        return cluster;
+    }
+
+    public Session getSession()
+    {
+        return session;
+    }
+
+    public ResultSet execute(String query, org.apache.cassandra.db.ConsistencyLevel consistency)
+    {
+        SimpleStatement stmt = new SimpleStatement(query);
+        stmt.setConsistencyLevel(from(consistency));
+        return getSession().execute(stmt);
+    }
+
+    public ResultSet executePrepared(PreparedStatement stmt, List<Object> queryParams, org.apache.cassandra.db.ConsistencyLevel consistency)
+    {
+        stmt.setConsistencyLevel(from(consistency));
+        BoundStatement bstmt = stmt.bind(queryParams.toArray(new Object[queryParams.size()]));
+        return getSession().execute(bstmt);
+    }
+
+    /**
+     * Maps a Cassandra (server-side) ConsistencyLevel to the Java Driver ConsistencyLevel. The driver
+     * has an equivalent conversion, but it is not public.
+     *
+     * @param cl the server-side consistency level
+     * @return the corresponding Java Driver consistency level
+     */
+    public static ConsistencyLevel from(org.apache.cassandra.db.ConsistencyLevel cl)
+    {
+        switch (cl)
+        {
+            case ANY:
+                return com.datastax.driver.core.ConsistencyLevel.ANY;
+            case ONE:
+                return com.datastax.driver.core.ConsistencyLevel.ONE;
+            case TWO:
+                return com.datastax.driver.core.ConsistencyLevel.TWO;
+            case THREE:
+                return com.datastax.driver.core.ConsistencyLevel.THREE;
+            case QUORUM:
+                return com.datastax.driver.core.ConsistencyLevel.QUORUM;
+            case ALL:
+                return com.datastax.driver.core.ConsistencyLevel.ALL;
+            case LOCAL_QUORUM:
+                return com.datastax.driver.core.ConsistencyLevel.LOCAL_QUORUM;
+            case EACH_QUORUM:
+                return com.datastax.driver.core.ConsistencyLevel.EACH_QUORUM;
+        }
+        throw new AssertionError();
+    }
+
+    public void disconnect()
+    {
+        cluster.close();
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/JmxCollector.java b/tools/stress/src/org/apache/cassandra/stress/util/JmxCollector.java
new file mode 100644
index 0000000..8cfbebb
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/JmxCollector.java
@@ -0,0 +1,125 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.stress.util;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.tools.NodeProbe;
+
+public class JmxCollector implements Callable<JmxCollector.GcStats>
+{
+
+    public static class GcStats
+    {
+        public final double count;
+        public final double bytes;
+        public final double maxms;
+        public final double summs;
+        public final double sumsqms;
+        public final double sdvms;
+        public GcStats(double count, double bytes, double maxms, double summs, double sumsqms)
+        {
+            this.count = count;
+            this.bytes = bytes;
+            this.maxms = maxms;
+            this.summs = summs;
+            this.sumsqms = sumsqms;
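+            // derive mean and population standard deviation of GC pause times from the
+            // running sums, using Var(x) = E[x^2] - E[x]^2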
+            double mean = summs / count;
+            double stdev = Math.sqrt((sumsqms / count) - (mean * mean));
+            if (Double.isNaN(stdev))
+                stdev = 0;
+            this.sdvms = stdev;
+        }
+        public GcStats(double fill)
+        {
+            this(fill, fill, fill, fill, fill);
+        }
+        public static GcStats aggregate(List<GcStats> stats)
+        {
+            double count = 0, bytes = 0, maxms = 0, summs = 0, sumsqms = 0;
+            for (GcStats stat : stats)
+            {
+                count += stat.count;
+                bytes += stat.bytes;
+                maxms += stat.maxms;
+                summs += stat.summs;
+                sumsqms += stat.sumsqms;
+            }
+            return new GcStats(count, bytes, maxms, summs, sumsqms);
+        }
+    }
+
+    final NodeProbe[] probes;
+
+    // TODO: should expand to whole cluster
+    public JmxCollector(Collection<String> hosts, int port)
+    {
+        probes = new NodeProbe[hosts.size()];
+        int i = 0;
+        for (String host : hosts)
+        {
+            probes[i] = connect(host, port);
+            probes[i].getAndResetGCStats();
+            i++;
+        }
+    }
+
+    private static NodeProbe connect(String host, int port)
+    {
+        try
+        {
+            return new NodeProbe(host, port);
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public GcStats call() throws Exception
+    {
+        final List<Future<GcStats>> futures = new ArrayList<>();
+        for (final NodeProbe probe : probes)
+        {
+            futures.add(TPE.submit(new Callable<GcStats>()
+            {
+                public GcStats call() throws Exception
+                {
+                    final double[] stats = probe.getAndResetGCStats();
+                    return new GcStats(stats[5], stats[4], stats[1], stats[2], stats[3]);
+                }
+            }));
+        }
+
+        List<GcStats> results = new ArrayList<>();
+        for (Future<GcStats> future : futures)
+            results.add(future.get());
+        return GcStats.aggregate(results);
+    }
+
+    private static final ExecutorService TPE = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(), new NamedThreadFactory("JmxCollector"));
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/Operation.java b/tools/stress/src/org/apache/cassandra/stress/util/Operation.java
deleted file mode 100644
index e2e12f8..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/util/Operation.java
+++ /dev/null
@@ -1,334 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.util;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-import java.util.Map;
-import java.util.HashMap;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Lists;
-
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.apache.cassandra.stress.Session;
-import org.apache.cassandra.stress.Stress;
-import org.apache.cassandra.transport.SimpleClient;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.CqlPreparedResult;
-import org.apache.cassandra.thrift.InvalidRequestException;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Hex;
-import org.apache.cassandra.utils.UUIDGen;
-
-public abstract class Operation
-{
-    public final int index;
-
-    protected final Session session;
-    protected static volatile Double nextGaussian = null;
-
-    public Operation(int idx)
-    {
-        index = idx;
-        session = Stress.session;
-    }
-
-    public Operation(Session client, int idx)
-    {
-        index = idx;
-        session = client;
-    }
-
-    /**
-     * Run operation
-     * @param client Cassandra Thrift client connection
-     * @throws IOException on any I/O error.
-     */
-    public abstract void run(CassandraClient client) throws IOException;
-
-    public void run(SimpleClient client) throws IOException {}
-
-    // Utility methods
-
-    protected List<ByteBuffer> generateValues()
-    {
-        if (session.averageSizeValues)
-        {
-            return generateRandomizedValues();
-        }
-
-        List<ByteBuffer> values = new ArrayList<ByteBuffer>();
-
-        for (int i = 0; i < session.getCardinality(); i++)
-        {
-            String hash = getMD5(Integer.toString(i));
-            int times = session.getColumnSize() / hash.length();
-            int sumReminder = session.getColumnSize() % hash.length();
-
-            String value = multiplyString(hash, times) + hash.substring(0, sumReminder);
-            values.add(ByteBuffer.wrap(value.getBytes()));
-        }
-
-        return values;
-    }
-
-    /**
-     * Generate values of average size specified by -S, up to cardinality specified by -C
-     * @return Collection of the values
-     */
-    protected List<ByteBuffer> generateRandomizedValues()
-    {
-        List<ByteBuffer> values = new ArrayList<ByteBuffer>();
-
-        int limit = 2 * session.getColumnSize();
-
-        for (int i = 0; i < session.getCardinality(); i++)
-        {
-            byte[] value = new byte[Stress.randomizer.nextInt(limit)];
-            Stress.randomizer.nextBytes(value);
-            values.add(ByteBuffer.wrap(value));
-        }
-
-        return values;
-    }
-
-    /**
-     * key generator using Gauss or Random algorithm
-     * @return byte[] representation of the key string
-     */
-    protected byte[] generateKey()
-    {
-        return (session.useRandomGenerator()) ? generateRandomKey() : generateGaussKey();
-    }
-
-    /**
-     * Random key generator
-     * @return byte[] representation of the key string
-     */
-    private byte[] generateRandomKey()
-    {
-        String format = "%0" + session.getTotalKeysLength() + "d";
-        return String.format(format, Stress.randomizer.nextInt(Stress.session.getNumDifferentKeys() - 1)).getBytes(UTF_8);
-    }
-
-    /**
-     * Gauss key generator
-     * @return byte[] representation of the key string
-     */
-    private byte[] generateGaussKey()
-    {
-        String format = "%0" + session.getTotalKeysLength() + "d";
-
-        for (;;)
-        {
-            double token = nextGaussian(session.getMean(), session.getSigma());
-
-            if (0 <= token && token < session.getNumDifferentKeys())
-            {
-                return String.format(format, (int) token).getBytes(UTF_8);
-            }
-        }
-    }
-
-    /**
-     * Gaussian distribution.
-     * @param mu is the mean
-     * @param sigma is the standard deviation
-     *
-     * @return next Gaussian distribution number
-     */
-    private static double nextGaussian(int mu, float sigma)
-    {
-        Random random = Stress.randomizer;
-
-        Double currentState = nextGaussian;
-        nextGaussian = null;
-
-        if (currentState == null)
-        {
-            double x2pi  = random.nextDouble() * 2 * Math.PI;
-            double g2rad = Math.sqrt(-2.0 * Math.log(1.0 - random.nextDouble()));
-
-            currentState = Math.cos(x2pi) * g2rad;
-            nextGaussian = Math.sin(x2pi) * g2rad;
-        }
-
-        return mu + currentState * sigma;
-    }
-
-    /**
-     * MD5 string generation
-     * @param input String
-     * @return md5 representation of the string
-     */
-    private String getMD5(String input)
-    {
-        MessageDigest md = FBUtilities.threadLocalMD5Digest();
-        byte[] messageDigest = md.digest(input.getBytes(UTF_8));
-        StringBuilder hash = new StringBuilder(new BigInteger(1, messageDigest).toString(16));
-
-        while (hash.length() < 32)
-            hash.append("0").append(hash);
-
-        return hash.toString();
-    }
-
-    /**
-     * Equal to python/ruby - 's' * times
-     * @param str String to multiple
-     * @param times multiplication times
-     * @return multiplied string
-     */
-    private String multiplyString(String str, int times)
-    {
-        StringBuilder result = new StringBuilder();
-
-        for (int i = 0; i < times; i++)
-            result.append(str);
-
-        return result.toString();
-    }
-
-    protected ByteBuffer columnName(int index, boolean timeUUIDComparator)
-    {
-        return timeUUIDComparator
-                ? TimeUUIDType.instance.decompose(UUIDGen.getTimeUUID())
-                : ByteBufferUtil.bytes(String.format("C%d", index));
-    }
-
-    protected String getExceptionMessage(Exception e)
-    {
-        String className = e.getClass().getSimpleName();
-        String message = (e instanceof InvalidRequestException) ? ((InvalidRequestException) e).getWhy() : e.getMessage();
-        return (message == null) ? "(" + className + ")" : String.format("(%s): %s", className, message);
-    }
-
-    protected void error(String message) throws IOException
-    {
-        if (!session.ignoreErrors())
-            throw new IOException(message);
-        else
-            System.err.println(message);
-    }
-
-    protected String getUnQuotedCqlBlob(String term, boolean isCQL3)
-    {
-        return getUnQuotedCqlBlob(term.getBytes(), isCQL3);
-    }
-
-    protected String getUnQuotedCqlBlob(byte[] term, boolean isCQL3)
-    {
-        return isCQL3
-             ? "0x" + Hex.bytesToHex(term)
-             : Hex.bytesToHex(term);
-    }
-
-    protected List<ByteBuffer> queryParamsAsByteBuffer(List<String> queryParams)
-    {
-        return Lists.transform(queryParams, new Function<String, ByteBuffer>()
-        {
-            public ByteBuffer apply(String param)
-            {
-                if (param.startsWith("0x"))
-                    param = param.substring(2);
-                return ByteBufferUtil.hexToBytes(param);
-            }
-        });
-    }
-
-    /**
-     * Constructs a CQL query string by replacing instances of the character
-     * '?', with the corresponding parameter.
-     *
-     * @param query base query string to format
-     * @param parms sequence of string query parameters
-     * @return formatted CQL query string
-     */
-    protected static String formatCqlQuery(String query, List<String> parms)
-    {
-        int marker, position = 0;
-        StringBuilder result = new StringBuilder();
-
-        if (-1 == (marker = query.indexOf('?')) || parms.size() == 0)
-            return query;
-
-        for (String parm : parms)
-        {
-            result.append(query.substring(position, marker));
-            result.append(parm);
-
-            position = marker + 1;
-            if (-1 == (marker = query.indexOf('?', position + 1)))
-                break;
-        }
-
-        if (position < query.length())
-            result.append(query.substring(position));
-
-        return result.toString();
-    }
-
-    protected Integer getPreparedStatement(CassandraClient client, String cqlQuery) throws Exception
-    {
-        Integer statementId = client.preparedStatements.get(cqlQuery.hashCode());
-        if (statementId == null)
-        {
-            CqlPreparedResult response = session.cqlVersion.startsWith("3")
-                                       ? client.prepare_cql3_query(ByteBufferUtil.bytes(cqlQuery), Compression.NONE)
-                                       : client.prepare_cql_query(ByteBufferUtil.bytes(cqlQuery), Compression.NONE);
-            statementId = response.itemId;
-            client.preparedStatements.put(cqlQuery.hashCode(), statementId);
-        }
-
-        return statementId;
-    }
-
-    private static final Map<Integer, byte[]> preparedStatementsNative = new HashMap<Integer, byte[]>();
-
-    protected static byte[] getPreparedStatement(SimpleClient client, String cqlQuery) throws Exception
-    {
-        byte[] statementId = preparedStatementsNative.get(cqlQuery.hashCode());
-        if (statementId == null)
-        {
-            statementId = client.prepare(cqlQuery).statementId.bytes;
-            preparedStatementsNative.put(cqlQuery.hashCode(), statementId);
-        }
-        return statementId;
-    }
-
-    protected String wrapInQuotesIfRequired(String string)
-    {
-        return session.cqlVersion.startsWith("3")
-                ? "\"" + string + "\""
-                : string;
-    }
-
-    public interface CQLQueryExecutor
-    {
-        public boolean execute(String query, List<String> queryParameters) throws Exception;
-    }
-}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/SampleOfLongs.java b/tools/stress/src/org/apache/cassandra/stress/util/SampleOfLongs.java
new file mode 100644
index 0000000..8be9f81
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/SampleOfLongs.java
@@ -0,0 +1,128 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+// represents a sample of long (latencies) together with the probability of selection of each sample (i.e. the ratio of
+// samples to total number of events). This is used to ensure that, when merging, the result has samples from each
+// with equal probability
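+// e.g. merging a sample kept with p=1/2 and one kept with p=1/4: targetp becomes 1/4, so elements of
+// the first are re-kept with probability (1/4)/(1/2)=1/2, leaving every original event represented with p=1/4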
+public final class SampleOfLongs
+{
+
+    // nanos
+    final long[] sample;
+
+    // probability with which each sample was selected
+    final double p;
+
+    SampleOfLongs(long[] sample, int p)
+    {
+        this.sample = sample;
+        this.p = 1 / (float) p;
+    }
+
+    SampleOfLongs(long[] sample, double p)
+    {
+        this.sample = sample;
+        this.p = p;
+    }
+
+    static SampleOfLongs merge(Random rnd, List<SampleOfLongs> merge, int maxSamples)
+    {
+        int maxLength = 0;
+        double targetp = 1;
+        for (SampleOfLongs sampleOfLongs : merge)
+        {
+            maxLength += sampleOfLongs.sample.length;
+            targetp = Math.min(targetp, sampleOfLongs.p);
+        }
+        long[] sample = new long[maxLength];
+        int count = 0;
+        for (SampleOfLongs latencies : merge)
+        {
+            long[] in = latencies.sample;
+            double p = targetp / latencies.p;
+            for (int i = 0 ; i < in.length ; i++)
+                if (rnd.nextDouble() < p)
+                    sample[count++] = in[i];
+        }
+        if (count > maxSamples)
+        {
+            targetp = subsample(rnd, maxSamples, sample, count, targetp);
+            count = maxSamples;
+        }
+        sample = Arrays.copyOf(sample, count);
+        Arrays.sort(sample);
+        return new SampleOfLongs(sample, targetp);
+    }
+
+    public SampleOfLongs subsample(Random rnd, int maxSamples)
+    {
+        if (maxSamples > sample.length)
+            return this;
+
+        long[] sample = this.sample.clone();
+        double p = subsample(rnd, maxSamples, sample, sample.length, this.p);
+        sample = Arrays.copyOf(sample, maxSamples);
+        return new SampleOfLongs(sample, p);
+    }
+
+    private static double subsample(Random rnd, int maxSamples, long[] sample, int count, double p)
+    {
+        // want exactly maxSamples, so select random indexes up to maxSamples
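+        // (a partial Fisher-Yates shuffle: after the loop the first maxSamples slots hold a uniform random subset)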
+        for (int i = 0 ; i < maxSamples ; i++)
+        {
+            int take = i + rnd.nextInt(count - i);
+            long tmp = sample[i];
+            sample[i] = sample[take];
+            sample[take] = tmp;
+        }
+
+        // calculate new p; we have selected with probability maxSamples / count,
+        // so multiply p by this probability
+        p *= maxSamples / (double) count;
+        return p;
+    }
+
+    public double medianLatency()
+    {
+        if (sample.length == 0)
+            return 0;
+        return sample[sample.length >> 1] * 0.000001d;
+    }
+
+    // 0 < rank < 1
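+    // returns the latency at the given rank (e.g. 0.95f for the ~95th percentile), converting the stored
+    // nanosecond values to milliseconds; assumes the sample is sorted, as produced by merge()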
+    public double rankLatency(float rank)
+    {
+        if (sample.length == 0)
+            return 0;
+        int index = (int)(rank * sample.length);
+        if (index >= sample.length)
+            index = sample.length - 1;
+        return sample[index] * 0.000001d;
+    }
+
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java b/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java
new file mode 100644
index 0000000..f813e93
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java
@@ -0,0 +1,111 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.thrift.TException;
+
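+// a trivial ThriftClient implementation that delegates every call to a single Cassandra.Client connection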
+public class SimpleThriftClient implements ThriftClient
+{
+
+    final Cassandra.Client client;
+    public SimpleThriftClient(Cassandra.Client client)
+    {
+        this.client = client;
+    }
+
+    @Override
+    public void batch_mutate(Map<ByteBuffer, Map<String, List<Mutation>>> record, ConsistencyLevel consistencyLevel) throws TException
+    {
+        client.batch_mutate(record, consistencyLevel);
+    }
+
+    @Override
+    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        return client.get_slice(key, column_parent, predicate, consistency_level);
+    }
+
+    @Override
+    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        return client.get_indexed_slices(column_parent, index_clause, column_predicate, consistency_level);
+    }
+
+    @Override
+    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        return client.get_range_slices(column_parent, predicate, range, consistency_level);
+    }
+
+    @Override
+    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        return client.multiget_slice(keys, column_parent, predicate, consistency_level);
+    }
+
+    @Override
+    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        client.insert(key, column_parent, column, consistency_level);
+    }
+
+    @Override
+    public Integer prepare_cql3_query(String query, Compression compression) throws InvalidRequestException, TException
+    {
+        return client.prepare_cql3_query(ByteBufferUtil.bytes(query), compression).itemId;
+    }
+
+    @Override
+    public CqlResult execute_prepared_cql_query(int itemId, ByteBuffer key, List<ByteBuffer> values) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        return client.execute_prepared_cql_query(itemId, values);
+    }
+
+    @Override
+    public Integer prepare_cql_query(String query, Compression compression) throws InvalidRequestException, TException
+    {
+        return client.prepare_cql_query(ByteBufferUtil.bytes(query), compression).itemId;
+    }
+
+    @Override
+    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        return client.execute_cql3_query(ByteBufferUtil.bytes(query), compression, consistency);
+    }
+
+    @Override
+    public CqlResult execute_prepared_cql3_query(int itemId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        return client.execute_prepared_cql3_query(itemId, values, consistency);
+    }
+
+    @Override
+    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        return client.execute_cql_query(ByteBufferUtil.bytes(query), compression);
+    }
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java b/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java
new file mode 100644
index 0000000..6367936
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java
@@ -0,0 +1,281 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.datastax.driver.core.Host;
+import com.datastax.driver.core.Metadata;
+import com.google.common.collect.Iterators;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.thrift.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.thrift.TException;
+
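+// a ThriftClient that routes each operation to a replica of its partition key, as reported by the java
+// driver's cluster Metadata; connections are pooled per host, and statements are prepared lazily, once
+// per connection, the first time they are used on it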
+public class SmartThriftClient implements ThriftClient
+{
+
+    final String keyspace;
+    final Metadata metadata;
+    final StressSettings settings;
+    final ConcurrentHashMap<InetAddress, ConcurrentLinkedQueue<Client>> cache = new ConcurrentHashMap<>();
+
+    final AtomicInteger queryIdCounter = new AtomicInteger();
+    final ConcurrentHashMap<Integer, String> queryStrings = new ConcurrentHashMap<>();
+    final ConcurrentHashMap<String, Integer> queryIds = new ConcurrentHashMap<>();
+    final Set<InetAddress> whiteset;
+    final List<InetAddress> whitelist;
+
+    public SmartThriftClient(StressSettings settings, String keyspace, Metadata metadata)
+    {
+        this.metadata = metadata;
+        this.keyspace = keyspace;
+        this.settings = settings;
+        if (!settings.node.isWhiteList)
+        {
+            whiteset = null;
+            whitelist = null;
+        }
+        else
+        {
+            whiteset = settings.node.resolveAllSpecified();
+            whitelist = Arrays.asList(whiteset.toArray(new InetAddress[0]));
+        }
+    }
+
+    private final AtomicInteger roundrobin = new AtomicInteger();
+
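+    // returns a stable client-side id for a query string, allocating one on first use; Client.get()
+    // later translates it into a per-connection server-side prepared statement id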
+    private Integer getId(String query)
+    {
+        Integer r;
+        if ((r = queryIds.get(query)) != null)
+            return r;
+        r = queryIdCounter.incrementAndGet();
+        if (queryIds.putIfAbsent(query, r) == null)
+        {
+            queryStrings.put(r, query);
+            return r;
+        }
+        return queryIds.get(query);
+    }
+
+    final class Client
+    {
+        final Cassandra.Client client;
+        final InetAddress server;
+        final Map<Integer, Integer> queryMap = new HashMap<>();
+
+        Client(Cassandra.Client client, InetAddress server)
+        {
+            this.client = client;
+            this.server = server;
+        }
+
+        Integer get(Integer id, boolean cql3) throws TException
+        {
+            Integer serverId = queryMap.get(id);
+            if (serverId != null)
+                return serverId;
+            prepare(id, cql3);
+            return queryMap.get(id);
+        }
+
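+        // looks up the server-side prepared statement id for our client-side query id, preparing the
+        // statement on this connection the first time it is seen; spins briefly in case another thread
+        // has allocated the id in getId() but not yet published the query string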
+        void prepare(Integer id, boolean cql3) throws TException
+        {
+            String query;
+            while ( null == (query = queryStrings.get(id)) ) ;
+            if (cql3)
+            {
+                Integer serverId = client.prepare_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE).itemId;
+                queryMap.put(id, serverId);
+            }
+            else
+            {
+                Integer serverId = client.prepare_cql_query(ByteBufferUtil.bytes(query), Compression.NONE).itemId;
+                queryMap.put(id, serverId);
+            }
+        }
+    }
+
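+    // picks a pooled connection for the given partition key: round-robins over the replicas reported by
+    // the driver Metadata (restricted to the whitelist, if one is configured), falling back to a random
+    // whitelisted node when no replica qualifies; callers return the connection to the per-host pool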
+    private Client get(ByteBuffer pk)
+    {
+        Set<Host> hosts = metadata.getReplicas(metadata.quote(keyspace), pk);
+        InetAddress address = null;
+        if (hosts.size() > 0)
+        {
+            int pos = roundrobin.incrementAndGet() % hosts.size();
+            for (int i = 0 ; address == null && i < hosts.size() ; i++)
+            {
+                if (pos < 0)
+                    pos = -pos;
+                Host host = Iterators.get(hosts.iterator(), (pos + i) % hosts.size());
+                if (whiteset == null || whiteset.contains(host.getAddress()))
+                    address = host.getAddress();
+            }
+        }
+        if (address == null)
+            address = whitelist.get(ThreadLocalRandom.current().nextInt(whitelist.size()));
+        ConcurrentLinkedQueue<Client> q = cache.get(address);
+        if (q == null)
+        {
+            ConcurrentLinkedQueue<Client> newQ = new ConcurrentLinkedQueue<Client>();
+            q = cache.putIfAbsent(address, newQ);
+            if (q == null)
+                q = newQ;
+        }
+        Client tclient = q.poll();
+        if (tclient != null)
+            return tclient;
+        return new Client(settings.getRawThriftClient(address.getHostAddress()), address);
+    }
+
+    @Override
+    public void batch_mutate(Map<ByteBuffer, Map<String, List<Mutation>>> record, ConsistencyLevel consistencyLevel) throws TException
+    {
+        for (Map.Entry<ByteBuffer, Map<String, List<Mutation>>> e : record.entrySet())
+        {
+            Client client = get(e.getKey());
+            try
+            {
+                client.client.batch_mutate(Collections.singletonMap(e.getKey(), e.getValue()), consistencyLevel);
+            } finally
+            {
+                cache.get(client.server).add(client);
+            }
+        }
+    }
+
+    @Override
+    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent parent, SlicePredicate predicate, ConsistencyLevel consistencyLevel) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            return client.client.get_slice(key, parent, predicate, consistencyLevel);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            client.client.insert(key, column_parent, column, consistency_level);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            return client.client.execute_cql_query(ByteBufferUtil.bytes(query), compression);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            return client.client.execute_cql3_query(ByteBufferUtil.bytes(query), compression, consistency);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public Integer prepare_cql3_query(String query, Compression compression) throws InvalidRequestException, TException
+    {
+        return getId(query);
+    }
+
+    @Override
+    public CqlResult execute_prepared_cql3_query(int queryId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            return client.client.execute_prepared_cql3_query(client.get(queryId, true), values, consistency);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public Integer prepare_cql_query(String query, Compression compression) throws InvalidRequestException, TException
+    {
+        return getId(query);
+    }
+
+    @Override
+    public CqlResult execute_prepared_cql_query(int queryId, ByteBuffer key, List<ByteBuffer> values) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    {
+        Client client = get(key);
+        try
+        {
+            return client.client.execute_prepared_cql_query(client.get(queryId, true), values);
+        } finally
+        {
+            cache.get(client.server).add(client);
+        }
+    }
+
+    @Override
+    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/ThriftClient.java b/tools/stress/src/org/apache/cassandra/stress/util/ThriftClient.java
new file mode 100644
index 0000000..3b13758
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/ThriftClient.java
@@ -0,0 +1,57 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.thrift.*;
+import org.apache.thrift.TException;
+
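+// the subset of the thrift Cassandra.Client API used by stress, allowing the simple (single connection)
+// and smart (replica-aware) implementations to be used interchangeably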
+public interface ThriftClient
+{
+
+    public void batch_mutate(Map<ByteBuffer, Map<String, List<Mutation>>> record, ConsistencyLevel consistencyLevel) throws TException;
+
+    List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent parent, SlicePredicate predicate, ConsistencyLevel consistencyLevel) throws InvalidRequestException, UnavailableException, TimedOutException, TException;
+
+    void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException;
+
+    Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException;
+
+    List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException;
+
+    List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException;
+
+    Integer prepare_cql3_query(String query, Compression compression) throws InvalidRequestException, TException;
+
+    CqlResult execute_prepared_cql3_query(int itemId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException;
+
+    CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException;
+
+    CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException;
+
+    Integer prepare_cql_query(String query, Compression compression) throws InvalidRequestException, TException;
+
+    CqlResult execute_prepared_cql_query(int itemId, ByteBuffer key, List<ByteBuffer> values) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException;
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/Timer.java b/tools/stress/src/org/apache/cassandra/stress/util/Timer.java
new file mode 100644
index 0000000..4e2b0a3
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/Timer.java
@@ -0,0 +1,156 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.CountDownLatch;
+
+// a timer - this timer must be used by a single thread, and co-ordinates with other timers by answering
+// report requests from the summary/logging thread (see requestReport() / maybeReport() and Timing)
+public final class Timer
+{
+
+    private static final int SAMPLE_SIZE_SHIFT = 14;
+    private static final int SAMPLE_SIZE_MASK = (1 << SAMPLE_SIZE_SHIFT) - 1;
+
+    private final Random rnd = new Random();
+
+    // in progress snap start
+    private long sampleStartNanos;
+
+    // each entry is present with probability 1/p(opCount) or 1/(p(opCount)-1)
+    private final long[] sample = new long[1 << SAMPLE_SIZE_SHIFT];
+    private int opCount;
+
+    // aggregate info
+    private long partitionCount;
+    private long rowCount;
+    private long total;
+    private long max;
+    private long maxStart;
+    private long upToDateAsOf;
+    private long lastSnap = System.nanoTime();
+
+    // communication with summary/logging thread
+    private volatile CountDownLatch reportRequest;
+    volatile TimingInterval report;
+    private volatile TimingInterval finalReport;
+
+    public void start()
+    {
+        // record when the operation we are timing began
+        sampleStartNanos = System.nanoTime();
+    }
+
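+    // selection divisor for the reservoir: p(i) = 1 + (i >>> SAMPLE_SIZE_SHIFT), so the first 16384 events
+    // are always recorded, the next 16384 overwrite their slot with probability 1/2, and so on, keeping an
+    // (approximately) uniform sample of all events seen in the interval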
+    private static int p(int index)
+    {
+        return 1 + (index >>> SAMPLE_SIZE_SHIFT);
+    }
+
+    public boolean running()
+    {
+        return finalReport == null;
+    }
+
+    public void stop(long partitionCount, long rowCount)
+    {
+        maybeReport();
+        long now = System.nanoTime();
+        long time = now - sampleStartNanos;
+        if (rnd.nextInt(p(opCount)) == 0)
+            sample[index(opCount)] = time;
+        if (time > max)
+        {
+            maxStart = sampleStartNanos;
+            max = time;
+        }
+        total += time;
+        opCount += 1;
+        this.partitionCount += partitionCount;
+        this.rowCount += rowCount;
+        upToDateAsOf = now;
+    }
+
+    private static int index(int count)
+    {
+        return count & SAMPLE_SIZE_MASK;
+    }
+
+    private TimingInterval buildReport()
+    {
+        final List<SampleOfLongs> sampleLatencies = Arrays.asList
+                (       new SampleOfLongs(Arrays.copyOf(sample, index(opCount)), p(opCount)),
+                        new SampleOfLongs(Arrays.copyOfRange(sample, index(opCount), Math.min(opCount, sample.length)), p(opCount) - 1)
+                );
+        final TimingInterval report = new TimingInterval(lastSnap, upToDateAsOf, max, maxStart, max, partitionCount, rowCount, total, opCount,
+                SampleOfLongs.merge(rnd, sampleLatencies, Integer.MAX_VALUE));
+        // reset counters
+        opCount = 0;
+        partitionCount = 0;
+        rowCount = 0;
+        total = 0;
+        max = 0;
+        lastSnap = upToDateAsOf;
+        return report;
+    }
+
+    // checks to see if a report has been requested, and if so produces the report, signals and clears the request
+    private void maybeReport()
+    {
+        if (reportRequest != null)
+        {
+            synchronized (this)
+            {
+                report = buildReport();
+                reportRequest.countDown();
+                reportRequest = null;
+            }
+        }
+    }
+
+    // checks to see if the timer is dead; if not, requests a report, otherwise fulfills the request itself
+    synchronized void requestReport(CountDownLatch signal)
+    {
+        if (finalReport != null)
+        {
+            report = finalReport;
+            finalReport = new TimingInterval(0);
+            signal.countDown();
+        }
+        else
+            reportRequest = signal;
+    }
+
+    // closes the timer; if a request is outstanding, it furnishes the request, otherwise it populates finalReport
+    public synchronized void close()
+    {
+        if (reportRequest == null)
+            finalReport = buildReport();
+        else
+        {
+            finalReport = new TimingInterval(0);
+            report = buildReport();
+            reportRequest.countDown();
+            reportRequest = null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/Timing.java b/tools/stress/src/org/apache/cassandra/stress/util/Timing.java
new file mode 100644
index 0000000..9a5fa56
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/Timing.java
@@ -0,0 +1,128 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+// relatively simple timing class for getting a uniform sample of latencies, and saving other metrics.
+// Ensures accuracy of timing by having single-threaded timers that are check-pointed by the snapping thread,
+// which waits for them to report back; they report the data up to the last event prior to the check-point.
+// If the timer threads are blocked or paused, more time than the checkpoint interval may elapse, but all
+// metrics calculated over the interval remain accurate.
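+//
+// a rough usage sketch (using only the methods below): the coordinating thread calls start() and hands each
+// worker a Timer from newTimer(); workers bracket every operation with timer.start()/timer.stop(...); the
+// coordinating thread periodically calls snap(...) to obtain a merged TimingInterval, and done() reports
+// whether all timers have been closed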
+public class Timing
+{
+
+    private final CopyOnWriteArrayList<Timer> timers = new CopyOnWriteArrayList<>();
+    private volatile TimingInterval history;
+    private final Random rnd = new Random();
+    private boolean done;
+
+    // TIMING
+
+    public static class TimingResult<E>
+    {
+        public final E extra;
+        public final TimingInterval timing;
+        public TimingResult(E extra, TimingInterval timing)
+        {
+            this.extra = extra;
+            this.timing = timing;
+        }
+    }
+
+    private <E> TimingResult<E> snap(Random rnd, Callable<E> call) throws InterruptedException
+    {
+        final Timer[] timers = this.timers.toArray(new Timer[0]);
+        final CountDownLatch ready = new CountDownLatch(timers.length);
+        for (int i = 0 ; i < timers.length ; i++)
+        {
+            final Timer timer = timers[i];
+            timer.requestReport(ready);
+        }
+
+        E extra;
+        try
+        {
+            extra = call.call();
+        }
+        catch (Exception e)
+        {
+            if (e instanceof InterruptedException)
+                throw (InterruptedException) e;
+            throw new RuntimeException(e);
+        }
+
+        // TODO fail gracefully after timeout if a thread is stuck
+        if (!ready.await(5L, TimeUnit.MINUTES))
+            throw new RuntimeException("Timed out waiting for a timer thread - seems one got stuck");
+
+        boolean done = true;
+        // reports have been filled in by the timer threads, so merge them
+        List<TimingInterval> intervals = new ArrayList<>();
+        for (Timer timer : timers)
+        {
+            intervals.add(timer.report);
+            done &= !timer.running();
+        }
+
+        this.done = done;
+        return new TimingResult<>(extra, TimingInterval.merge(rnd, intervals, Integer.MAX_VALUE, history.endNanos()));
+    }
+
+    // build a new timer and add it to the set of running timers
+    public Timer newTimer()
+    {
+        final Timer timer = new Timer();
+        timers.add(timer);
+        return timer;
+    }
+
+    public void start()
+    {
+        history = new TimingInterval(System.nanoTime());
+    }
+
+    public boolean done()
+    {
+        return done;
+    }
+
+    public <E> TimingResult<E> snap(Callable<E> call) throws InterruptedException
+    {
+        final TimingResult<E> result = snap(rnd, call);
+        history = TimingInterval.merge(rnd, Arrays.asList(result.timing, history), 200000, history.startNanos());
+        return result;
+    }
+
+    public TimingInterval getHistory()
+    {
+        return history;
+    }
+
+}
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/TimingInterval.java b/tools/stress/src/org/apache/cassandra/stress/util/TimingInterval.java
new file mode 100644
index 0000000..50ab608
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/TimingInterval.java
@@ -0,0 +1,161 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+// represents measurements taken over an interval of time
+// used for both single timer results and merged timer results
+public final class TimingInterval
+{
+    // nanos
+    private final long start;
+    private final long end;
+    public final long maxLatency;
+    public final long pauseLength;
+    public final long pauseStart;
+    public final long totalLatency;
+
+    // discrete
+    public final long partitionCount;
+    public final long rowCount;
+    public final long operationCount;
+
+    final SampleOfLongs sample;
+
+    TimingInterval(long time)
+    {
+        start = end = time;
+        maxLatency = totalLatency = 0;
+        partitionCount = rowCount = operationCount = 0;
+        pauseStart = pauseLength = 0;
+        sample = new SampleOfLongs(new long[0], 1d);
+    }
+    TimingInterval(long start, long end, long maxLatency, long pauseStart, long pauseLength, long partitionCount, long rowCount, long totalLatency, long operationCount, SampleOfLongs sample)
+    {
+        this.start = start;
+        this.end = Math.max(end, start);
+        this.maxLatency = maxLatency;
+        this.partitionCount = partitionCount;
+        this.rowCount = rowCount;
+        this.totalLatency = totalLatency;
+        this.operationCount = operationCount;
+        this.pauseStart = pauseStart;
+        this.pauseLength = pauseLength;
+        this.sample = sample;
+    }
+
+    // merge multiple timer intervals together
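+    // the merged pause is the intersection of the pauses reported by the individual intervals: it begins at
+    // the latest pauseStart and ends at the earliest pause end, collapsing to zero if they do not all overlap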
+    static TimingInterval merge(Random rnd, List<TimingInterval> intervals, int maxSamples, long start)
+    {
+        long operationCount = 0, partitionCount = 0, rowCount = 0;
+        long maxLatency = 0, totalLatency = 0;
+        List<SampleOfLongs> latencies = new ArrayList<>();
+        long end = 0;
+        long pauseStart = 0, pauseEnd = Long.MAX_VALUE;
+        for (TimingInterval interval : intervals)
+        {
+            end = Math.max(end, interval.end);
+            operationCount += interval.operationCount;
+            maxLatency = Math.max(interval.maxLatency, maxLatency);
+            totalLatency += interval.totalLatency;
+            partitionCount += interval.partitionCount;
+            rowCount += interval.rowCount;
+            latencies.addAll(Arrays.asList(interval.sample));
+            if (interval.pauseLength > 0)
+            {
+                pauseStart = Math.max(pauseStart, interval.pauseStart);
+                pauseEnd = Math.min(pauseEnd, interval.pauseStart + interval.pauseLength);
+            }
+        }
+        if (pauseEnd < pauseStart)
+            pauseEnd = pauseStart = 0;
+        return new TimingInterval(start, end, maxLatency, pauseStart, pauseEnd - pauseStart, partitionCount, rowCount, totalLatency, operationCount,
+                SampleOfLongs.merge(rnd, latencies, maxSamples));
+
+    }
+
+    public double opRate()
+    {
+        return operationCount / ((end - start) * 0.000000001d);
+    }
+
+    public double adjustedRowRate()
+    {
+        return rowCount / ((end - (start + pauseLength)) * 0.000000001d);
+    }
+
+    public double partitionRate()
+    {
+        return partitionCount / ((end - start) * 0.000000001d);
+    }
+
+    public double rowRate()
+    {
+        return rowCount / ((end - start) * 0.000000001d);
+    }
+
+    public double meanLatency()
+    {
+        return (totalLatency / (double) operationCount) * 0.000001d;
+    }
+
+    public double maxLatency()
+    {
+        return maxLatency * 0.000001d;
+    }
+
+    public long runTime()
+    {
+        return (end - start) / 1000000;
+    }
+
+    public double medianLatency()
+    {
+        return sample.medianLatency();
+    }
+
+    // 0 < rank < 1
+    public double rankLatency(float rank)
+    {
+        return sample.rankLatency(rank);
+    }
+
+    public final long endNanos()
+    {
+        return end;
+    }
+
+    public final long endMillis()
+    {
+        return end / 1000000;
+    }
+
+    public long startNanos()
+    {
+        return start;
+    }
+}
+
diff --git a/tools/stress/src/org/apache/cassandra/stress/util/Uncertainty.java b/tools/stress/src/org/apache/cassandra/stress/util/Uncertainty.java
new file mode 100644
index 0000000..dfc1b85
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/util/Uncertainty.java
@@ -0,0 +1,102 @@
+package org.apache.cassandra.stress.util;
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.CountDownLatch;
+
+// TODO: do not assume normal distribution of measurements.
+public class Uncertainty
+{
+
+    private int measurements;
+    private double sumsquares;
+    private double sum;
+    private double stdev;
+    private double mean;
+    private double uncertainty;
+
+    private CopyOnWriteArrayList<WaitForTargetUncertainty> waiting = new CopyOnWriteArrayList<>();
+
+    private static final class WaitForTargetUncertainty
+    {
+        final double targetUncertainty;
+        final int minMeasurements;
+        final int maxMeasurements;
+        final CountDownLatch latch = new CountDownLatch(1);
+
+        private WaitForTargetUncertainty(double targetUncertainty, int minMeasurements, int maxMeasurements)
+        {
+            this.targetUncertainty = targetUncertainty;
+            this.minMeasurements = minMeasurements;
+            this.maxMeasurements = maxMeasurements;
+        }
+
+        void await() throws InterruptedException
+        {
+            latch.await();
+        }
+
+    }
+
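+    // folds in a new measurement, maintaining a running mean and (population) standard deviation; the
+    // uncertainty reported is the relative standard error of the mean, stdev / (mean * sqrt(n)), and any
+    // waiter whose target uncertainty has been reached (or whose measurement budget is exhausted) is released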
+    public void update(double value)
+    {
+        measurements++;
+        sumsquares += value * value;
+        sum += value;
+        mean = sum / measurements;
+        stdev = Math.sqrt((sumsquares / measurements) - (mean * mean));
+        uncertainty = (stdev / Math.sqrt(measurements)) / mean;
+
+        for (WaitForTargetUncertainty waiter : waiting)
+        {
+            if ((uncertainty < waiter.targetUncertainty && measurements >= waiter.minMeasurements) || (measurements >= waiter.maxMeasurements))
+            {
+                waiter.latch.countDown();
+                // can safely remove as working over snapshot with COWArrayList
+                waiting.remove(waiter);
+            }
+        }
+    }
+
+    public void await(double targetUncertainty, int minMeasurements, int maxMeasurements) throws InterruptedException
+    {
+        final WaitForTargetUncertainty wait = new WaitForTargetUncertainty(targetUncertainty, minMeasurements, maxMeasurements);
+        waiting.add(wait);
+        wait.await();
+    }
+
+    public double getUncertainty()
+    {
+        return uncertainty;
+    }
+
+    public void wakeAll()
+    {
+        for (WaitForTargetUncertainty waiting : this.waiting)
+        {
+            waiting.latch.countDown();
+            this.waiting.remove(waiting);
+        }
+    }
+
+}